ckmeans 1.1.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 508f78311a643e1fa8e693e4abf1cdf6df4eb06ff09756fa534ff4a514d0f34f
- data.tar.gz: 4ef313387c2e45df4a8afde58e429093023a555a32f4af395a8b79c048a9d98d
+ metadata.gz: 1d63d8f65d386bf27082e0a65b1ea82a7d150394b1424ab5c2c274e139f91482
+ data.tar.gz: 1f3c4e91fcc9f3bda3d83521cac164ff83e3e5095705cd15420c6278635fc266
  SHA512:
- metadata.gz: ae0f1aff4bd6a78da04123d3728234012d0692ec22396b9529b245c8fa473343314508f053ee02ac876131b243704316948f476840d3c495d4e72eba68e095fd
- data.tar.gz: ab95cfdacac4d9204887d4d5c5a7b85aafa3c869ec4b7a851ae994d8f15ddf096e99cb2d1691e6120a7a6bbe51cbde0524c93ad09b5811b7561c43c60a49256c
+ metadata.gz: 0101cd5f6d5ba925d8f37cc73416008ace4ffce7ea33a437e0189549ede4cbc23b7284de2fe28af181ddf08396b74225c67626e94ce015d54ac14fde17b53bda
+ data.tar.gz: abbcc012e9378ea1fbf15566fd47691bd4cecaaeaf95947c45414dfb7b304db87d803120749aab3ccbf806ab90dd554cce2461f340c348e4f1b820f47be421a2
data/README.md CHANGED
@@ -18,17 +18,31 @@ gem install ckmeans
  
  ## Usage
  
+ ### Fixed Cluster Count
+
  ```rb
  # Fixed cluster count
  Ckmeans::Clusterer(data, kmin).clusters
  Ckmedian::Clusterer(data, kmin).clusters
+ ```
  
- # Estimate optimal cluster count within kmin and kmax
+ ### Estimate optimal cluster count within kmin and kmax
+
+ ```rb
  Ckmeans::Clusterer(data, kmin, kmax).clusters
  Ckmedian::Clusterer(data, kmin, kmax).clusters
+ ```
+
+ ### Fast & Stable Estimation of K
  
- # Adjust Bayesian Information Criteria favoring more smaller clusters (Ckmeans only)
- Ckmeans::Clusterer(data, kmin, kmax, :sensitive).clusters
+ For large collections without many duplicates, use the regular estimation.
+ For relatively small sets, or sets with many duplicates, use Gaussian Mixture Model (GMM)-based estimation.
+ It is slower but more resilient to data patterns such as large numbers of duplicates or clusters with very
+ different numbers of elements.
+
+ ```rb
+ Ckmeans::Clusterer(data, kmin, kmax, :gmm).clusters
+ Ckmedian::Clusterer(data, kmin, kmax, :gmm).clusters
  ```
  
  ## License
@@ -39,7 +39,6 @@ typedef struct State {
  uint32_t xcount;
  uint32_t kmin;
  uint32_t kmax;
- bool apply_deviation;
  Arena *arena;
  VectorF *xsorted;
  MatrixF *cost;
@@ -99,8 +98,8 @@ VectorI *prune_candidates(State, RowParams, VectorI*);
  void fill_even_positions(State, RowParams, VectorI*);
  SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
  VectorI *backtrack_sizes(State, VectorI*, uint32_t);
- uint32_t find_koptimal(State);
-
+ uint32_t find_koptimal_fast(State);
+ uint32_t find_koptimal_gmm(State);
  
  void Init_extensions(void) {
  VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
@@ -128,13 +127,13 @@ VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
  
  VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
  {
- uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
- uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
- uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
- bool apply_deviation = RTEST(rb_iv_get(self, "@apply_bic_deviation"));
- VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
- size_t capacity = sizeof(LDouble) * (xcount + 1) * (kmax + 1) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
- Arena *arena = arena_create(capacity);
+ uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
+ uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
+ uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
+ bool use_gmm = RTEST(rb_iv_get(self, "@use_gmm"));
+ VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
+ size_t capacity = sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
+ Arena *arena = arena_create(capacity);
  
  if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
  
@@ -150,17 +149,16 @@ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
  }
  
  State state = {
- .arena = arena,
- .xcount = xcount,
- .kmin = kmin,
- .kmax = kmax,
- .apply_deviation = apply_deviation,
- .xsorted = xsorted,
- .cost = cost,
- .splits = splits,
- .xsum = xsum,
- .xsumsq = xsumsq,
- .dissim = criteria
+ .arena = arena,
+ .xcount = xcount,
+ .kmin = kmin,
+ .kmax = kmax,
+ .xsorted = xsorted,
+ .cost = cost,
+ .splits = splits,
+ .xsum = xsum,
+ .xsumsq = xsumsq,
+ .dissim = criteria
  };
  
  
@@ -187,7 +185,7 @@ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
  fill_row(state, q, imin, xcount - 1);
  }
  
- uint32_t koptimal = find_koptimal(state);
+ uint32_t koptimal = use_gmm ? find_koptimal_gmm(state) : find_koptimal_fast(state);
  
  VectorI *sizes = vector_create_i(arena, koptimal);
  backtrack_sizes(state, sizes, koptimal);
@@ -209,7 +207,7 @@ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
  return response;
  }
  
- uint32_t find_koptimal(State state)
+ uint32_t find_koptimal_fast(State state)
  {
  uint32_t kmin = state.kmin;
  uint32_t kmax = state.kmax;
@@ -256,8 +254,7 @@ uint32_t find_koptimal(State state)
  loglikelihood += -(xi - mean) * (xi - mean) / (2.0 * variance);
  }
  loglikelihood += npoints * (
- (state.apply_deviation ? 0.0 : log(npoints / (LDouble) xcount)) -
- (0.5 * log(PIx2 * variance))
+ log(npoints / (LDouble) xcount) - (0.5 * log(PIx2 * variance))
  );
  } else {
  loglikelihood += npoints * log(1.0 / bin_width / xcount);
@@ -280,6 +277,101 @@ uint32_t find_koptimal(State state)
  return kopt;
  }
  
+ uint32_t find_koptimal_gmm(State state)
+ {
+ uint32_t kmin = state.kmin;
+ uint32_t kmax = state.kmax;
+ uint32_t xcount = state.xcount;
+
+ if (kmin > kmax || xcount < 2) {
+ return (kmin < kmax) ? kmin : kmax;
+ }
+
+ Arena *arena = state.arena;
+ VectorF *xsorted = state.xsorted;
+ uint32_t kopt = kmin;
+ LDouble max_bic = 0.0;
+ LDouble log_xcount = log((LDouble) xcount);
+ VectorF *lambda = vector_create_f(arena, kmax);
+ VectorF *mu = vector_create_f(arena, kmax);
+ VectorF *sigma2 = vector_create_f(arena, kmax);
+ VectorF *coeff = vector_create_f(arena, kmax);
+ VectorI *sizes = vector_create_i(arena, kmax);
+
+ for (uint32_t kouter = kmin; kouter <= kmax; ++kouter)
+ {
+ uint32_t ileft = 0;
+ uint32_t iright;
+
+ backtrack_sizes(state, sizes, kouter);
+
+ for (uint32_t k = 0; k < kouter; ++k)
+ {
+ uint32_t size = vector_get_i(sizes, k);
+ vector_set_f(lambda, k, size / (LDouble) xcount);
+ iright = ileft + size - 1;
+ SegmentStats stats = shifted_data_variance(xsorted, ileft, iright);
+
+ vector_set_f(mu, k, stats.mean);
+ vector_set_f(sigma2, k, stats.variance);
+
+ if (stats.variance == 0 || size == 1) {
+ LDouble dmin;
+
+ if (ileft > 0 && iright < xcount - 1) {
+ LDouble left_diff = vector_get_diff_f(xsorted, ileft, ileft - 1);
+ LDouble right_diff = vector_get_diff_f(xsorted, iright + 1, iright);
+
+ dmin = (left_diff < right_diff) ? left_diff : right_diff;
+ } else if (ileft > 0) {
+ dmin = vector_get_diff_f(xsorted, ileft, ileft - 1);
+ } else {
+ dmin = vector_get_diff_f(xsorted, iright + 1, iright);
+ }
+
+ if (stats.variance == 0) vector_set_f(sigma2, k, dmin * dmin / 4.0 / 9.0);
+ if (size == 1) vector_set_f(sigma2, k, dmin * dmin);
+ }
+
+ LDouble lambda_k = vector_get_f(lambda, k);
+ LDouble sigma2_k = vector_get_f(sigma2, k);
+ vector_set_f(coeff, k, lambda_k / sqrt(PIx2 * sigma2_k));
+ ileft = iright + 1;
+ }
+
+ LDouble loglikelihood = 0.0;
+
+ for (uint32_t i = 0; i < xcount; ++i)
+ {
+ LDouble L = 0.0;
+ LDouble xi = vector_get_f(xsorted, i);
+
+ for (uint32_t k = 0; k < kouter; ++k)
+ {
+ LDouble coeff_k = vector_get_f(coeff, k);
+ LDouble mu_k = vector_get_f(mu, k);
+ LDouble sigma2_k = vector_get_f(sigma2, k);
+ LDouble x_mu_diff = xi - mu_k;
+ L += coeff_k * exp(- x_mu_diff * x_mu_diff / (2.0 * sigma2_k));
+ }
+ loglikelihood += log(L);
+ }
+
+ LDouble bic = 2 * loglikelihood - (3 * kouter - 1) * log_xcount;
+
+ if (kouter == kmin) {
+ max_bic = bic;
+ kopt = kmin;
+ } else {
+ if (bic > max_bic) {
+ max_bic = bic;
+ kopt = kouter;
+ }
+ }
+ }
+ return kopt;
+ }
+
  
  VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
  {
  MatrixI *splits = state.splits;
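
For readability, the selection rule implemented by the added `find_koptimal_gmm` can be written out explicitly. In standard mixture notation (ours, not the package's): for each candidate k, one Gaussian is fitted per cluster returned by `backtrack_sizes`, with weight λ_j = n_j / n, mean μ_j, and variance σ_j², and the k with the largest penalized log-likelihood wins:

$$
\log L(k) = \sum_{i=1}^{n} \log \sum_{j=1}^{k} \frac{\lambda_j}{\sqrt{2\pi\sigma_j^2}}\, e^{-\frac{(x_i-\mu_j)^2}{2\sigma_j^2}},
\qquad
\mathrm{BIC}(k) = 2\log L(k) - (3k-1)\log n .
$$

Degenerate clusters (a single point, or zero variance) receive a small substitute variance derived from the gap to the neighboring points, as the branch on `stats.variance == 0 || size == 1` above shows.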
@@ -416,12 +508,12 @@ inline void fill_even_positions(State state, RowParams rparams, VectorI *split_c
  
  inline void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
  {
- const uint32_t row = rparams.row;
- const uint32_t imin = rparams.imin;
- const uint32_t imax = rparams.imax;
- const uint32_t istep = rparams.istep;
- MatrixF *const cost = state.cost;
- MatrixI *const splits = state.splits;
+ const uint32_t row = rparams.row;
+ const uint32_t imin = rparams.imin;
+ const uint32_t imax = rparams.imax;
+ const uint32_t istep = rparams.istep;
+ MatrixF *const cost = state.cost;
+ MatrixI *const splits = state.splits;
  FnDissim *const dissim = state.dissim;
  
  uint32_t optimal_split_idx_prev = 0;
@@ -723,7 +815,7 @@ Arena *arena_create(size_t capacity) {
  }
  
  void *arena_alloc(Arena *arena, size_t size) {
- size = (size + 7) & ~7;
+ size = (size + 0xf) & ~0xf;
  
  if (arena->offset + size > arena->capacity) {
  rb_raise(rb_eNoMemError, "Arena Insufficient Capacity");
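
The allocator hunk above bumps per-allocation rounding in `arena_alloc` from 8-byte to 16-byte alignment. A quick sketch of the arithmetic in plain Ruby (illustrative only; `align` is not part of the gem):

```rb
# Round a requested size up to the next multiple of 16, as the new arena_alloc does.
align = ->(size) { (size + 0xf) & ~0xf }

align.call(10) # => 16 (the old 8-byte rounding also gave 16)
align.call(17) # => 32 (the old 8-byte rounding gave 24)
```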
@@ -2,18 +2,18 @@
  
  module Ckmeans
  class Clusterer # rubocop:disable Style/Documentation
- def initialize(entries, kmin, kmax = kmin, kestimate = :regular)
+ def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
  @xcount = entries.size
  
  raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
  raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
  
- @kmin = kmin
- @unique_xcount = entries.uniq.size
- @kmax = [@unique_xcount, kmax].min
- @xsorted_original = entries.sort
- @xsorted = @xsorted_original.map(&:to_f)
- @apply_bic_deviation = kestimate == :sensitive
+ @kmin = kmin
+ @unique_xcount = entries.uniq.size
+ @kmax = [@unique_xcount, kmax].min
+ @xsorted_original = entries.sort
+ @xsorted = @xsorted_original.map(&:to_f)
+ @use_gmm = kestimate == :gmm
  end
  
  def clusters
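
In 2.0.0 the constructor's fourth argument selects the estimator: the default is `:fast`, and `:gmm` switches to the mixture-model BIC; the 1.x `:sensitive` flag (and `@apply_bic_deviation`) is gone. A minimal usage sketch, with hypothetical sample data not taken from the gem's docs:

```rb
require "ckmeans"

data = [1, 1, 2, 9, 10, 10, 50] # hypothetical sample

Ckmeans::Clusterer.new(data, 1, 4).clusters        # default :fast estimation of k in 1..4
Ckmeans::Clusterer.new(data, 1, 4, :gmm).clusters  # GMM-based estimation; :sensitive is no longer recognized
```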
@@ -1,5 +1,5 @@
  # frozen_string_literal: true
  
  module Ckmeans
- VERSION = "1.1.0"
+ VERSION = "2.0.0"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: ckmeans
  version: !ruby/object:Gem::Version
- version: 1.1.0
+ version: 2.0.0
  platform: ruby
  authors:
  - Vlad Lebedev
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2025-05-23 00:00:00.000000000 Z
+ date: 2025-06-09 00:00:00.000000000 Z
  dependencies: []
  description: Repeatable clustering of unidimensional data
  email: