ckmeans 1.0.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 17dd59ae47e814d5cf0b45665856a52e33e1af22c90722955750004405633a4e
4
- data.tar.gz: 4278bb18d8a987ac71fd7ea179055ab6d2c15292d772b7d9df1dd8c4adde011b
3
+ metadata.gz: 1d63d8f65d386bf27082e0a65b1ea82a7d150394b1424ab5c2c274e139f91482
4
+ data.tar.gz: 1f3c4e91fcc9f3bda3d83521cac164ff83e3e5095705cd15420c6278635fc266
5
5
  SHA512:
6
- metadata.gz: 7e3d19cfbfbebb0b26bf1ffdd7c99998a898ccf123359994e339147735b819f3f16fc73c2ac202a3fbe3c4f1c13c747e7181d01d56770be5404ca6354533b23d
7
- data.tar.gz: 2be82db12f8d9da2cafb03713440f3083d2ffd7fd7f6917ad8e98d1c864b1d97f99e9a0771afe6aaaff502fee86d81e9221b8d689a388817b060fc7ce1917a87
6
+ metadata.gz: 0101cd5f6d5ba925d8f37cc73416008ace4ffce7ea33a437e0189549ede4cbc23b7284de2fe28af181ddf08396b74225c67626e94ce015d54ac14fde17b53bda
7
+ data.tar.gz: abbcc012e9378ea1fbf15566fd47691bd4cecaaeaf95947c45414dfb7b304db87d803120749aab3ccbf806ab90dd554cce2461f340c348e4f1b820f47be421a2
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 3.2.2
1
+ 3.2.8
data/README.md CHANGED
@@ -18,10 +18,31 @@ gem install ckmeans
18
18
 
19
19
  ## Usage
20
20
 
21
+ ### Fixed Cluster Count
22
+
23
+ ```rb
24
+ # Fixed cluster count
25
+ Ckmeans::Clusterer(data, kmin).clusters
26
+ Ckmedian::Clusterer(data, kmin).clusters
27
+ ```
28
+
29
+ ### Estimate optimal cluster count within kmin and kmax
30
+
31
+ ```rb
32
+ Ckmeans::Clusterer(data, kmin, kmax).clusters
33
+ Ckmedian::Clusterer(data, kmin, kmax).clusters
34
+ ```
35
+
36
+ ### Fast & Stable Estimation of K
37
+
38
+ For big collections without many duplicates, use regular estimation.
39
+ For relatively small sets, or sets with many duplicates, use Gaussian Mixture Model (GMM)-based estimation.
40
+ It is slower but more resilient to varied data patterns, such as large numbers of duplicates or clusters with differing
41
+ numbers of elements.
42
+
21
43
  ```rb
22
- Ckmeans::Clusterer(data, kmin).clusters # fixed cluster count
23
- Ckmeans::Clusterer(data, kmin, kmax).clusters # estimate optimal cluster count within kmin and kmax
24
- Ckmeans::Clusterer(data, kmin, kmax, :sensitive).clusters # Adjust Bayesian Information Criteria favoring more smaller clusters
44
+ Ckmeans::Clusterer(data, kmin, kmax, :gmm).clusters
45
+ Ckmedian::Clusterer(data, kmin, kmax, :gmm).clusters
25
46
  ```
26
47
 
27
48
  ## License
@@ -33,17 +33,19 @@ typedef struct VectorI {
33
33
  uint32_t *values;
34
34
  } VectorI;
35
35
 
36
+ typedef LDouble (FnDissim)(uint32_t, uint32_t, VectorF*, VectorF*);
37
+
36
38
  typedef struct State {
37
39
  uint32_t xcount;
38
40
  uint32_t kmin;
39
41
  uint32_t kmax;
40
- bool apply_deviation;
41
42
  Arena *arena;
42
43
  VectorF *xsorted;
43
44
  MatrixF *cost;
44
45
  MatrixI *splits;
45
46
  VectorF *xsum;
46
47
  VectorF *xsumsq;
48
+ FnDissim *dissim;
47
49
  } State;
48
50
 
49
51
  typedef struct RowParams {
@@ -59,6 +61,8 @@ typedef struct {
59
61
  } SegmentStats;
60
62
 
61
63
  VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
64
+ VALUE rb_ckmedian_sorted_group_sizes(VALUE self);
65
+ VALUE rb_sorted_group_sizes(VALUE self, FnDissim*);
62
66
 
63
67
  Arena *arena_create(size_t);
64
68
  void *arena_alloc(Arena*, size_t);
@@ -85,7 +89,8 @@ uint32_t vector_get_i(VectorI*, uint32_t offset);
85
89
  void vector_downsize_i(VectorI*, uint32_t);
86
90
  void vector_inspect_i(VectorI*);
87
91
 
88
- LDouble dissimilarity(uint32_t, uint32_t, VectorF*, VectorF*);
92
+ LDouble dissimilarity_l2(uint32_t, uint32_t, VectorF*, VectorF*);
93
+ LDouble dissimilarity_l1(uint32_t, uint32_t, VectorF*, VectorF*);
89
94
  void fill_row(State, uint32_t, uint32_t, uint32_t);
90
95
  void smawk(State, RowParams, VectorI*);
91
96
  void find_min_from_candidates(State, RowParams, VectorI*);
@@ -93,13 +98,17 @@ VectorI *prune_candidates(State, RowParams, VectorI*);
93
98
  void fill_even_positions(State, RowParams, VectorI*);
94
99
  SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
95
100
  VectorI *backtrack_sizes(State, VectorI*, uint32_t);
96
- uint32_t find_koptimal(State);
101
+ uint32_t find_koptimal_fast(State);
102
+ uint32_t find_koptimal_gmm(State);
97
103
 
98
104
  void Init_extensions(void) {
99
- VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
100
- VALUE clusterer_class = rb_const_get(ckmeans_module, rb_intern("Clusterer"));
105
+ VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
106
+ VALUE ckmedian_module = rb_const_get(rb_cObject, rb_intern("Ckmedian"));
107
+ VALUE ckmeans_clusterer = rb_const_get(ckmeans_module, rb_intern("Clusterer"));
108
+ VALUE ckmedian_clusterer = rb_const_get(ckmedian_module, rb_intern("Clusterer"));
101
109
 
102
- rb_define_private_method(clusterer_class, "sorted_group_sizes", rb_ckmeans_sorted_group_sizes, 0);
110
+ rb_define_private_method(ckmeans_clusterer, "sorted_group_sizes", rb_ckmeans_sorted_group_sizes, 0);
111
+ rb_define_private_method(ckmedian_clusterer, "sorted_group_sizes", rb_ckmedian_sorted_group_sizes, 0);
103
112
  }
104
113
 
105
114
  # define ARENA_MIN_CAPACITY 100
@@ -108,13 +117,23 @@ void Init_extensions(void) {
108
117
 
109
118
  VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
110
119
  {
111
- uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
112
- uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
113
- uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
114
- bool apply_deviation = RTEST(rb_iv_get(self, "@apply_bic_deviation"));
115
- VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
116
- size_t capacity = sizeof(LDouble) * (xcount + 1) * (kmax + 1) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
117
- Arena *arena = arena_create(capacity);
120
+ return rb_sorted_group_sizes(self, dissimilarity_l2);
121
+ }
122
+
123
+ VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
124
+ {
125
+ return rb_sorted_group_sizes(self, dissimilarity_l1);
126
+ }
127
+
128
+ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
129
+ {
130
+ uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
131
+ uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
132
+ uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
133
+ bool use_gmm = RTEST(rb_iv_get(self, "@use_gmm"));
134
+ VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
135
+ size_t capacity = sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
136
+ Arena *arena = arena_create(capacity);
118
137
 
119
138
  if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
120
139
 
@@ -130,16 +149,16 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
130
149
  }
131
150
 
132
151
  State state = {
133
- .arena = arena,
134
- .xcount = xcount,
135
- .kmin = kmin,
136
- .kmax = kmax,
137
- .apply_deviation = apply_deviation,
138
- .xsorted = xsorted,
139
- .cost = cost,
140
- .splits = splits,
141
- .xsum = xsum,
142
- .xsumsq = xsumsq
152
+ .arena = arena,
153
+ .xcount = xcount,
154
+ .kmin = kmin,
155
+ .kmax = kmax,
156
+ .xsorted = xsorted,
157
+ .cost = cost,
158
+ .splits = splits,
159
+ .xsum = xsum,
160
+ .xsumsq = xsumsq,
161
+ .dissim = criteria
143
162
  };
144
163
 
145
164
 
@@ -157,7 +176,7 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
157
176
 
158
177
  vector_set_f(xsum, i, xsum_prev + diff);
159
178
  vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
160
- matrix_set_f(cost, 0, i, dissimilarity(0, i, xsum, xsumsq));
179
+ matrix_set_f(cost, 0, i, criteria(0, i, xsum, xsumsq));
161
180
  matrix_set_i(splits, 0, i, 0);
162
181
  }
163
182
 
@@ -166,7 +185,7 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
166
185
  fill_row(state, q, imin, xcount - 1);
167
186
  }
168
187
 
169
- uint32_t koptimal = find_koptimal(state);
188
+ uint32_t koptimal = use_gmm ? find_koptimal_gmm(state) : find_koptimal_fast(state);
170
189
 
171
190
  VectorI *sizes = vector_create_i(arena, koptimal);
172
191
  backtrack_sizes(state, sizes, koptimal);
@@ -188,7 +207,7 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
188
207
  return response;
189
208
  }
190
209
 
191
- uint32_t find_koptimal(State state)
210
+ uint32_t find_koptimal_fast(State state)
192
211
  {
193
212
  uint32_t kmin = state.kmin;
194
213
  uint32_t kmax = state.kmax;
@@ -235,8 +254,7 @@ uint32_t find_koptimal(State state)
235
254
  loglikelihood += -(xi - mean) * (xi - mean) / (2.0 * variance);
236
255
  }
237
256
  loglikelihood += npoints * (
238
- (state.apply_deviation ? 0.0 : log(npoints / (LDouble) xcount)) -
239
- (0.5 * log(PIx2 * variance))
257
+ log(npoints / (LDouble) xcount) - (0.5 * log(PIx2 * variance))
240
258
  );
241
259
  } else {
242
260
  loglikelihood += npoints * log(1.0 / bin_width / xcount);
@@ -259,6 +277,101 @@ uint32_t find_koptimal(State state)
259
277
  return kopt;
260
278
  }
261
279
 
280
+ uint32_t find_koptimal_gmm(State state)
281
+ {
282
+ uint32_t kmin = state.kmin;
283
+ uint32_t kmax = state.kmax;
284
+ uint32_t xcount = state.xcount;
285
+
286
+ if (kmin > kmax || xcount < 2) {
287
+ return (kmin < kmax) ? kmin : kmax;
288
+ }
289
+
290
+ Arena *arena = state.arena;
291
+ VectorF *xsorted = state.xsorted;
292
+ uint32_t kopt = kmin;
293
+ LDouble max_bic = 0.0;
294
+ LDouble log_xcount = log((LDouble) xcount);
295
+ VectorF *lambda = vector_create_f(arena, kmax);
296
+ VectorF *mu = vector_create_f(arena, kmax);
297
+ VectorF *sigma2 = vector_create_f(arena, kmax);
298
+ VectorF *coeff = vector_create_f(arena, kmax);
299
+ VectorI *sizes = vector_create_i(arena, kmax);
300
+
301
+ for (uint32_t kouter = kmin; kouter <= kmax; ++kouter)
302
+ {
303
+ uint32_t ileft = 0;
304
+ uint32_t iright;
305
+
306
+ backtrack_sizes(state, sizes, kouter);
307
+
308
+ for (uint32_t k = 0; k < kouter; ++k)
309
+ {
310
+ uint32_t size = vector_get_i(sizes, k);
311
+ vector_set_f(lambda, k, size / (LDouble) xcount);
312
+ iright = ileft + size - 1;
313
+ SegmentStats stats = shifted_data_variance(xsorted, ileft, iright);
314
+
315
+ vector_set_f(mu, k, stats.mean);
316
+ vector_set_f(sigma2, k, stats.variance);
317
+
318
+ if (stats.variance == 0 || size == 1) {
319
+ LDouble dmin;
320
+
321
+ if (ileft > 0 && iright < xcount - 1) {
322
+ LDouble left_diff = vector_get_diff_f(xsorted, ileft, ileft - 1);
323
+ LDouble right_diff = vector_get_diff_f(xsorted, iright + 1, iright);
324
+
325
+ dmin = (left_diff < right_diff) ? left_diff : right_diff;
326
+ } else if (ileft > 0) {
327
+ dmin = vector_get_diff_f(xsorted, ileft, ileft - 1);
328
+ } else {
329
+ dmin = vector_get_diff_f(xsorted, iright + 1, iright);
330
+ }
331
+
332
+ if (stats.variance == 0) vector_set_f(sigma2, k, dmin * dmin / 4.0 / 9.0);
333
+ if (size == 1) vector_set_f(sigma2, k, dmin * dmin);
334
+ }
335
+
336
+ LDouble lambda_k = vector_get_f(lambda, k);
337
+ LDouble sigma2_k = vector_get_f(sigma2, k);
338
+ vector_set_f(coeff, k, lambda_k / sqrt(PIx2 * sigma2_k));
339
+ ileft = iright + 1;
340
+ }
341
+
342
+ LDouble loglikelihood = 0.0;
343
+
344
+ for (uint32_t i = 0; i < xcount; ++i)
345
+ {
346
+ LDouble L = 0.0;
347
+ LDouble xi = vector_get_f(xsorted, i);
348
+
349
+ for (uint32_t k = 0; k < kouter; ++k)
350
+ {
351
+ LDouble coeff_k = vector_get_f(coeff, k);
352
+ LDouble mu_k = vector_get_f(mu, k);
353
+ LDouble sigma2_k = vector_get_f(sigma2, k);
354
+ LDouble x_mu_diff = xi - mu_k;
355
+ L += coeff_k * exp(- x_mu_diff * x_mu_diff / (2.0 * sigma2_k));
356
+ }
357
+ loglikelihood += log(L);
358
+ }
359
+
360
+ LDouble bic = 2 * loglikelihood - (3 * kouter - 1) * log_xcount;
361
+
362
+ if (kouter == kmin) {
363
+ max_bic = bic;
364
+ kopt = kmin;
365
+ } else {
366
+ if (bic > max_bic) {
367
+ max_bic = bic;
368
+ kopt = kouter;
369
+ }
370
+ }
371
+ }
372
+ return kopt;
373
+ }
374
+
262
375
  VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
263
376
  {
264
377
  MatrixI *splits = state.splits;
@@ -336,7 +449,7 @@ void smawk(State state, RowParams rparams, VectorI *split_candidates)
336
449
  }
337
450
  }
338
451
 
339
- void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
452
+ inline void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
340
453
  {
341
454
  uint32_t row = rparams.row;
342
455
  uint32_t imin = rparams.imin;
@@ -345,9 +458,10 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
345
458
  uint32_t n = split_candidates->size;
346
459
  uint32_t istepx2 = istep * 2;
347
460
  uint32_t jl = vector_get_i(split_candidates, 0);
348
- VectorF *xsum = state.xsum;
349
- VectorF *xsumsq = state.xsumsq;
350
- MatrixI *splits = state.splits;
461
+ VectorF *const xsum = state.xsum;
462
+ VectorF *const xsumsq = state.xsumsq;
463
+ MatrixI *const splits = state.splits;
464
+ FnDissim *const dissim = state.dissim;
351
465
 
352
466
  for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
353
467
  while (vector_get_i(split_candidates, r) < jl) r++;
@@ -356,7 +470,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
356
470
  uint32_t cost_base_row = row - 1;
357
471
  uint32_t cost_base_col = rcandidate - 1;
358
472
  LDouble cost =
359
- matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissimilarity(rcandidate, i, xsum, xsumsq);
473
+ matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissim(rcandidate, i, xsum, xsumsq);
360
474
 
361
475
  matrix_set_f(state.cost, row, i, cost);
362
476
  matrix_set_i(state.splits, row, i, rcandidate);
@@ -367,7 +481,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
367
481
  : vector_get_i(split_candidates, n - 1);
368
482
 
369
483
  uint32_t jmax = jh < i ? jh : i;
370
- LDouble sjimin = dissimilarity(jmax, i, xsum, xsumsq);
484
+ LDouble sjimin = dissim(jmax, i, xsum, xsumsq);
371
485
 
372
486
  for (++r; r < n && vector_get_i(split_candidates, r) <= jmax; r++) {
373
487
  uint32_t jabs = vector_get_i(split_candidates, r);
@@ -376,7 +490,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
376
490
  if (jabs < matrix_get_i(splits, row - 1, i)) continue;
377
491
 
378
492
  LDouble cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
379
- LDouble sj = cost_base + dissimilarity(jabs, i, xsum, xsumsq);
493
+ LDouble sj = cost_base + dissim(jabs, i, xsum, xsumsq);
380
494
  LDouble cost_prev = matrix_get_f(state.cost, row, i);
381
495
 
382
496
  if (sj <= cost_prev) {
@@ -392,14 +506,15 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
392
506
  }
393
507
  }
394
508
 
395
- void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
509
+ inline void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
396
510
  {
397
- const uint32_t row = rparams.row;
398
- const uint32_t imin = rparams.imin;
399
- const uint32_t imax = rparams.imax;
400
- const uint32_t istep = rparams.istep;
401
- MatrixF *const cost = state.cost;
402
- MatrixI *const splits = state.splits;
511
+ const uint32_t row = rparams.row;
512
+ const uint32_t imin = rparams.imin;
513
+ const uint32_t imax = rparams.imax;
514
+ const uint32_t istep = rparams.istep;
515
+ MatrixF *const cost = state.cost;
516
+ MatrixI *const splits = state.splits;
517
+ FnDissim *const dissim = state.dissim;
403
518
 
404
519
  uint32_t optimal_split_idx_prev = 0;
405
520
 
@@ -408,7 +523,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
408
523
  const uint32_t optimal_split_idx = optimal_split_idx_prev;
409
524
  const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
410
525
  const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
411
- const LDouble added_cost = dissimilarity(optimal_split, i, state.xsum, state.xsumsq);
526
+ const LDouble added_cost = dissim(optimal_split, i, state.xsum, state.xsumsq);
412
527
 
413
528
  matrix_set_f(cost, row, i, cost_prev + added_cost);
414
529
  matrix_set_i(splits, row, i, optimal_split);
@@ -421,7 +536,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
421
536
  if (split > i) break;
422
537
 
423
538
  LDouble split_cost =
424
- matrix_get_f(cost, row - 1, split - 1) + dissimilarity(split, i, state.xsum, state.xsumsq);
539
+ matrix_get_f(cost, row - 1, split - 1) + dissim(split, i, state.xsum, state.xsumsq);
425
540
 
426
541
  if (split_cost > matrix_get_f(cost, row, i)) continue;
427
542
 
@@ -432,7 +547,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
432
547
  }
433
548
  }
434
549
 
435
- VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
550
+ inline VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
436
551
  {
437
552
  uint32_t imin = rparams.imin;
438
553
  uint32_t row = rparams.row;
@@ -445,6 +560,7 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
445
560
  uint32_t left = 0;
446
561
  uint32_t right = 0;
447
562
  VectorI *pruned = vector_dup_i(split_candidates, state.arena);
563
+ FnDissim *const dissim = state.dissim;
448
564
 
449
565
  while (m > n)
450
566
  {
@@ -452,9 +568,9 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
452
568
  uint32_t j = vector_get_i(pruned, right);
453
569
  uint32_t jnext = vector_get_i(pruned, right + 1);
454
570
  LDouble sl =
455
- matrix_get_f(state.cost, row - 1, j - 1) + dissimilarity(j, i, state.xsum, state.xsumsq);
571
+ matrix_get_f(state.cost, row - 1, j - 1) + dissim(j, i, state.xsum, state.xsumsq);
456
572
  LDouble snext =
457
- matrix_get_f(state.cost, row - 1, jnext - 1) + dissimilarity(jnext, i, state.xsum, state.xsumsq);
573
+ matrix_get_f(state.cost, row - 1, jnext - 1) + dissim(jnext, i, state.xsum, state.xsumsq);
458
574
 
459
575
  if ((sl < snext) && (left < n - 1)) {
460
576
  vector_set_i(pruned, left, j);
@@ -484,7 +600,8 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
484
600
  return pruned;
485
601
  }
486
602
 
487
- inline LDouble dissimilarity(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq) {
603
+ /* L2 aka Euclidean aka Mean dissimilarity criteria */
604
+ inline LDouble dissimilarity_l2(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq) {
488
605
  LDouble sji = 0.0;
489
606
 
490
607
  if (j >= i) return sji;
@@ -501,6 +618,48 @@ inline LDouble dissimilarity(uint32_t j, uint32_t i, VectorF *restrict xsum, Vec
501
618
  return (sji > 0) ? sji : 0.0;
502
619
  }
503
620
 
621
+ /* L1 aka Manhattan aka Median dissimilarity criteria */
622
+ inline LDouble dissimilarity_l1(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict _xsumsq)
623
+ {
624
+ LDouble sji = 0.0;
625
+
626
+ if (j >= i) return sji;
627
+
628
+ if (j > 0) {
629
+ uint32_t median_idx = (i + j) >> 1;
630
+
631
+ if (((i - j + 1) % 2) == 1) {
632
+ sji =
633
+ - vector_get_f(xsum, median_idx - 1)
634
+ + vector_get_f(xsum, j - 1)
635
+ + vector_get_f(xsum, i)
636
+ - vector_get_f(xsum, median_idx);
637
+ } else {
638
+ sji =
639
+ - vector_get_f(xsum, median_idx)
640
+ + vector_get_f(xsum, j - 1)
641
+ + vector_get_f(xsum, i)
642
+ - vector_get_f(xsum, median_idx);
643
+ }
644
+ } else { // j == 0
645
+ uint32_t median_idx = i >> 1;
646
+
647
+ if (((i + 1) % 2) == 1) {
648
+ sji =
649
+ - vector_get_f(xsum, median_idx - 1)
650
+ + vector_get_f(xsum, i)
651
+ - vector_get_f(xsum, median_idx);
652
+ } else {
653
+ sji =
654
+ - vector_get_f(xsum, median_idx)
655
+ + vector_get_f(xsum, i)
656
+ - vector_get_f(xsum, median_idx);
657
+ }
658
+ }
659
+
660
+ return (sji < 0) ? 0.0 : sji;
661
+ }
662
+
504
663
  inline VectorF *vector_create_f(Arena *arena, uint32_t size) {
505
664
  VectorF *v;
506
665
 
@@ -656,7 +815,7 @@ Arena *arena_create(size_t capacity) {
656
815
  }
657
816
 
658
817
  void *arena_alloc(Arena *arena, size_t size) {
659
- size = (size + 7) & ~7;
818
+ size = (size + 0xf) & ~0xf;
660
819
 
661
820
  if (arena->offset + size > arena->capacity) {
662
821
  rb_raise(rb_eNoMemError, "Arena Insufficient Capacity");
@@ -2,18 +2,18 @@
2
2
 
3
3
  module Ckmeans
4
4
  class Clusterer # rubocop:disable Style/Documentation
5
- def initialize(entries, kmin, kmax = kmin, kestimate = :regular)
5
+ def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
6
6
  @xcount = entries.size
7
7
 
8
8
  raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
9
9
  raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
10
10
 
11
- @kmin = kmin
12
- @unique_xcount = entries.uniq.size
13
- @kmax = [@unique_xcount, kmax].min
14
- @xsorted_original = entries.sort
15
- @xsorted = @xsorted_original.map(&:to_f)
16
- @apply_bic_deviation = kestimate == :sensitive
11
+ @kmin = kmin
12
+ @unique_xcount = entries.uniq.size
13
+ @kmax = [@unique_xcount, kmax].min
14
+ @xsorted_original = entries.sort
15
+ @xsorted = @xsorted_original.map(&:to_f)
16
+ @use_gmm = kestimate == :gmm
17
17
  end
18
18
 
19
19
  def clusters
@@ -28,5 +28,3 @@ module Ckmeans
28
28
  end
29
29
  end
30
30
  end
31
-
32
- require "ckmeans/extensions"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- VERSION = "1.0.4"
4
+ VERSION = "2.0.0"
5
5
  end
data/lib/ckmeans.rb CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  require_relative "ckmeans/version"
4
4
  require_relative "ckmeans/clusterer"
5
+ require_relative "ckmedian/clusterer"
6
+ require "ckmeans/extensions"
5
7
 
6
8
  module Ckmeans
7
9
  class Error < StandardError; end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ckmedian
4
+ class Clusterer # rubocop:disable Style/Documentation
5
+ def initialize(entries, kmin, kmax = kmin)
6
+ @xcount = entries.size
7
+
8
+ raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
9
+ raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
10
+
11
+ @kmin = kmin
12
+ @unique_xcount = entries.uniq.size
13
+ @kmax = [@unique_xcount, kmax].min
14
+ @xsorted_original = entries.sort
15
+ @xsorted = @xsorted_original.map(&:to_f)
16
+ end
17
+
18
+ def clusters
19
+ @clusters ||=
20
+ if @unique_xcount <= 1
21
+ [@xsorted_original]
22
+ else
23
+ sorted_group_sizes.each_with_object([]) do |size, groups|
24
+ groups << @xsorted_original.shift(size)
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
metadata CHANGED
@@ -1,13 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ckmeans
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Lebedev
8
+ autorequire:
8
9
  bindir: exe
9
10
  cert_chain: []
10
- date: 2025-05-01 00:00:00.000000000 Z
11
+ date: 2025-06-09 00:00:00.000000000 Z
11
12
  dependencies: []
12
13
  description: Repeatable clustering of unidimensional data
13
14
  email:
@@ -32,6 +33,7 @@ files:
32
33
  - lib/ckmeans.rb
33
34
  - lib/ckmeans/clusterer.rb
34
35
  - lib/ckmeans/version.rb
36
+ - lib/ckmedian/clusterer.rb
35
37
  - sig/ckmeans.rbs
36
38
  homepage: https://github.com/vlebedeff/rb-ckmeans
37
39
  licenses:
@@ -41,6 +43,7 @@ metadata:
41
43
  homepage_uri: https://github.com/vlebedeff/rb-ckmeans
42
44
  source_code_uri: https://github.com/vlebedeff/rb-ckmeans
43
45
  changelog_uri: https://github.com/vlebedeff/rb-ckmeans/blob/main/CHANGELOG.md
46
+ post_install_message:
44
47
  rdoc_options: []
45
48
  require_paths:
46
49
  - lib
@@ -55,7 +58,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
55
58
  - !ruby/object:Gem::Version
56
59
  version: '0'
57
60
  requirements: []
58
- rubygems_version: 3.6.5
61
+ rubygems_version: 3.4.19
62
+ signing_key:
59
63
  specification_version: 4
60
64
  summary: Ruby implementation of Ckmeans.1d.dp
61
65
  test_files: []