ckmeans 1.1.0 → 2.1.0

This diff reflects the content of publicly available package versions as released to one of the supported registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 508f78311a643e1fa8e693e4abf1cdf6df4eb06ff09756fa534ff4a514d0f34f
4
- data.tar.gz: 4ef313387c2e45df4a8afde58e429093023a555a32f4af395a8b79c048a9d98d
3
+ metadata.gz: '08439cddf5410f952a06263d423297de219c86927dea2d0c4538916d9d0c70fd'
4
+ data.tar.gz: 94b1cece717f8538945208519c94046881f2d42c7ea9ba9bd453d063eef39878
5
5
  SHA512:
6
- metadata.gz: ae0f1aff4bd6a78da04123d3728234012d0692ec22396b9529b245c8fa473343314508f053ee02ac876131b243704316948f476840d3c495d4e72eba68e095fd
7
- data.tar.gz: ab95cfdacac4d9204887d4d5c5a7b85aafa3c869ec4b7a851ae994d8f15ddf096e99cb2d1691e6120a7a6bbe51cbde0524c93ad09b5811b7561c43c60a49256c
6
+ metadata.gz: 955d8b19a2a33d46b6adebfa05c9460784f5838d1a5d6d8c82e447be2eacb95da3a2711aebacf96429bf36fafe4d46dfc98fe87025cce4373c734c1cc6a60fb9
7
+ data.tar.gz: 8887346eb2602c071923ade05a549a0eb3df44491538fb79155bd766429dde0084847231e1a4f2dcf6e5e3c14f6dca309f2399339abe378becd4b21d8bc70aa7
data/README.md CHANGED
@@ -18,19 +18,52 @@ gem install ckmeans
18
18
 
19
19
  ## Usage
20
20
 
21
+ ### Basic Clustering
22
+
21
23
  ```rb
22
- # Fixed cluster count
23
- Ckmeans::Clusterer(data, kmin).clusters
24
- Ckmedian::Clusterer(data, kmin).clusters
24
+ # Fixed cluster count (K known in advance)
25
+ Ckmeans::Clusterer.new(data, 3).clusters
26
+ Ckmedian::Clusterer.new(data, 3).clusters
27
+
28
+ # Automatic K selection (tries K from kmin to kmax, picks optimal)
29
+ Ckmeans::Clusterer.new(data, 1, 10).clusters
30
+ Ckmedian::Clusterer.new(data, 1, 10).clusters
31
+ ```
32
+
33
+ ### Choosing Between Ckmeans and Ckmedian
25
34
 
26
- # Estimate optimal cluster count within kmin and kmax
27
- Ckmeans::Clusterer(data, kmin, kmax).clusters
28
- Ckmedian::Clusterer(data, kmin, kmax).clusters
35
+ - **Ckmeans** - Minimizes squared distances (L2). Good for normally distributed data.
36
+ - **Ckmedian** - Minimizes absolute distances (L1). More robust to outliers and data bursts.
29
37
 
30
- # Adjust Bayesian Information Criteria favoring more smaller clusters (Ckmeans only)
31
- Ckmeans::Clusterer(data, kmin, kmax, :sensitive).clusters
38
+ ```rb
39
+ # For clean numerical data
40
+ temperatures = [20.1, 20.2, 25.5, 25.6, 30.1, 30.2]
41
+ Ckmeans::Clusterer.new(temperatures, 1, 5).clusters
42
+ # => [[20.1, 20.2], [25.5, 25.6], [30.1, 30.2]]
43
+
44
+ # For data with outliers (e.g., photo timestamps with bursts)
45
+ timestamps = photos.map(&:taken_at).map(&:to_i)
46
+ Ckmedian::Clusterer.new(timestamps, 1, 20).clusters
32
47
  ```
33
48
 
49
+ ### Stable Estimation (Recommended for Edge Cases)
50
+
51
+ By default, both algorithms use a fast heuristic for estimating K. For datasets with many duplicates, tight clusters, or outliers, use `:stable` for more robust estimation:
52
+
53
+ ```rb
54
+ # Stable estimation (uses statistical mixture models)
55
+ Ckmeans::Clusterer.new(data, 1, 10, :stable).clusters
56
+ Ckmedian::Clusterer.new(data, 1, 10, :stable).clusters
57
+ ```
58
+
59
+ **When to use `:stable`:**
60
+ - Small to medium datasets (< 1000 points)
61
+ - Many duplicate values
62
+ - Clusters with very different sizes
63
+ - Photo/event timeline clustering (bursts and gaps)
64
+
65
+ **Expert users:** `:stable` is an alias for `:gmm` (Gaussian Mixture Model) in Ckmeans and `:lmm` (Laplace Mixture Model) in Ckmedian.
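
For clarity, a short sketch (with made-up data) showing that the expert aliases can be passed directly; per the note above they select the same estimators as `:stable`:

```rb
data = [1, 1, 2, 2, 10, 11, 50]

Ckmeans::Clusterer.new(data, 1, 4, :gmm).clusters   # same estimator as :stable
Ckmedian::Clusterer.new(data, 1, 4, :lmm).clusters  # same estimator as :stable
```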
66
+
34
67
  ## License
35
68
 
36
69
  The gem is available as open source under the terms of the [LGPL v3 License](https://opensource.org/license/lgpl-3-0).
@@ -39,7 +39,6 @@ typedef struct State {
39
39
  uint32_t xcount;
40
40
  uint32_t kmin;
41
41
  uint32_t kmax;
42
- bool apply_deviation;
43
42
  Arena *arena;
44
43
  VectorF *xsorted;
45
44
  MatrixF *cost;
@@ -56,6 +55,8 @@ typedef struct RowParams {
56
55
  uint32_t istep;
57
56
  } RowParams;
58
57
 
58
+ typedef uint32_t (FnFindKOptimal)(State);
59
+
59
60
  typedef struct {
60
61
  LDouble mean;
61
62
  LDouble variance;
@@ -63,7 +64,7 @@ typedef struct {
63
64
 
64
65
  VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
65
66
  VALUE rb_ckmedian_sorted_group_sizes(VALUE self);
66
- VALUE rb_sorted_group_sizes(VALUE self, FnDissim*);
67
+ VALUE rb_sorted_group_sizes(VALUE self, FnDissim*, FnFindKOptimal*);
67
68
 
68
69
  Arena *arena_create(size_t);
69
70
  void *arena_alloc(Arena*, size_t);
@@ -99,8 +100,9 @@ VectorI *prune_candidates(State, RowParams, VectorI*);
99
100
  void fill_even_positions(State, RowParams, VectorI*);
100
101
  SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
101
102
  VectorI *backtrack_sizes(State, VectorI*, uint32_t);
102
- uint32_t find_koptimal(State);
103
-
103
+ uint32_t find_koptimal_fast(State);
104
+ uint32_t find_koptimal_gmm(State);
105
+ uint32_t find_koptimal_lmm(State);
104
106
 
105
107
  void Init_extensions(void) {
106
108
  VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
@@ -118,23 +120,26 @@ void Init_extensions(void) {
118
120
 
119
121
  VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
120
122
  {
121
- return rb_sorted_group_sizes(self, dissimilarity_l2);
123
+ bool use_gmm = RTEST(rb_iv_get(self, "@use_gmm"));
124
+ FnFindKOptimal *find_k = use_gmm ? find_koptimal_gmm : find_koptimal_fast;
125
+ return rb_sorted_group_sizes(self, dissimilarity_l2, find_k);
122
126
  }
123
127
 
124
128
  VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
125
129
  {
126
- return rb_sorted_group_sizes(self, dissimilarity_l1);
130
+ bool use_lmm = RTEST(rb_iv_get(self, "@use_lmm"));
131
+ FnFindKOptimal *find_k = use_lmm ? find_koptimal_lmm : find_koptimal_fast;
132
+ return rb_sorted_group_sizes(self, dissimilarity_l1, find_k);
127
133
  }
128
134
 
129
- VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
135
+ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria, FnFindKOptimal *find_koptimal)
130
136
  {
131
- uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
132
- uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
133
- uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
134
- bool apply_deviation = RTEST(rb_iv_get(self, "@apply_bic_deviation"));
135
- VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
136
- size_t capacity = sizeof(LDouble) * (xcount + 1) * (kmax + 1) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
137
- Arena *arena = arena_create(capacity);
137
+ uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
138
+ uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
139
+ uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
140
+ VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
141
+ size_t capacity = sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
142
+ Arena *arena = arena_create(capacity);
138
143
 
139
144
  if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
140
145
 
@@ -150,17 +155,16 @@ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
150
155
  }
151
156
 
152
157
  State state = {
153
- .arena = arena,
154
- .xcount = xcount,
155
- .kmin = kmin,
156
- .kmax = kmax,
157
- .apply_deviation = apply_deviation,
158
- .xsorted = xsorted,
159
- .cost = cost,
160
- .splits = splits,
161
- .xsum = xsum,
162
- .xsumsq = xsumsq,
163
- .dissim = criteria
158
+ .arena = arena,
159
+ .xcount = xcount,
160
+ .kmin = kmin,
161
+ .kmax = kmax,
162
+ .xsorted = xsorted,
163
+ .cost = cost,
164
+ .splits = splits,
165
+ .xsum = xsum,
166
+ .xsumsq = xsumsq,
167
+ .dissim = criteria
164
168
  };
165
169
 
166
170
 
@@ -209,7 +213,7 @@ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
209
213
  return response;
210
214
  }
211
215
 
212
- uint32_t find_koptimal(State state)
216
+ uint32_t find_koptimal_fast(State state)
213
217
  {
214
218
  uint32_t kmin = state.kmin;
215
219
  uint32_t kmax = state.kmax;
@@ -256,8 +260,7 @@ uint32_t find_koptimal(State state)
256
260
  loglikelihood += -(xi - mean) * (xi - mean) / (2.0 * variance);
257
261
  }
258
262
  loglikelihood += npoints * (
259
- (state.apply_deviation ? 0.0 : log(npoints / (LDouble) xcount)) -
260
- (0.5 * log(PIx2 * variance))
263
+ log(npoints / (LDouble) xcount) - (0.5 * log(PIx2 * variance))
261
264
  );
262
265
  } else {
263
266
  loglikelihood += npoints * log(1.0 / bin_width / xcount);
@@ -280,6 +283,214 @@ uint32_t find_koptimal(State state)
280
283
  return kopt;
281
284
  }
282
285
 
286
+ uint32_t find_koptimal_gmm(State state)
287
+ {
288
+ uint32_t kmin = state.kmin;
289
+ uint32_t kmax = state.kmax;
290
+ uint32_t xcount = state.xcount;
291
+
292
+ if (kmin > kmax || xcount < 2) {
293
+ return (kmin < kmax) ? kmin : kmax;
294
+ }
295
+
296
+ Arena *arena = state.arena;
297
+ VectorF *xsorted = state.xsorted;
298
+ uint32_t kopt = kmin;
299
+ LDouble max_bic = 0.0;
300
+ LDouble log_xcount = log((LDouble) xcount);
301
+ VectorF *lambda = vector_create_f(arena, kmax);
302
+ VectorF *mu = vector_create_f(arena, kmax);
303
+ VectorF *sigma2 = vector_create_f(arena, kmax);
304
+ VectorF *coeff = vector_create_f(arena, kmax);
305
+ VectorI *sizes = vector_create_i(arena, kmax);
306
+
307
+ for (uint32_t kouter = kmin; kouter <= kmax; ++kouter)
308
+ {
309
+ uint32_t ileft = 0;
310
+ uint32_t iright;
311
+
312
+ backtrack_sizes(state, sizes, kouter);
313
+
314
+ for (uint32_t k = 0; k < kouter; ++k)
315
+ {
316
+ uint32_t size = vector_get_i(sizes, k);
317
+ vector_set_f(lambda, k, size / (LDouble) xcount);
318
+ iright = ileft + size - 1;
319
+ SegmentStats stats = shifted_data_variance(xsorted, ileft, iright);
320
+
321
+ vector_set_f(mu, k, stats.mean);
322
+ vector_set_f(sigma2, k, stats.variance);
323
+
324
+ if (stats.variance == 0 || size == 1) {
325
+ LDouble dmin;
326
+
327
+ if (ileft > 0 && iright < xcount - 1) {
328
+ LDouble left_diff = vector_get_diff_f(xsorted, ileft, ileft - 1);
329
+ LDouble right_diff = vector_get_diff_f(xsorted, iright + 1, iright);
330
+
331
+ dmin = (left_diff < right_diff) ? left_diff : right_diff;
332
+ } else if (ileft > 0) {
333
+ dmin = vector_get_diff_f(xsorted, ileft, ileft - 1);
334
+ } else {
335
+ dmin = vector_get_diff_f(xsorted, iright + 1, iright);
336
+ }
337
+
338
+ if (stats.variance == 0) vector_set_f(sigma2, k, dmin * dmin / 4.0 / 9.0);
339
+ if (size == 1) vector_set_f(sigma2, k, dmin * dmin);
340
+ }
341
+
342
+ LDouble lambda_k = vector_get_f(lambda, k);
343
+ LDouble sigma2_k = vector_get_f(sigma2, k);
344
+ vector_set_f(coeff, k, lambda_k / sqrt(PIx2 * sigma2_k));
345
+ ileft = iright + 1;
346
+ }
347
+
348
+ LDouble loglikelihood = 0.0;
349
+
350
+ for (uint32_t i = 0; i < xcount; ++i)
351
+ {
352
+ LDouble L = 0.0;
353
+ LDouble xi = vector_get_f(xsorted, i);
354
+
355
+ for (uint32_t k = 0; k < kouter; ++k)
356
+ {
357
+ LDouble coeff_k = vector_get_f(coeff, k);
358
+ LDouble mu_k = vector_get_f(mu, k);
359
+ LDouble sigma2_k = vector_get_f(sigma2, k);
360
+ LDouble x_mu_diff = xi - mu_k;
361
+ L += coeff_k * exp(- x_mu_diff * x_mu_diff / (2.0 * sigma2_k));
362
+ }
363
+ loglikelihood += log(L);
364
+ }
365
+
366
+ LDouble bic = 2 * loglikelihood - (3 * kouter - 1) * log_xcount;
367
+
368
+ if (kouter == kmin) {
369
+ max_bic = bic;
370
+ kopt = kmin;
371
+ } else {
372
+ if (bic > max_bic) {
373
+ max_bic = bic;
374
+ kopt = kouter;
375
+ }
376
+ }
377
+ }
378
+ return kopt;
379
+ }
380
+
381
+ uint32_t find_koptimal_lmm(State state)
382
+ {
383
+ uint32_t kmin = state.kmin;
384
+ uint32_t kmax = state.kmax;
385
+ uint32_t xcount = state.xcount;
386
+
387
+ if (kmin > kmax || xcount < 2) {
388
+ return (kmin < kmax) ? kmin : kmax;
389
+ }
390
+
391
+ Arena *arena = state.arena;
392
+ VectorF *xsorted = state.xsorted;
393
+ uint32_t kopt = kmin;
394
+ LDouble max_bic = 0.0;
395
+ LDouble log_xcount = log((LDouble) xcount);
396
+ VectorF *lambda = vector_create_f(arena, kmax);
397
+ VectorF *mu = vector_create_f(arena, kmax); /* median */
398
+ VectorF *scale = vector_create_f(arena, kmax); /* mean absolute deviation from the cluster median (Laplace scale) */
399
+ VectorF *coeff = vector_create_f(arena, kmax);
400
+ VectorI *sizes = vector_create_i(arena, kmax);
401
+
402
+ for (uint32_t kouter = kmin; kouter <= kmax; ++kouter)
403
+ {
404
+ uint32_t ileft = 0;
405
+ uint32_t iright;
406
+
407
+ backtrack_sizes(state, sizes, kouter);
408
+
409
+ for (uint32_t k = 0; k < kouter; ++k)
410
+ {
411
+ uint32_t size = vector_get_i(sizes, k);
412
+ vector_set_f(lambda, k, size / (LDouble) xcount);
413
+ iright = ileft + size - 1;
414
+
415
+ uint32_t median_idx = (ileft + iright) / 2;
416
+ LDouble median;
417
+ if ((size % 2) == 1) {
418
+ median = vector_get_f(xsorted, median_idx);
419
+ } else {
420
+ median = (vector_get_f(xsorted, median_idx) + vector_get_f(xsorted, median_idx + 1)) / 2.0;
421
+ }
422
+ vector_set_f(mu, k, median);
423
+
424
+ LDouble mad = 0.0;
425
+ for (uint32_t i = ileft; i <= iright; ++i) {
426
+ LDouble xi = vector_get_f(xsorted, i);
427
+ mad += fabs(xi - median);
428
+ }
429
+ mad = mad / size;
430
+ vector_set_f(scale, k, mad);
431
+
432
+ /* Handle edge case: MAD = 0 (all points are the same) or size = 1 */
433
+ if (mad == 0 || size == 1) {
434
+ LDouble dmin;
435
+
436
+ if (ileft > 0 && iright < xcount - 1) {
437
+ LDouble left_diff = vector_get_diff_f(xsorted, ileft, ileft - 1);
438
+ LDouble right_diff = vector_get_diff_f(xsorted, iright + 1, iright);
439
+
440
+ dmin = (left_diff < right_diff) ? left_diff : right_diff;
441
+ } else if (ileft > 0) {
442
+ dmin = vector_get_diff_f(xsorted, ileft, ileft - 1);
443
+ } else {
444
+ dmin = vector_get_diff_f(xsorted, iright + 1, iright);
445
+ }
446
+
447
+ if (mad == 0) vector_set_f(scale, k, dmin / 6.0);
448
+ if (size == 1) vector_set_f(scale, k, dmin);
449
+ }
450
+
451
+ /* Laplace coefficient: lambda_k / (2 * b_k) */
452
+ LDouble lambda_k = vector_get_f(lambda, k);
453
+ LDouble scale_k = vector_get_f(scale, k);
454
+ vector_set_f(coeff, k, lambda_k / (2.0 * scale_k));
455
+ ileft = iright + 1;
456
+ }
457
+
458
+ LDouble loglikelihood = 0.0;
459
+
460
+ for (uint32_t i = 0; i < xcount; ++i)
461
+ {
462
+ LDouble L = 0.0;
463
+ LDouble xi = vector_get_f(xsorted, i);
464
+
465
+ for (uint32_t k = 0; k < kouter; ++k)
466
+ {
467
+ LDouble coeff_k = vector_get_f(coeff, k);
468
+ LDouble mu_k = vector_get_f(mu, k);
469
+ LDouble scale_k = vector_get_f(scale, k);
470
+ LDouble x_mu_abs = fabs(xi - mu_k);
471
+ /* Laplace PDF: (1/(2b)) * exp(-|x-μ|/b) */
472
+ L += coeff_k * exp(-x_mu_abs / scale_k);
473
+ }
474
+ loglikelihood += log(L);
475
+ }
476
+
477
+ /* BIC = 2*logL - (3k-1)*log(n) */
478
+ /* Parameters: k-1 mixing proportions + k medians + k scales = 3k-1 */
479
+ LDouble bic = 2 * loglikelihood - (3 * kouter - 1) * log_xcount;
480
+
481
+ if (kouter == kmin) {
482
+ max_bic = bic;
483
+ kopt = kmin;
484
+ } else {
485
+ if (bic > max_bic) {
486
+ max_bic = bic;
487
+ kopt = kouter;
488
+ }
489
+ }
490
+ }
491
+ return kopt;
492
+ }
493
+
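
For reference, a sketch of the quantities the two new estimators above compute, read directly from the code (λ_k = cluster weight, μ_k / m_k = cluster mean / median, σ_k² / b_k = cluster variance / Laplace scale, n = xcount); both then keep the K in [kmin, kmax] with the largest BIC:

```latex
% Gaussian mixture log-likelihood scored by find_koptimal_gmm
\log L_{\mathrm{GMM}}(K) = \sum_{i=1}^{n} \log \sum_{k=1}^{K}
  \frac{\lambda_k}{\sqrt{2\pi\sigma_k^2}}
  \exp\!\left(-\frac{(x_i - \mu_k)^2}{2\sigma_k^2}\right)

% Laplace mixture log-likelihood scored by find_koptimal_lmm
\log L_{\mathrm{LMM}}(K) = \sum_{i=1}^{n} \log \sum_{k=1}^{K}
  \frac{\lambda_k}{2 b_k}
  \exp\!\left(-\frac{\lvert x_i - m_k \rvert}{b_k}\right)

% Selection criterion used by both ((k-1) weights + k locations + k scales = 3k-1 parameters)
\mathrm{BIC}(K) = 2 \log L(K) - (3K - 1)\log n
```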
283
494
  VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
284
495
  {
285
496
  MatrixI *splits = state.splits;
@@ -287,12 +498,12 @@ VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
287
498
  uint32_t right = xcount - 1;
288
499
  uint32_t left = 0;
289
500
 
290
- // Common case works with `i` remaining unsigned and unconditional assignment of the next `left` and `right`
501
+ /* Common case works with `i` remaining unsigned and unconditional assignment of the next `left` and `right` */
291
502
  for (uint32_t i = k - 1; i > 0; i--, right = left - 1) {
292
503
  left = matrix_get_i(splits, i, right);
293
504
  vector_set_i(sizes, i, right - left + 1);
294
505
  }
295
- // Special case outside of the loop removing the need for conditionals
506
+ /* Special case outside of the loop removing the need for conditionals */
296
507
  left = matrix_get_i(splits, 0, right);
297
508
  vector_set_i(sizes, 0, right - left + 1);
298
509
 
@@ -416,12 +627,12 @@ inline void fill_even_positions(State state, RowParams rparams, VectorI *split_c
416
627
 
417
628
  inline void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
418
629
  {
419
- const uint32_t row = rparams.row;
420
- const uint32_t imin = rparams.imin;
421
- const uint32_t imax = rparams.imax;
422
- const uint32_t istep = rparams.istep;
423
- MatrixF *const cost = state.cost;
424
- MatrixI *const splits = state.splits;
630
+ const uint32_t row = rparams.row;
631
+ const uint32_t imin = rparams.imin;
632
+ const uint32_t imax = rparams.imax;
633
+ const uint32_t istep = rparams.istep;
634
+ MatrixF *const cost = state.cost;
635
+ MatrixI *const splits = state.splits;
425
636
  FnDissim *const dissim = state.dissim;
426
637
 
427
638
  uint32_t optimal_split_idx_prev = 0;
@@ -723,7 +934,7 @@ Arena *arena_create(size_t capacity) {
723
934
  }
724
935
 
725
936
  void *arena_alloc(Arena *arena, size_t size) {
726
- size = (size + 7) & ~7;
937
+ size = (size + 0xf) & ~0xf;
727
938
 
728
939
  if (arena->offset + size > arena->capacity) {
729
940
  rb_raise(rb_eNoMemError, "Arena Insufficient Capacity");
@@ -1,19 +1,37 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- class Clusterer # rubocop:disable Style/Documentation
5
- def initialize(entries, kmin, kmax = kmin, kestimate = :regular)
4
+ # Optimal k-means clustering for univariate (1D) data using dynamic programming.
5
+ # Minimizes within-cluster sum of squared distances (L2 norm).
6
+ class Clusterer
7
+ # Creates a new Ckmeans clusterer.
8
+ #
9
+ # @param entries [Array<Numeric>] The data points to cluster
10
+ # @param kmin [Integer] Minimum number of clusters to consider
11
+ # @param kmax [Integer] Maximum number of clusters to consider (defaults to kmin for fixed K)
12
+ # @param kestimate [Symbol] Method for estimating optimal K:
13
+ # - :fast - Quick heuristic using implicit Gaussian assumption (best for large datasets)
14
+ # - :stable - Model-based estimation using Gaussian Mixture Model (better for duplicates/edge cases)
15
+ # - :gmm - Alias for :stable (Gaussian Mixture Model)
16
+ #
17
+ # @example Fixed number of clusters
18
+ # Ckmeans::Clusterer.new([1, 2, 3, 100, 101], 2).clusters
19
+ # # => [[1, 2, 3], [100, 101]]
20
+ #
21
+ # @example Automatic K selection with stable estimation
22
+ # Ckmeans::Clusterer.new([1, 1, 1, 5, 5, 5, 10, 10, 10], 1, 5, :stable).clusters
23
+ def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
6
24
  @xcount = entries.size
7
25
 
8
26
  raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
9
27
  raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
10
28
 
11
- @kmin = kmin
12
- @unique_xcount = entries.uniq.size
13
- @kmax = [@unique_xcount, kmax].min
14
- @xsorted_original = entries.sort
15
- @xsorted = @xsorted_original.map(&:to_f)
16
- @apply_bic_deviation = kestimate == :sensitive
29
+ @kmin = kmin
30
+ @unique_xcount = entries.uniq.size
31
+ @kmax = [@unique_xcount, kmax].min
32
+ @xsorted_original = entries.sort
33
+ @xsorted = @xsorted_original.map(&:to_f)
34
+ @use_gmm = %i[gmm stable].include?(kestimate)
17
35
  end
18
36
 
19
37
  def clusters
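
A quick sketch (made-up data) of how the new `kestimate` argument is used: both calls run the same dynamic-programming clustering, but the second selects K with the GMM-backed estimator documented above:

```rb
data = [1.0] * 5 + [1.1] * 5 + [9.0] * 5   # duplicate-heavy input

Ckmeans::Clusterer.new(data, 1, 6).clusters           # default :fast heuristic
Ckmeans::Clusterer.new(data, 1, 6, :stable).clusters  # :stable (alias :gmm)
```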
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- VERSION = "1.1.0"
4
+ VERSION = "2.1.0"
5
5
  end
@@ -1,8 +1,28 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmedian
4
- class Clusterer # rubocop:disable Style/Documentation
5
- def initialize(entries, kmin, kmax = kmin)
4
+ # Optimal k-median clustering for univariate (1D) data using dynamic programming.
5
+ # Minimizes within-cluster sum of absolute deviations (L1 norm).
6
+ # More robust to outliers than k-means.
7
+ class Clusterer
8
+ # Creates a new Ckmedian clusterer.
9
+ #
10
+ # @param entries [Array<Numeric>] The data points to cluster
11
+ # @param kmin [Integer] Minimum number of clusters to consider
12
+ # @param kmax [Integer] Maximum number of clusters to consider (defaults to kmin for fixed K)
13
+ # @param kestimate [Symbol] Method for estimating optimal K:
14
+ # - :fast - Quick heuristic using implicit Gaussian assumption (best for large datasets)
15
+ # - :stable - Model-based estimation using Laplace Mixture Model (better for outliers/bursts)
16
+ # - :lmm - Alias for :stable (Laplace Mixture Model)
17
+ #
18
+ # @example Fixed number of clusters
19
+ # Ckmedian::Clusterer.new([1, 2, 3, 100, 101], 2).clusters
20
+ # # => [[1, 2, 3], [100, 101]]
21
+ #
22
+ # @example Photo timeline clustering (robust to bursts and outliers)
23
+ # timestamps = photos.map(&:taken_at).map(&:to_i)
24
+ # Ckmedian::Clusterer.new(timestamps, 1, 20, :stable).clusters
25
+ def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
6
26
  @xcount = entries.size
7
27
 
8
28
  raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
@@ -13,6 +33,7 @@ module Ckmedian
13
33
  @kmax = [@unique_xcount, kmax].min
14
34
  @xsorted_original = entries.sort
15
35
  @xsorted = @xsorted_original.map(&:to_f)
36
+ @use_lmm = %i[lmm stable].include?(kestimate)
16
37
  end
17
38
 
18
39
  def clusters
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ckmeans
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Lebedev
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-05-23 00:00:00.000000000 Z
11
+ date: 2025-12-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Repeatable clustering of unidimensional data
14
14
  email: