ckmeans 2.0.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1d63d8f65d386bf27082e0a65b1ea82a7d150394b1424ab5c2c274e139f91482
4
- data.tar.gz: 1f3c4e91fcc9f3bda3d83521cac164ff83e3e5095705cd15420c6278635fc266
3
+ metadata.gz: 231337d4b73a838b8a9326936c4ae4c003db103108b6622c22a09a6a44bf4e31
4
+ data.tar.gz: c0d97cd2fd4b1fd6693305900e9e329e218993fb735b9d24cd1c7281eda14e8a
5
5
  SHA512:
6
- metadata.gz: 0101cd5f6d5ba925d8f37cc73416008ace4ffce7ea33a437e0189549ede4cbc23b7284de2fe28af181ddf08396b74225c67626e94ce015d54ac14fde17b53bda
7
- data.tar.gz: abbcc012e9378ea1fbf15566fd47691bd4cecaaeaf95947c45414dfb7b304db87d803120749aab3ccbf806ab90dd554cce2461f340c348e4f1b820f47be421a2
6
+ metadata.gz: 04be90532ac9498a184025d849b14a0d34fb61d00352c818a6e7495ae753666689ecef0d542fc6e95f118e5c83fcc29560baee2b339762158d47811a1d4164cd
7
+ data.tar.gz: '019cc8ff7f3fa2648faa03997fc88489718e944d13760b1e28eed3c4e48d48da56fd87ced9e5d4499e9c6cfb962cba6c02889dd0d3a53fd88ccb0a0396e00551'
data/README.md CHANGED
@@ -18,33 +18,52 @@ gem install ckmeans
18
18
 
19
19
  ## Usage
20
20
 
21
- ### Fixed Cluster Count
21
+ ### Basic Clustering
22
22
 
23
23
  ```rb
24
- # Fixed cluster count
25
- Ckmeans::Clusterer(data, kmin).clusters
26
- Ckmedian::Clusterer(data, kmin).clusters
24
+ # Fixed cluster count (K known in advance)
25
+ Ckmeans::Clusterer.new(data, 3).clusters
26
+ Ckmedian::Clusterer.new(data, 3).clusters
27
+
28
+ # Automatic K selection (tries K from kmin to kmax, picks optimal)
29
+ Ckmeans::Clusterer.new(data, 1, 10).clusters
30
+ Ckmedian::Clusterer.new(data, 1, 10).clusters
27
31
  ```
28
32
 
29
- ### Estimate optimal cluster count within kmin and kmax
33
+ ### Choosing Between Ckmeans and Ckmedian
34
+
35
+ - **Ckmeans** - Minimizes squared distances (L2). Good for normally distributed data.
36
+ - **Ckmedian** - Minimizes absolute distances (L1). More robust to outliers and data bursts.
30
37
 
31
38
  ```rb
32
- Ckmeans::Clusterer(data, kmin, kmax).clusters
33
- Ckmedian::Clusterer(data, kmin, kmax).clusters
39
+ # For clean numerical data
40
+ temperatures = [20.1, 20.2, 25.5, 25.6, 30.1, 30.2]
41
+ Ckmeans::Clusterer.new(temperatures, 1, 5).clusters
42
+ # => [[20.1, 20.2], [25.5, 25.6], [30.1, 30.2]]
43
+
44
+ # For data with outliers (e.g., photo timestamps with bursts)
45
+ timestamps = photos.map(&:taken_at).map(&:to_i)
46
+ Ckmedian::Clusterer.new(timestamps, 1, 20).clusters
34
47
  ```
35
48
 
36
- ### Fast & Stable Estimation of K
49
+ ### Stable Estimation (Recommended for Edge Cases)
37
50
 
38
- For big collections without many duplicates, use regular estimation.
39
- For relatively small sets or sets with many duplicates use Gaussian Mixture Model (GMM)-based estimation.
40
- It works slower but is more resilient for various data patterns like big numbers of duplicates or clusters with different
41
- numbers of elements.
51
+ By default, both algorithms use a fast heuristic for estimating K. For datasets with many duplicates, tight clusters, or outliers, use `:stable` for more robust estimation:
42
52
 
43
53
  ```rb
44
- Ckmeans::Clusterer(data, kmin, kmax, :gmm).clusters
45
- Ckmedian::Clusterer(data, kmin, kmax, :gmm).clusters
54
+ # Stable estimation (uses statistical mixture models)
55
+ Ckmeans::Clusterer.new(data, 1, 10, :stable).clusters
56
+ Ckmedian::Clusterer.new(data, 1, 10, :stable).clusters
46
57
  ```
47
58
 
59
+ **When to use `:stable`:**
60
+ - Small to medium datasets (< 1000 points)
61
+ - Many duplicate values
62
+ - Clusters with very different sizes
63
+ - Photo/event timeline clustering (bursts and gaps)
64
+
65
+ **Expert users:** `:stable` is an alias for `:gmm` (Gaussian Mixture Model) in Ckmeans and `:lmm` (Laplace Mixture Model) in Ckmedian.
66
+
48
67
  ## License
49
68
 
50
69
  The gem is available as open source under the terms of the [LGPL v3 License](https://opensource.org/license/lgpl-3-0).
@@ -0,0 +1,281 @@
1
+ /* SMAWK algorithm implementation template
2
+ *
3
+ * This file is designed to be included multiple times with different DISSIM_SUFFIX and DISSIM macro definitions
4
+ * to generate L1 and L2 specific versions of the clustering algorithm.
5
+ *
6
+ * Before including this file, define:
7
+ * DISSIM_SUFFIX - suffix for function names (e.g., l1, l2)
8
+ * DISSIM(j, i, xsum, xsumsq) - macro that calls the dissimilarity function
9
+ *
10
+ * Example:
11
+ * #define DISSIM_SUFFIX l2
12
+ * #define DISSIM(j, i, xsum, xsumsq) dissimilarity_l2(j, i, xsum, xsumsq)
13
+ * #include "algorithm.inc"
14
+ * #undef DISSIM
15
+ * #undef DISSIM_SUFFIX
16
+ */
17
+
18
+ #ifndef DISSIM_SUFFIX
19
+ #error "DISSIM_SUFFIX must be defined before including algorithm.inc"
20
+ #endif
21
+
22
+ #ifndef DISSIM
23
+ #error "DISSIM must be defined before including algorithm.inc"
24
+ #endif
25
+
26
+ /* Helper macros for token pasting */
27
+ #define CONCAT_IMPL(a, b) a##_##b
28
+ #define CONCAT(a, b) CONCAT_IMPL(a, b)
29
+ #define FUNC_NAME(name) CONCAT(name, DISSIM_SUFFIX)
30
+
31
+ static inline void FUNC_NAME(fill_even_positions)(State state, RowParams rparams, VectorI *split_candidates)
32
+ {
33
+ uint32_t row = rparams.row;
34
+ uint32_t imin = rparams.imin;
35
+ uint32_t imax = rparams.imax;
36
+ uint32_t istep = rparams.istep;
37
+ uint32_t n = split_candidates->size;
38
+ uint32_t istepx2 = istep * 2;
39
+ uint32_t jl = vector_get_i(split_candidates, 0);
40
+ VectorF *const xsum = state.xsum;
41
+ VectorF *const xsumsq = state.xsumsq;
42
+ MatrixI *const splits = state.splits;
43
+
44
+ for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
45
+ while (vector_get_i(split_candidates, r) < jl) r++;
46
+
47
+ uint32_t rcandidate = vector_get_i(split_candidates, r);
48
+ uint32_t cost_base_row = row - 1;
49
+ uint32_t cost_base_col = rcandidate - 1;
50
+ LDouble cost = matrix_get_f(state.cost, cost_base_row, cost_base_col)
51
+ + DISSIM(rcandidate, i, xsum, xsumsq);
52
+
53
+ matrix_set_f(state.cost, row, i, cost);
54
+ matrix_set_i(state.splits, row, i, rcandidate);
55
+
56
+ uint32_t jh = (i + istep) <= imax
57
+ ? matrix_get_i(splits, row, i + istep)
58
+ : vector_get_i(split_candidates, n - 1);
59
+
60
+ uint32_t jmax = jh < i ? jh : i;
61
+ LDouble sjimin = DISSIM(jmax, i, xsum, xsumsq);
62
+
63
+ for (++r; r < n && vector_get_i(split_candidates, r) <= jmax; r++) {
64
+ uint32_t jabs = vector_get_i(split_candidates, r);
65
+
66
+ if (jabs > i) break;
67
+ if (jabs < matrix_get_i(splits, row - 1, i)) continue;
68
+
69
+ LDouble cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
70
+ LDouble sj = cost_base + DISSIM(jabs, i, xsum, xsumsq);
71
+ LDouble cost_prev = matrix_get_f(state.cost, row, i);
72
+
73
+ if (sj <= cost_prev) {
74
+ matrix_set_f(state.cost, row, i, sj);
75
+ matrix_set_i(state.splits, row, i, jabs);
76
+ } else if (cost_base + sjimin > cost_prev) {
77
+ break;
78
+ }
79
+ }
80
+
81
+ r--;
82
+ jl = jh;
83
+ }
84
+ }
85
+
86
+ static inline void FUNC_NAME(find_min_from_candidates)(State state, RowParams rparams, VectorI *split_candidates)
87
+ {
88
+ const uint32_t row = rparams.row;
89
+ const uint32_t imin = rparams.imin;
90
+ const uint32_t imax = rparams.imax;
91
+ const uint32_t istep = rparams.istep;
92
+ MatrixF *const cost = state.cost;
93
+ MatrixI *const splits = state.splits;
94
+
95
+ uint32_t optimal_split_idx_prev = 0;
96
+
97
+ for (uint32_t i = imin; i <= imax; i += istep)
98
+ {
99
+ const uint32_t optimal_split_idx = optimal_split_idx_prev;
100
+ const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
101
+ const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
102
+ const LDouble added_cost = DISSIM(optimal_split, i, state.xsum, state.xsumsq);
103
+
104
+ matrix_set_f(cost, row, i, cost_prev + added_cost);
105
+ matrix_set_i(splits, row, i, optimal_split);
106
+
107
+ for (uint32_t r = optimal_split_idx + 1; r < split_candidates->size; r++)
108
+ {
109
+ uint32_t split = vector_get_i(split_candidates, r);
110
+
111
+ if (split < matrix_get_i(splits, row - 1, i)) continue;
112
+ if (split > i) break;
113
+
114
+ LDouble split_cost = matrix_get_f(cost, row - 1, split - 1) + DISSIM(split, i, state.xsum, state.xsumsq);
115
+
116
+ if (split_cost > matrix_get_f(cost, row, i)) continue;
117
+
118
+ matrix_set_f(cost, row, i, split_cost);
119
+ matrix_set_i(splits, row, i, split);
120
+ optimal_split_idx_prev = r;
121
+ }
122
+ }
123
+ }
124
+
125
+ static inline VectorI *FUNC_NAME(prune_candidates)(State state, RowParams rparams, VectorI *split_candidates)
126
+ {
127
+ uint32_t imin = rparams.imin;
128
+ uint32_t row = rparams.row;
129
+ uint32_t istep = rparams.istep;
130
+ uint32_t n = ((rparams.imax - imin) / istep) + 1;
131
+ uint32_t m = split_candidates->size;
132
+
133
+ if (n >= m) return split_candidates;
134
+
135
+ uint32_t left = 0;
136
+ uint32_t right = 0;
137
+ VectorI *pruned = vector_dup_i(split_candidates, state.arena);
138
+
139
+ while (m > n)
140
+ {
141
+ uint32_t i = imin + left * istep;
142
+ uint32_t j = vector_get_i(pruned, right);
143
+ uint32_t jnext = vector_get_i(pruned, right + 1);
144
+ LDouble sl = matrix_get_f(state.cost, row - 1, j - 1) + DISSIM(j, i, state.xsum, state.xsumsq);
145
+ LDouble snext = matrix_get_f(state.cost, row - 1, jnext - 1) + DISSIM(jnext, i, state.xsum, state.xsumsq);
146
+
147
+ if ((sl < snext) && (left < n - 1)) {
148
+ vector_set_i(pruned, left, j);
149
+ left++;
150
+ right++;
151
+ } else if ((sl < snext) && (left == n - 1)) {
152
+ right++;
153
+ m--;
154
+ vector_set_i(pruned, right, j);
155
+ } else {
156
+ if (left > 0) {
157
+ vector_set_i(pruned, right, vector_get_i(pruned, --left));
158
+ } else {
159
+ right++;
160
+ }
161
+
162
+ m--;
163
+ }
164
+ }
165
+
166
+ for (uint32_t i = left; i < m; i++) {
167
+ vector_set_i(pruned, i, vector_get_i(pruned, right++));
168
+ }
169
+
170
+ vector_downsize_i(pruned, m);
171
+
172
+ return pruned;
173
+ }
174
+
175
+ void FUNC_NAME(smawk)(State state, RowParams rparams, VectorI *split_candidates)
176
+ {
177
+ const uint32_t imin = rparams.imin;
178
+ const uint32_t imax = rparams.imax;
179
+ const uint32_t istep = rparams.istep;
180
+
181
+ if ((imax - imin) <= (0 * istep)) {
182
+ FUNC_NAME(find_min_from_candidates)(state, rparams, split_candidates);
183
+ } else {
184
+ VectorI *odd_candidates = FUNC_NAME(prune_candidates)(state, rparams, split_candidates);
185
+ uint32_t istepx2 = istep * 2;
186
+ uint32_t imin_odd = imin + istep;
187
+ uint32_t imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2);
188
+ RowParams rparams_odd = { .row = rparams.row, .imin = imin_odd, .imax = imax_odd, .istep = istepx2 };
189
+
190
+ FUNC_NAME(smawk)(state, rparams_odd, odd_candidates);
191
+ FUNC_NAME(fill_even_positions)(state, rparams, split_candidates);
192
+ }
193
+ }
194
+
195
+ void FUNC_NAME(fill_row)(State state, uint32_t q, uint32_t imin, uint32_t imax)
196
+ {
197
+ uint32_t size = imax - q + 1;
198
+ VectorI *split_candidates = vector_create_i(state.arena, size);
199
+ for (uint32_t i = 0; i < size; i++) {
200
+ vector_set_i(split_candidates, i, q + i);
201
+ }
202
+ RowParams rparams = { .row = q, .imin = imin, .imax = imax, .istep = 1 };
203
+ FUNC_NAME(smawk)(state, rparams, split_candidates);
204
+ }
205
+
206
+ VALUE FUNC_NAME(rb_sorted_group_sizes)(VALUE self, FnFindKOptimal *find_koptimal)
207
+ {
208
+ uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
209
+ uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
210
+ uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
211
+ VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
212
+ size_t capacity = sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
213
+ Arena *arena = arena_create(capacity);
214
+
215
+ if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
216
+
217
+ MatrixF *cost = matrix_create_f(arena, kmax, xcount);
218
+ MatrixI *splits = matrix_create_i(arena, kmax, xcount);
219
+ VectorF *xsorted = vector_create_f(arena, xcount);
220
+ VectorF *xsum = vector_create_f(arena, xcount);
221
+ VectorF *xsumsq = vector_create_f(arena, xcount);
222
+
223
+ for (uint32_t i = 0; i < xcount; i++) {
224
+ LDouble xi = NUM2DBL(rb_ary_entry(rb_xsorted, i));
225
+ vector_set_f(xsorted, i, xi);
226
+ }
227
+
228
+ State state = {
229
+ .arena = arena,
230
+ .xcount = xcount,
231
+ .kmin = kmin,
232
+ .kmax = kmax,
233
+ .xsorted = xsorted,
234
+ .cost = cost,
235
+ .splits = splits,
236
+ .xsum = xsum,
237
+ .xsumsq = xsumsq
238
+ };
239
+
240
+ LDouble shift = vector_get_f(xsorted, xcount / 2);
241
+ LDouble diff_initial = vector_get_f(xsorted, 0) - shift;
242
+
243
+ vector_set_f(xsum, 0, diff_initial);
244
+ vector_set_f(xsumsq, 0, diff_initial * diff_initial);
245
+
246
+ for (uint32_t i = 1; i < xcount; i++) {
247
+ LDouble xi = vector_get_f(xsorted, i);
248
+ LDouble xsum_prev = vector_get_f(xsum, i - 1);
249
+ LDouble xsumsq_prev = vector_get_f(xsumsq, i - 1);
250
+ LDouble diff = xi - shift;
251
+
252
+ vector_set_f(xsum, i, xsum_prev + diff);
253
+ vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
254
+ matrix_set_f(cost, 0, i, DISSIM(0, i, xsum, xsumsq));
255
+ matrix_set_i(splits, 0, i, 0);
256
+ }
257
+
258
+ for (uint32_t q = 1; q <= kmax - 1; q++) {
259
+ uint32_t imin = (q < kmax - 1) ? ((q > 1) ? q : 1) : xcount - 1;
260
+ FUNC_NAME(fill_row)(state, q, imin, xcount - 1);
261
+ }
262
+
263
+ uint32_t koptimal = find_koptimal(state);
264
+
265
+ VectorI *sizes = vector_create_i(arena, koptimal);
266
+ backtrack_sizes(state, sizes, koptimal);
267
+
268
+ VALUE response = rb_ary_new2(sizes->size);
269
+ for (uint32_t i = 0; i < sizes->size; i++) {
270
+ VALUE size = LONG2NUM(vector_get_i(sizes, i));
271
+ rb_ary_store(response, i, size);
272
+ }
273
+
274
+ arena_destroy(arena);
275
+
276
+ return response;
277
+ }
278
+
279
+ #undef CONCAT_IMPL
280
+ #undef CONCAT
281
+ #undef FUNC_NAME
@@ -0,0 +1,65 @@
1
+ #ifndef DISSIMILARITY_H
2
+ #define DISSIMILARITY_H
3
+
4
+ /* L2 aka Euclidean aka Mean dissimilarity criteria */
5
+ static inline LDouble dissimilarity_l2(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq)
6
+ {
7
+ LDouble sji = 0.0;
8
+
9
+ if (j >= i) return sji;
10
+
11
+ if (j > 0) {
12
+ LDouble segment_diff = vector_get_diff_f(xsum, i, j - 1);
13
+ uint32_t segment_size = i - j + 1;
14
+ sji = vector_get_diff_f(xsumsq, i, j - 1) - (segment_diff * segment_diff / segment_size);
15
+ } else {
16
+ LDouble xsumi = vector_get_f(xsum, i);
17
+ sji = vector_get_f(xsumsq, i) - (xsumi * xsumi / (i + 1));
18
+ }
19
+
20
+ return (sji > 0) ? sji : 0.0;
21
+ }
22
+
23
+ /* L1 aka Manhattan aka Median dissimilarity criteria */
24
+ static inline LDouble dissimilarity_l1(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict _xsumsq)
25
+ {
26
+ LDouble sji = 0.0;
27
+
28
+ if (j >= i) return sji;
29
+
30
+ if (j > 0) {
31
+ uint32_t median_idx = (i + j) >> 1;
32
+
33
+ if (((i - j + 1) % 2) == 1) {
34
+ sji =
35
+ - vector_get_f(xsum, median_idx - 1)
36
+ + vector_get_f(xsum, j - 1)
37
+ + vector_get_f(xsum, i)
38
+ - vector_get_f(xsum, median_idx);
39
+ } else {
40
+ sji =
41
+ - vector_get_f(xsum, median_idx)
42
+ + vector_get_f(xsum, j - 1)
43
+ + vector_get_f(xsum, i)
44
+ - vector_get_f(xsum, median_idx);
45
+ }
46
+ } else { // j == 0
47
+ uint32_t median_idx = i >> 1;
48
+
49
+ if (((i + 1) % 2) == 1) {
50
+ sji =
51
+ - vector_get_f(xsum, median_idx - 1)
52
+ + vector_get_f(xsum, i)
53
+ - vector_get_f(xsum, median_idx);
54
+ } else {
55
+ sji =
56
+ - vector_get_f(xsum, median_idx)
57
+ + vector_get_f(xsum, i)
58
+ - vector_get_f(xsum, median_idx);
59
+ }
60
+ }
61
+
62
+ return (sji < 0) ? 0.0 : sji;
63
+ }
64
+
65
+ #endif /* DISSIMILARITY_H */
@@ -33,8 +33,6 @@ typedef struct VectorI {
33
33
  uint32_t *values;
34
34
  } VectorI;
35
35
 
36
- typedef LDouble (FnDissim)(uint32_t, uint32_t, VectorF*, VectorF*);
37
-
38
36
  typedef struct State {
39
37
  uint32_t xcount;
40
38
  uint32_t kmin;
@@ -45,7 +43,6 @@ typedef struct State {
45
43
  MatrixI *splits;
46
44
  VectorF *xsum;
47
45
  VectorF *xsumsq;
48
- FnDissim *dissim;
49
46
  } State;
50
47
 
51
48
  typedef struct RowParams {
@@ -55,6 +52,8 @@ typedef struct RowParams {
55
52
  uint32_t istep;
56
53
  } RowParams;
57
54
 
55
+ typedef uint32_t (FnFindKOptimal)(State);
56
+
58
57
  typedef struct {
59
58
  LDouble mean;
60
59
  LDouble variance;
@@ -62,7 +61,6 @@ typedef struct {
62
61
 
63
62
  VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
64
63
  VALUE rb_ckmedian_sorted_group_sizes(VALUE self);
65
- VALUE rb_sorted_group_sizes(VALUE self, FnDissim*);
66
64
 
67
65
  Arena *arena_create(size_t);
68
66
  void *arena_alloc(Arena*, size_t);
@@ -89,17 +87,11 @@ uint32_t vector_get_i(VectorI*, uint32_t offset);
89
87
  void vector_downsize_i(VectorI*, uint32_t);
90
88
  void vector_inspect_i(VectorI*);
91
89
 
92
- LDouble dissimilarity_l2(uint32_t, uint32_t, VectorF*, VectorF*);
93
- LDouble dissimilarity_l1(uint32_t, uint32_t, VectorF*, VectorF*);
94
- void fill_row(State, uint32_t, uint32_t, uint32_t);
95
- void smawk(State, RowParams, VectorI*);
96
- void find_min_from_candidates(State, RowParams, VectorI*);
97
- VectorI *prune_candidates(State, RowParams, VectorI*);
98
- void fill_even_positions(State, RowParams, VectorI*);
99
90
  SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
100
91
  VectorI *backtrack_sizes(State, VectorI*, uint32_t);
101
92
  uint32_t find_koptimal_fast(State);
102
93
  uint32_t find_koptimal_gmm(State);
94
+ uint32_t find_koptimal_lmm(State);
103
95
 
104
96
  void Init_extensions(void) {
105
97
  VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
@@ -115,96 +107,34 @@ void Init_extensions(void) {
115
107
  # define ALLOCATION_FACTOR 3
116
108
  # define PIx2 (M_PI * 2.0)
117
109
 
110
+ #include "dissimilarity.h"
111
+
112
+ /* L2-specific versions of all hot-path functions */
113
+ #define DISSIM_SUFFIX l2
114
+ #define DISSIM(j, i, xsum, xsumsq) dissimilarity_l2(j, i, xsum, xsumsq)
115
+ #include "algorithm.inc"
116
+ #undef DISSIM
117
+ #undef DISSIM_SUFFIX
118
+
119
+ /* L1-specific versions of all hot-path functions */
120
+ #define DISSIM_SUFFIX l1
121
+ #define DISSIM(j, i, xsum, xsumsq) dissimilarity_l1(j, i, xsum, xsumsq)
122
+ #include "algorithm.inc"
123
+ #undef DISSIM
124
+ #undef DISSIM_SUFFIX
125
+
118
126
  VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
119
127
  {
120
- return rb_sorted_group_sizes(self, dissimilarity_l2);
128
+ bool use_stable = RTEST(rb_iv_get(self, "@use_stable_estimation"));
129
+ FnFindKOptimal *find_k = use_stable ? find_koptimal_gmm : find_koptimal_fast;
130
+ return rb_sorted_group_sizes_l2(self, find_k);
121
131
  }
122
132
 
123
133
  VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
124
134
  {
125
- return rb_sorted_group_sizes(self, dissimilarity_l1);
126
- }
127
-
128
- VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
129
- {
130
- uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
131
- uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
132
- uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
133
- bool use_gmm = RTEST(rb_iv_get(self, "@use_gmm"));
134
- VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
135
- size_t capacity = sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
136
- Arena *arena = arena_create(capacity);
137
-
138
- if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
139
-
140
- MatrixF *cost = matrix_create_f(arena, kmax, xcount);
141
- MatrixI *splits = matrix_create_i(arena, kmax, xcount);
142
- VectorF *xsorted = vector_create_f(arena, xcount);
143
- VectorF *xsum = vector_create_f(arena, xcount);
144
- VectorF *xsumsq = vector_create_f(arena, xcount);
145
-
146
- for (uint32_t i = 0; i < xcount; i++) {
147
- LDouble xi = NUM2DBL(rb_ary_entry(rb_xsorted, i));
148
- vector_set_f(xsorted, i, xi);
149
- }
150
-
151
- State state = {
152
- .arena = arena,
153
- .xcount = xcount,
154
- .kmin = kmin,
155
- .kmax = kmax,
156
- .xsorted = xsorted,
157
- .cost = cost,
158
- .splits = splits,
159
- .xsum = xsum,
160
- .xsumsq = xsumsq,
161
- .dissim = criteria
162
- };
163
-
164
-
165
- LDouble shift = vector_get_f(xsorted, xcount / 2);
166
- LDouble diff_initial = vector_get_f(xsorted, 0) - shift;
167
-
168
- vector_set_f(xsum, 0, diff_initial);
169
- vector_set_f(xsumsq, 0, diff_initial * diff_initial);
170
-
171
- for (uint32_t i = 1; i < xcount; i++) {
172
- LDouble xi = vector_get_f(xsorted, i);
173
- LDouble xsum_prev = vector_get_f(xsum, i - 1);
174
- LDouble xsumsq_prev = vector_get_f(xsumsq, i - 1);
175
- LDouble diff = xi - shift;
176
-
177
- vector_set_f(xsum, i, xsum_prev + diff);
178
- vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
179
- matrix_set_f(cost, 0, i, criteria(0, i, xsum, xsumsq));
180
- matrix_set_i(splits, 0, i, 0);
181
- }
182
-
183
- for (uint32_t q = 1; q <= kmax - 1; q++) {
184
- uint32_t imin = (q < kmax - 1) ? ((q > 1) ? q : 1) : xcount - 1;
185
- fill_row(state, q, imin, xcount - 1);
186
- }
187
-
188
- uint32_t koptimal = use_gmm ? find_koptimal_gmm(state) : find_koptimal_fast(state);
189
-
190
- VectorI *sizes = vector_create_i(arena, koptimal);
191
- backtrack_sizes(state, sizes, koptimal);
192
-
193
- /* printf("XSORTED \t"); vector_inspect_f(xsorted); */
194
- /* printf("K OPTIMAL: %lld\n", koptimal); */
195
- /* printf("SIZES \t"); vector_inspect_i(sizes); */
196
- /* printf("FINAL COST\n"); matrix_inspect_f(cost); */
197
- /* printf("FINAL SPLITS\n"); matrix_inspect_i(splits); */
198
-
199
- VALUE response = rb_ary_new2(sizes->size);
200
- for (uint32_t i = 0; i < sizes->size; i++) {
201
- VALUE size = LONG2NUM(vector_get_i(sizes, i));
202
- rb_ary_store(response, i, size);
203
- }
204
-
205
- arena_destroy(arena);
206
-
207
- return response;
135
+ bool use_stable = RTEST(rb_iv_get(self, "@use_stable_estimation"));
136
+ FnFindKOptimal *find_k = use_stable ? find_koptimal_lmm : find_koptimal_fast;
137
+ return rb_sorted_group_sizes_l1(self, find_k);
208
138
  }
209
139
 
210
140
  uint32_t find_koptimal_fast(State state)
@@ -372,292 +302,162 @@ uint32_t find_koptimal_gmm(State state)
372
302
  return kopt;
373
303
  }
374
304
 
375
- VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
305
+ uint32_t find_koptimal_lmm(State state)
376
306
  {
377
- MatrixI *splits = state.splits;
307
+ uint32_t kmin = state.kmin;
308
+ uint32_t kmax = state.kmax;
378
309
  uint32_t xcount = state.xcount;
379
- uint32_t right = xcount - 1;
380
- uint32_t left = 0;
381
310
 
382
- // Common case works with `i` remaining unsigned and unconditional assignment of the next `left` and `right`
383
- for (uint32_t i = k - 1; i > 0; i--, right = left - 1) {
384
- left = matrix_get_i(splits, i, right);
385
- vector_set_i(sizes, i, right - left + 1);
311
+ if (kmin > kmax || xcount < 2) {
312
+ return (kmin < kmax) ? kmin : kmax;
386
313
  }
387
- // Special case outside of the loop removing the need for conditionals
388
- left = matrix_get_i(splits, 0, right);
389
- vector_set_i(sizes, 0, right - left + 1);
390
314
 
391
- return sizes;
392
- }
315
+ Arena *arena = state.arena;
316
+ VectorF *xsorted = state.xsorted;
317
+ uint32_t kopt = kmin;
318
+ LDouble max_bic = 0.0;
319
+ LDouble log_xcount = log((LDouble) xcount);
320
+ VectorF *lambda = vector_create_f(arena, kmax);
321
+ VectorF *mu = vector_create_f(arena, kmax); /* median */
322
+ VectorF *scale = vector_create_f(arena, kmax); /* MAD (mean absolute deviation) */
323
+ VectorF *coeff = vector_create_f(arena, kmax);
324
+ VectorI *sizes = vector_create_i(arena, kmax);
393
325
 
394
- SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t right)
395
- {
396
- const uint32_t n = right - left + 1;
397
- LDouble sum = 0.0;
398
- LDouble sumsq = 0.0;
399
- SegmentStats stats = { .mean = 0.0, .variance = 0.0 };
326
+ for (uint32_t kouter = kmin; kouter <= kmax; ++kouter)
327
+ {
328
+ uint32_t ileft = 0;
329
+ uint32_t iright;
400
330
 
401
- if (right >= left) {
402
- const LDouble median = vector_get_f(xsorted, (left + right) / 2);
331
+ backtrack_sizes(state, sizes, kouter);
403
332
 
404
- for (uint32_t i = left; i <= right; i++) {
405
- const LDouble sumi = vector_get_f(xsorted, i) - median;
333
+ for (uint32_t k = 0; k < kouter; ++k)
334
+ {
335
+ uint32_t size = vector_get_i(sizes, k);
336
+ vector_set_f(lambda, k, size / (LDouble) xcount);
337
+ iright = ileft + size - 1;
406
338
 
407
- sum += sumi;
408
- sumsq += sumi * sumi;
409
- }
339
+ uint32_t median_idx = (ileft + iright) / 2;
340
+ LDouble median;
341
+ if ((size % 2) == 1) {
342
+ median = vector_get_f(xsorted, median_idx);
343
+ } else {
344
+ median = (vector_get_f(xsorted, median_idx) + vector_get_f(xsorted, median_idx + 1)) / 2.0;
345
+ }
346
+ vector_set_f(mu, k, median);
410
347
 
411
- stats.mean = (sum / n) + median;
412
- if (n > 1) {
413
- stats.variance = (sumsq - (sum * sum / n)) / (n - 1);
414
- }
415
- }
348
+ LDouble mad = 0.0;
349
+ for (uint32_t i = ileft; i <= iright; ++i) {
350
+ LDouble xi = vector_get_f(xsorted, i);
351
+ mad += fabs(xi - median);
352
+ }
353
+ mad = mad / size;
354
+ vector_set_f(scale, k, mad);
416
355
 
417
- return stats;
418
- }
356
+ /* Handle edge case: MAD = 0 (all points are the same) or size = 1 */
357
+ if (mad == 0 || size == 1) {
358
+ LDouble dmin;
419
359
 
420
- void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax)
421
- {
422
- uint32_t size = imax - q + 1;
423
- VectorI *split_candidates = vector_create_i(state.arena, size);
424
- for (uint32_t i = 0; i < size; i++) {
425
- vector_set_i(split_candidates, i, q + i);
426
- }
427
- RowParams rparams = { .row = q, .imin = imin, .imax = imax, .istep = 1 };
428
- smawk(state, rparams, split_candidates);
429
- }
360
+ if (ileft > 0 && iright < xcount - 1) {
361
+ LDouble left_diff = vector_get_diff_f(xsorted, ileft, ileft - 1);
362
+ LDouble right_diff = vector_get_diff_f(xsorted, iright + 1, iright);
430
363
 
431
- void smawk(State state, RowParams rparams, VectorI *split_candidates)
432
- {
433
- const uint32_t imin = rparams.imin;
434
- const uint32_t imax = rparams.imax;
435
- const uint32_t istep = rparams.istep;
436
-
437
- if ((imax - imin) <= (0 * istep)) {
438
- find_min_from_candidates(state, rparams, split_candidates);
439
- } else {
440
- VectorI *odd_candidates = prune_candidates(state, rparams, split_candidates);
441
- /* printf("PRUNED\t"); vector_inspect_i(odd_candidates); */
442
- uint32_t istepx2 = istep * 2;
443
- uint32_t imin_odd = imin + istep;
444
- uint32_t imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2);
445
- RowParams rparams_odd = { .row = rparams.row, .imin = imin_odd, .imax = imax_odd, .istep = istepx2 };
446
-
447
- smawk(state, rparams_odd, odd_candidates);
448
- fill_even_positions(state, rparams, split_candidates);
449
- }
450
- }
364
+ dmin = (left_diff < right_diff) ? left_diff : right_diff;
365
+ } else if (ileft > 0) {
366
+ dmin = vector_get_diff_f(xsorted, ileft, ileft - 1);
367
+ } else {
368
+ dmin = vector_get_diff_f(xsorted, iright + 1, iright);
369
+ }
451
370
 
452
- inline void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
453
- {
454
- uint32_t row = rparams.row;
455
- uint32_t imin = rparams.imin;
456
- uint32_t imax = rparams.imax;
457
- uint32_t istep = rparams.istep;
458
- uint32_t n = split_candidates->size;
459
- uint32_t istepx2 = istep * 2;
460
- uint32_t jl = vector_get_i(split_candidates, 0);
461
- VectorF *const xsum = state.xsum;
462
- VectorF *const xsumsq = state.xsumsq;
463
- MatrixI *const splits = state.splits;
464
- FnDissim *const dissim = state.dissim;
465
-
466
- for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
467
- while (vector_get_i(split_candidates, r) < jl) r++;
468
-
469
- uint32_t rcandidate = vector_get_i(split_candidates, r);
470
- uint32_t cost_base_row = row - 1;
471
- uint32_t cost_base_col = rcandidate - 1;
472
- LDouble cost =
473
- matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissim(rcandidate, i, xsum, xsumsq);
474
-
475
- matrix_set_f(state.cost, row, i, cost);
476
- matrix_set_i(state.splits, row, i, rcandidate);
477
-
478
- uint32_t jh =
479
- (i + istep) <= imax
480
- ? matrix_get_i(splits, row, i + istep)
481
- : vector_get_i(split_candidates, n - 1);
482
-
483
- uint32_t jmax = jh < i ? jh : i;
484
- LDouble sjimin = dissim(jmax, i, xsum, xsumsq);
485
-
486
- for (++r; r < n && vector_get_i(split_candidates, r) <= jmax; r++) {
487
- uint32_t jabs = vector_get_i(split_candidates, r);
488
-
489
- if (jabs > i) break;
490
- if (jabs < matrix_get_i(splits, row - 1, i)) continue;
491
-
492
- LDouble cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
493
- LDouble sj = cost_base + dissim(jabs, i, xsum, xsumsq);
494
- LDouble cost_prev = matrix_get_f(state.cost, row, i);
495
-
496
- if (sj <= cost_prev) {
497
- matrix_set_f(state.cost, row, i, sj);
498
- matrix_set_i(state.splits, row, i, jabs);
499
- } else if (cost_base + sjimin > cost_prev) {
500
- break;
371
+ if (mad == 0) vector_set_f(scale, k, dmin / 6.0);
372
+ if (size == 1) vector_set_f(scale, k, dmin);
501
373
  }
502
- }
503
374
 
504
- r--;
505
- jl = jh;
506
- }
507
- }
508
-
509
- inline void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
510
- {
511
- const uint32_t row = rparams.row;
512
- const uint32_t imin = rparams.imin;
513
- const uint32_t imax = rparams.imax;
514
- const uint32_t istep = rparams.istep;
515
- MatrixF *const cost = state.cost;
516
- MatrixI *const splits = state.splits;
517
- FnDissim *const dissim = state.dissim;
518
-
519
- uint32_t optimal_split_idx_prev = 0;
520
-
521
- for (uint32_t i = imin; i <= imax; i += istep)
522
- {
523
- const uint32_t optimal_split_idx = optimal_split_idx_prev;
524
- const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
525
- const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
526
- const LDouble added_cost = dissim(optimal_split, i, state.xsum, state.xsumsq);
375
+ /* Laplace coefficient: lambda_k / (2 * b_k) */
376
+ LDouble lambda_k = vector_get_f(lambda, k);
377
+ LDouble scale_k = vector_get_f(scale, k);
378
+ vector_set_f(coeff, k, lambda_k / (2.0 * scale_k));
379
+ ileft = iright + 1;
380
+ }
527
381
 
528
- matrix_set_f(cost, row, i, cost_prev + added_cost);
529
- matrix_set_i(splits, row, i, optimal_split);
382
+ LDouble loglikelihood = 0.0;
530
383
 
531
- for (uint32_t r = optimal_split_idx + 1; r < split_candidates->size; r++)
384
+ for (uint32_t i = 0; i < xcount; ++i)
532
385
  {
533
- uint32_t split = vector_get_i(split_candidates, r);
534
-
535
- if (split < matrix_get_i(splits, row - 1, i)) continue;
536
- if (split > i) break;
537
-
538
- LDouble split_cost =
539
- matrix_get_f(cost, row - 1, split - 1) + dissim(split, i, state.xsum, state.xsumsq);
540
-
541
- if (split_cost > matrix_get_f(cost, row, i)) continue;
386
+ LDouble L = 0.0;
387
+ LDouble xi = vector_get_f(xsorted, i);
542
388
 
543
- matrix_set_f(cost, row, i, split_cost);
544
- matrix_set_i(splits, row, i, split);
545
- optimal_split_idx_prev = r;
389
+ for (uint32_t k = 0; k < kouter; ++k)
390
+ {
391
+ LDouble coeff_k = vector_get_f(coeff, k);
392
+ LDouble mu_k = vector_get_f(mu, k);
393
+ LDouble scale_k = vector_get_f(scale, k);
394
+ LDouble x_mu_abs = fabs(xi - mu_k);
395
+ /* Laplace PDF: (1/(2b)) * exp(-|x-μ|/b) */
396
+ L += coeff_k * exp(-x_mu_abs / scale_k);
397
+ }
398
+ loglikelihood += log(L);
546
399
  }
547
- }
548
- }
549
-
550
- inline VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
551
- {
552
- uint32_t imin = rparams.imin;
553
- uint32_t row = rparams.row;
554
- uint32_t istep = rparams.istep;
555
- uint32_t n = ((rparams.imax - imin) / istep) + 1;
556
- uint32_t m = split_candidates->size;
557
400
 
558
- if (n >= m) return split_candidates;
559
-
560
- uint32_t left = 0;
561
- uint32_t right = 0;
562
- VectorI *pruned = vector_dup_i(split_candidates, state.arena);
563
- FnDissim *const dissim = state.dissim;
401
+ /* BIC = 2*logL - (3k-1)*log(n) */
402
+ /* Parameters: k-1 mixing proportions + k medians + k scales = 3k-1 */
403
+ LDouble bic = 2 * loglikelihood - (3 * kouter - 1) * log_xcount;
564
404
 
565
- while (m > n)
566
- {
567
- uint32_t i = imin + left * istep;
568
- uint32_t j = vector_get_i(pruned, right);
569
- uint32_t jnext = vector_get_i(pruned, right + 1);
570
- LDouble sl =
571
- matrix_get_f(state.cost, row - 1, j - 1) + dissim(j, i, state.xsum, state.xsumsq);
572
- LDouble snext =
573
- matrix_get_f(state.cost, row - 1, jnext - 1) + dissim(jnext, i, state.xsum, state.xsumsq);
574
-
575
- if ((sl < snext) && (left < n - 1)) {
576
- vector_set_i(pruned, left, j);
577
- left++;
578
- right++;
579
- } else if ((sl < snext) && (left == n - 1)) {
580
- right++;
581
- m--;
582
- vector_set_i(pruned, right, j);
405
+ if (kouter == kmin) {
406
+ max_bic = bic;
407
+ kopt = kmin;
583
408
  } else {
584
- if (left > 0) {
585
- vector_set_i(pruned, right, vector_get_i(pruned, --left));
586
- } else {
587
- right++;
409
+ if (bic > max_bic) {
410
+ max_bic = bic;
411
+ kopt = kouter;
588
412
  }
589
-
590
- m--;
591
413
  }
592
414
  }
593
-
594
- for (uint32_t i = left; i < m; i++) {
595
- vector_set_i(pruned, i, vector_get_i(pruned, right++));
596
- }
597
-
598
- vector_downsize_i(pruned, m);
599
-
600
- return pruned;
415
+ return kopt;
601
416
  }
602
417
 
603
- /* L2 aka Euclidean aka Mean dissimilarity criteria */
604
- inline LDouble dissimilarity_l2(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq) {
605
- LDouble sji = 0.0;
606
-
607
- if (j >= i) return sji;
418
+ VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
419
+ {
420
+ MatrixI *splits = state.splits;
421
+ uint32_t xcount = state.xcount;
422
+ uint32_t right = xcount - 1;
423
+ uint32_t left = 0;
608
424
 
609
- if (j > 0) {
610
- LDouble segment_diff = vector_get_diff_f(xsum, i, j - 1);
611
- uint32_t segment_size = i - j + 1;
612
- sji = vector_get_diff_f(xsumsq, i, j - 1) - (segment_diff * segment_diff / segment_size);
613
- } else {
614
- LDouble xsumi = vector_get_f(xsum, i);
615
- sji = vector_get_f(xsumsq, i) - (xsumi * xsumi / (i + 1));
425
+ /* Common case works with `i` remaining unsigned and unconditional assignment of the next `left` and `right` */
426
+ for (uint32_t i = k - 1; i > 0; i--, right = left - 1) {
427
+ left = matrix_get_i(splits, i, right);
428
+ vector_set_i(sizes, i, right - left + 1);
616
429
  }
430
+ /* Special case outside of the loop removing the need for conditionals */
431
+ left = matrix_get_i(splits, 0, right);
432
+ vector_set_i(sizes, 0, right - left + 1);
617
433
 
618
- return (sji > 0) ? sji : 0.0;
434
+ return sizes;
619
435
  }
620
436
 
621
- /* L1 aka Manhattan aka Median dissimilarity criteria */
622
- inline LDouble dissimilarity_l1(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict _xsumsq)
437
+ SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t right)
623
438
  {
624
- LDouble sji = 0.0;
439
+ const uint32_t n = right - left + 1;
440
+ LDouble sum = 0.0;
441
+ LDouble sumsq = 0.0;
442
+ SegmentStats stats = { .mean = 0.0, .variance = 0.0 };
625
443
 
626
- if (j >= i) return sji;
444
+ if (right >= left) {
445
+ const LDouble median = vector_get_f(xsorted, (left + right) / 2);
627
446
 
628
- if (j > 0) {
629
- uint32_t median_idx = (i + j) >> 1;
447
+ for (uint32_t i = left; i <= right; i++) {
448
+ const LDouble sumi = vector_get_f(xsorted, i) - median;
630
449
 
631
- if (((i - j + 1) % 2) == 1) {
632
- sji =
633
- - vector_get_f(xsum, median_idx - 1)
634
- + vector_get_f(xsum, j - 1)
635
- + vector_get_f(xsum, i)
636
- - vector_get_f(xsum, median_idx);
637
- } else {
638
- sji =
639
- - vector_get_f(xsum, median_idx)
640
- + vector_get_f(xsum, j - 1)
641
- + vector_get_f(xsum, i)
642
- - vector_get_f(xsum, median_idx);
450
+ sum += sumi;
451
+ sumsq += sumi * sumi;
643
452
  }
644
- } else { // j == 0
645
- uint32_t median_idx = i >> 1;
646
-
647
- if (((i + 1) % 2) == 1) {
648
- sji =
649
- - vector_get_f(xsum, median_idx - 1)
650
- + vector_get_f(xsum, i)
651
- - vector_get_f(xsum, median_idx);
652
- } else {
653
- sji =
654
- - vector_get_f(xsum, median_idx)
655
- + vector_get_f(xsum, i)
656
- - vector_get_f(xsum, median_idx);
453
+
454
+ stats.mean = (sum / n) + median;
455
+ if (n > 1) {
456
+ stats.variance = (sumsq - (sum * sum / n)) / (n - 1);
657
457
  }
658
458
  }
659
459
 
660
- return (sji < 0) ? 0.0 : sji;
460
+ return stats;
661
461
  }
662
462
 
663
463
  inline VectorF *vector_create_f(Arena *arena, uint32_t size) {
@@ -1,19 +1,37 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- class Clusterer # rubocop:disable Style/Documentation
4
+ # Optimal k-means clustering for univariate (1D) data using dynamic programming.
5
+ # Minimizes within-cluster sum of squared distances (L2 norm).
6
+ class Clusterer
7
+ # Creates a new Ckmeans clusterer.
8
+ #
9
+ # @param entries [Array<Numeric>] The data points to cluster
10
+ # @param kmin [Integer] Minimum number of clusters to consider
11
+ # @param kmax [Integer] Maximum number of clusters to consider (defaults to kmin for fixed K)
12
+ # @param kestimate [Symbol] Method for estimating optimal K:
13
+ # - :fast - Quick heuristic using implicit Gaussian assumption (best for large datasets)
14
+ # - :stable - Model-based estimation using Gaussian Mixture Model (better for duplicates/edge cases)
15
+ # - :gmm - Alias for :stable (Gaussian Mixture Model)
16
+ #
17
+ # @example Fixed number of clusters
18
+ # Ckmeans::Clusterer.new([1, 2, 3, 100, 101], 2).clusters
19
+ # # => [[1, 2, 3], [100, 101]]
20
+ #
21
+ # @example Automatic K selection with stable estimation
22
+ # Ckmeans::Clusterer.new([1, 1, 1, 5, 5, 5, 10, 10, 10], 1, 5, :stable).clusters
5
23
  def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
6
24
  @xcount = entries.size
7
25
 
8
26
  raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
9
27
  raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
10
28
 
11
- @kmin = kmin
12
- @unique_xcount = entries.uniq.size
13
- @kmax = [@unique_xcount, kmax].min
14
- @xsorted_original = entries.sort
15
- @xsorted = @xsorted_original.map(&:to_f)
16
- @use_gmm = kestimate == :gmm
29
+ @kmin = kmin
30
+ @unique_xcount = entries.uniq.size
31
+ @kmax = [@unique_xcount, kmax].min
32
+ @xsorted_original = entries.sort
33
+ @xsorted = @xsorted_original.map(&:to_f)
34
+ @use_stable_estimation = %i[gmm stable].include?(kestimate)
17
35
  end
18
36
 
19
37
  def clusters
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- VERSION = "2.0.0"
4
+ VERSION = "2.1.1"
5
5
  end
@@ -1,18 +1,39 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmedian
4
- class Clusterer # rubocop:disable Style/Documentation
5
- def initialize(entries, kmin, kmax = kmin)
4
+ # Optimal k-median clustering for univariate (1D) data using dynamic programming.
5
+ # Minimizes within-cluster sum of absolute deviations (L1 norm).
6
+ # More robust to outliers than k-means.
7
+ class Clusterer
8
+ # Creates a new Ckmedian clusterer.
9
+ #
10
+ # @param entries [Array<Numeric>] The data points to cluster
11
+ # @param kmin [Integer] Minimum number of clusters to consider
12
+ # @param kmax [Integer] Maximum number of clusters to consider (defaults to kmin for fixed K)
13
+ # @param kestimate [Symbol] Method for estimating optimal K:
14
+ # - :fast - Quick heuristic using implicit Gaussian assumption (best for large datasets)
15
+ # - :stable - Model-based estimation using Laplace Mixture Model (better for outliers/bursts)
16
+ # - :lmm - Alias for :stable (Laplace Mixture Model)
17
+ #
18
+ # @example Fixed number of clusters
19
+ # Ckmedian::Clusterer.new([1, 2, 3, 100, 101], 2).clusters
20
+ # # => [[1, 2, 3], [100, 101]]
21
+ #
22
+ # @example Photo timeline clustering (robust to bursts and outliers)
23
+ # timestamps = photos.map(&:taken_at).map(&:to_i)
24
+ # Ckmedian::Clusterer.new(timestamps, 1, 20, :stable).clusters
25
+ def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
6
26
  @xcount = entries.size
7
27
 
8
28
  raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
9
29
  raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
10
30
 
11
- @kmin = kmin
12
- @unique_xcount = entries.uniq.size
13
- @kmax = [@unique_xcount, kmax].min
14
- @xsorted_original = entries.sort
15
- @xsorted = @xsorted_original.map(&:to_f)
31
+ @kmin = kmin
32
+ @unique_xcount = entries.uniq.size
33
+ @kmax = [@unique_xcount, kmax].min
34
+ @xsorted_original = entries.sort
35
+ @xsorted = @xsorted_original.map(&:to_f)
36
+ @use_stable_estimation = %i[lmm stable].include?(kestimate)
16
37
  end
17
38
 
18
39
  def clusters
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ckmeans
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Lebedev
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-06-09 00:00:00.000000000 Z
11
+ date: 2025-12-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Repeatable clustering of unidimensional data
14
14
  email:
@@ -28,6 +28,8 @@ files:
28
28
  - LICENSE
29
29
  - README.md
30
30
  - Rakefile
31
+ - ext/ckmeans/algorithm.inc
32
+ - ext/ckmeans/dissimilarity.h
31
33
  - ext/ckmeans/extconf.rb
32
34
  - ext/ckmeans/extensions.c
33
35
  - lib/ckmeans.rb