ckmeans 1.1.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +41 -8
- data/ext/ckmeans/extensions.c +248 -37
- data/lib/ckmeans/clusterer.rb +26 -8
- data/lib/ckmeans/version.rb +1 -1
- data/lib/ckmedian/clusterer.rb +23 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '08439cddf5410f952a06263d423297de219c86927dea2d0c4538916d9d0c70fd'
|
|
4
|
+
data.tar.gz: 94b1cece717f8538945208519c94046881f2d42c7ea9ba9bd453d063eef39878
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 955d8b19a2a33d46b6adebfa05c9460784f5838d1a5d6d8c82e447be2eacb95da3a2711aebacf96429bf36fafe4d46dfc98fe87025cce4373c734c1cc6a60fb9
|
|
7
|
+
data.tar.gz: 8887346eb2602c071923ade05a549a0eb3df44491538fb79155bd766429dde0084847231e1a4f2dcf6e5e3c14f6dca309f2399339abe378becd4b21d8bc70aa7
|
data/README.md
CHANGED
|
@@ -18,19 +18,52 @@ gem install ckmeans
|
|
|
18
18
|
|
|
19
19
|
## Usage
|
|
20
20
|
|
|
21
|
+
### Basic Clustering
|
|
22
|
+
|
|
21
23
|
```rb
|
|
22
|
-
# Fixed cluster count
|
|
23
|
-
Ckmeans::Clusterer(data,
|
|
24
|
-
Ckmedian::Clusterer(data,
|
|
24
|
+
# Fixed cluster count (K known in advance)
|
|
25
|
+
Ckmeans::Clusterer.new(data, 3).clusters
|
|
26
|
+
Ckmedian::Clusterer.new(data, 3).clusters
|
|
27
|
+
|
|
28
|
+
# Automatic K selection (tries K from kmin to kmax, picks optimal)
|
|
29
|
+
Ckmeans::Clusterer.new(data, 1, 10).clusters
|
|
30
|
+
Ckmedian::Clusterer.new(data, 1, 10).clusters
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Choosing Between Ckmeans and Ckmedian
|
|
25
34
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
Ckmedian::Clusterer(data, kmin, kmax).clusters
|
|
35
|
+
- **Ckmeans** - Minimizes squared distances (L2). Good for normally distributed data.
|
|
36
|
+
- **Ckmedian** - Minimizes absolute distances (L1). More robust to outliers and data bursts.
|
|
29
37
|
|
|
30
|
-
|
|
31
|
-
|
|
38
|
+
```rb
|
|
39
|
+
# For clean numerical data
|
|
40
|
+
temperatures = [20.1, 20.2, 25.5, 25.6, 30.1, 30.2]
|
|
41
|
+
Ckmeans::Clusterer.new(temperatures, 1, 5).clusters
|
|
42
|
+
# => [[20.1, 20.2], [25.5, 25.6], [30.1, 30.2]]
|
|
43
|
+
|
|
44
|
+
# For data with outliers (e.g., photo timestamps with bursts)
|
|
45
|
+
timestamps = photos.map(&:taken_at).map(&:to_i)
|
|
46
|
+
Ckmedian::Clusterer.new(timestamps, 1, 20).clusters
|
|
32
47
|
```
|
|
33
48
|
|
|
49
|
+
### Stable Estimation (Recommended for Edge Cases)
|
|
50
|
+
|
|
51
|
+
By default, both algorithms use a fast heuristic for estimating K. For datasets with many duplicates, tight clusters, or outliers, use `:stable` for more robust estimation:
|
|
52
|
+
|
|
53
|
+
```rb
|
|
54
|
+
# Stable estimation (uses statistical mixture models)
|
|
55
|
+
Ckmeans::Clusterer.new(data, 1, 10, :stable).clusters
|
|
56
|
+
Ckmedian::Clusterer.new(data, 1, 10, :stable).clusters
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
**When to use `:stable`:**
|
|
60
|
+
- Small to medium datasets (< 1000 points)
|
|
61
|
+
- Many duplicate values
|
|
62
|
+
- Clusters with very different sizes
|
|
63
|
+
- Photo/event timeline clustering (bursts and gaps)
|
|
64
|
+
|
|
65
|
+
**Expert users:** `:stable` is an alias for `:gmm` (Gaussian Mixture Model) in Ckmeans and `:lmm` (Laplace Mixture Model) in Ckmedian.
|
|
66
|
+
|
|
34
67
|
## License
|
|
35
68
|
|
|
36
69
|
The gem is available as open source under the terms of the [LGPL v3 License](https://opensource.org/license/lgpl-3-0).
|
data/ext/ckmeans/extensions.c
CHANGED
|
@@ -39,7 +39,6 @@ typedef struct State {
|
|
|
39
39
|
uint32_t xcount;
|
|
40
40
|
uint32_t kmin;
|
|
41
41
|
uint32_t kmax;
|
|
42
|
-
bool apply_deviation;
|
|
43
42
|
Arena *arena;
|
|
44
43
|
VectorF *xsorted;
|
|
45
44
|
MatrixF *cost;
|
|
@@ -56,6 +55,8 @@ typedef struct RowParams {
|
|
|
56
55
|
uint32_t istep;
|
|
57
56
|
} RowParams;
|
|
58
57
|
|
|
58
|
+
typedef uint32_t (FnFindKOptimal)(State);
|
|
59
|
+
|
|
59
60
|
typedef struct {
|
|
60
61
|
LDouble mean;
|
|
61
62
|
LDouble variance;
|
|
@@ -63,7 +64,7 @@ typedef struct {
|
|
|
63
64
|
|
|
64
65
|
VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
|
|
65
66
|
VALUE rb_ckmedian_sorted_group_sizes(VALUE self);
|
|
66
|
-
VALUE rb_sorted_group_sizes(VALUE self, FnDissim*);
|
|
67
|
+
VALUE rb_sorted_group_sizes(VALUE self, FnDissim*, FnFindKOptimal*);
|
|
67
68
|
|
|
68
69
|
Arena *arena_create(size_t);
|
|
69
70
|
void *arena_alloc(Arena*, size_t);
|
|
@@ -99,8 +100,9 @@ VectorI *prune_candidates(State, RowParams, VectorI*);
|
|
|
99
100
|
void fill_even_positions(State, RowParams, VectorI*);
|
|
100
101
|
SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
|
|
101
102
|
VectorI *backtrack_sizes(State, VectorI*, uint32_t);
|
|
102
|
-
uint32_t find_koptimal(State);
|
|
103
|
-
|
|
103
|
+
uint32_t find_koptimal_fast(State);
|
|
104
|
+
uint32_t find_koptimal_gmm(State);
|
|
105
|
+
uint32_t find_koptimal_lmm(State);
|
|
104
106
|
|
|
105
107
|
void Init_extensions(void) {
|
|
106
108
|
VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
|
|
@@ -118,23 +120,26 @@ void Init_extensions(void) {
|
|
|
118
120
|
|
|
119
121
|
VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
|
|
120
122
|
{
|
|
121
|
-
|
|
123
|
+
bool use_gmm = RTEST(rb_iv_get(self, "@use_gmm"));
|
|
124
|
+
FnFindKOptimal *find_k = use_gmm ? find_koptimal_gmm : find_koptimal_fast;
|
|
125
|
+
return rb_sorted_group_sizes(self, dissimilarity_l2, find_k);
|
|
122
126
|
}
|
|
123
127
|
|
|
124
128
|
VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
|
|
125
129
|
{
|
|
126
|
-
|
|
130
|
+
bool use_lmm = RTEST(rb_iv_get(self, "@use_lmm"));
|
|
131
|
+
FnFindKOptimal *find_k = use_lmm ? find_koptimal_lmm : find_koptimal_fast;
|
|
132
|
+
return rb_sorted_group_sizes(self, dissimilarity_l1, find_k);
|
|
127
133
|
}
|
|
128
134
|
|
|
129
|
-
VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
|
|
135
|
+
VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria, FnFindKOptimal *find_koptimal)
|
|
130
136
|
{
|
|
131
|
-
uint32_t xcount
|
|
132
|
-
uint32_t kmin
|
|
133
|
-
uint32_t kmax
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
Arena *arena = arena_create(capacity);
|
|
137
|
+
uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
|
|
138
|
+
uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
|
|
139
|
+
uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
|
|
140
|
+
VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
|
|
141
|
+
size_t capacity = sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
|
|
142
|
+
Arena *arena = arena_create(capacity);
|
|
138
143
|
|
|
139
144
|
if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
|
|
140
145
|
|
|
@@ -150,17 +155,16 @@ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
|
|
|
150
155
|
}
|
|
151
156
|
|
|
152
157
|
State state = {
|
|
153
|
-
.arena
|
|
154
|
-
.xcount
|
|
155
|
-
.kmin
|
|
156
|
-
.kmax
|
|
157
|
-
.
|
|
158
|
-
.
|
|
159
|
-
.
|
|
160
|
-
.
|
|
161
|
-
.
|
|
162
|
-
.
|
|
163
|
-
.dissim = criteria
|
|
158
|
+
.arena = arena,
|
|
159
|
+
.xcount = xcount,
|
|
160
|
+
.kmin = kmin,
|
|
161
|
+
.kmax = kmax,
|
|
162
|
+
.xsorted = xsorted,
|
|
163
|
+
.cost = cost,
|
|
164
|
+
.splits = splits,
|
|
165
|
+
.xsum = xsum,
|
|
166
|
+
.xsumsq = xsumsq,
|
|
167
|
+
.dissim = criteria
|
|
164
168
|
};
|
|
165
169
|
|
|
166
170
|
|
|
@@ -209,7 +213,7 @@ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
|
|
|
209
213
|
return response;
|
|
210
214
|
}
|
|
211
215
|
|
|
212
|
-
uint32_t find_koptimal(State state)
|
|
216
|
+
uint32_t find_koptimal_fast(State state)
|
|
213
217
|
{
|
|
214
218
|
uint32_t kmin = state.kmin;
|
|
215
219
|
uint32_t kmax = state.kmax;
|
|
@@ -256,8 +260,7 @@ uint32_t find_koptimal(State state)
|
|
|
256
260
|
loglikelihood += -(xi - mean) * (xi - mean) / (2.0 * variance);
|
|
257
261
|
}
|
|
258
262
|
loglikelihood += npoints * (
|
|
259
|
-
|
|
260
|
-
(0.5 * log(PIx2 * variance))
|
|
263
|
+
log(npoints / (LDouble) xcount) - (0.5 * log(PIx2 * variance))
|
|
261
264
|
);
|
|
262
265
|
} else {
|
|
263
266
|
loglikelihood += npoints * log(1.0 / bin_width / xcount);
|
|
@@ -280,6 +283,214 @@ uint32_t find_koptimal(State state)
|
|
|
280
283
|
return kopt;
|
|
281
284
|
}
|
|
282
285
|
|
|
286
|
+
uint32_t find_koptimal_gmm(State state)
|
|
287
|
+
{
|
|
288
|
+
uint32_t kmin = state.kmin;
|
|
289
|
+
uint32_t kmax = state.kmax;
|
|
290
|
+
uint32_t xcount = state.xcount;
|
|
291
|
+
|
|
292
|
+
if (kmin > kmax || xcount < 2) {
|
|
293
|
+
return (kmin < kmax) ? kmin : kmax;
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
Arena *arena = state.arena;
|
|
297
|
+
VectorF *xsorted = state.xsorted;
|
|
298
|
+
uint32_t kopt = kmin;
|
|
299
|
+
LDouble max_bic = 0.0;
|
|
300
|
+
LDouble log_xcount = log((LDouble) xcount);
|
|
301
|
+
VectorF *lambda = vector_create_f(arena, kmax);
|
|
302
|
+
VectorF *mu = vector_create_f(arena, kmax);
|
|
303
|
+
VectorF *sigma2 = vector_create_f(arena, kmax);
|
|
304
|
+
VectorF *coeff = vector_create_f(arena, kmax);
|
|
305
|
+
VectorI *sizes = vector_create_i(arena, kmax);
|
|
306
|
+
|
|
307
|
+
for (uint32_t kouter = kmin; kouter <= kmax; ++kouter)
|
|
308
|
+
{
|
|
309
|
+
uint32_t ileft = 0;
|
|
310
|
+
uint32_t iright;
|
|
311
|
+
|
|
312
|
+
backtrack_sizes(state, sizes, kouter);
|
|
313
|
+
|
|
314
|
+
for (uint32_t k = 0; k < kouter; ++k)
|
|
315
|
+
{
|
|
316
|
+
uint32_t size = vector_get_i(sizes, k);
|
|
317
|
+
vector_set_f(lambda, k, size / (LDouble) xcount);
|
|
318
|
+
iright = ileft + size - 1;
|
|
319
|
+
SegmentStats stats = shifted_data_variance(xsorted, ileft, iright);
|
|
320
|
+
|
|
321
|
+
vector_set_f(mu, k, stats.mean);
|
|
322
|
+
vector_set_f(sigma2, k, stats.variance);
|
|
323
|
+
|
|
324
|
+
if (stats.variance == 0 || size == 1) {
|
|
325
|
+
LDouble dmin;
|
|
326
|
+
|
|
327
|
+
if (ileft > 0 && iright < xcount - 1) {
|
|
328
|
+
LDouble left_diff = vector_get_diff_f(xsorted, ileft, ileft - 1);
|
|
329
|
+
LDouble right_diff = vector_get_diff_f(xsorted, iright + 1, iright);
|
|
330
|
+
|
|
331
|
+
dmin = (left_diff < right_diff) ? left_diff : right_diff;
|
|
332
|
+
} else if (ileft > 0) {
|
|
333
|
+
dmin = vector_get_diff_f(xsorted, ileft, ileft - 1);
|
|
334
|
+
} else {
|
|
335
|
+
dmin = vector_get_diff_f(xsorted, iright + 1, iright);
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
if (stats.variance == 0) vector_set_f(sigma2, k, dmin * dmin / 4.0 / 9.0);
|
|
339
|
+
if (size == 1) vector_set_f(sigma2, k, dmin * dmin);
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
LDouble lambda_k = vector_get_f(lambda, k);
|
|
343
|
+
LDouble sigma2_k = vector_get_f(sigma2, k);
|
|
344
|
+
vector_set_f(coeff, k, lambda_k / sqrt(PIx2 * sigma2_k));
|
|
345
|
+
ileft = iright + 1;
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
LDouble loglikelihood = 0.0;
|
|
349
|
+
|
|
350
|
+
for (uint32_t i = 0; i < xcount; ++i)
|
|
351
|
+
{
|
|
352
|
+
LDouble L = 0.0;
|
|
353
|
+
LDouble xi = vector_get_f(xsorted, i);
|
|
354
|
+
|
|
355
|
+
for (uint32_t k = 0; k < kouter; ++k)
|
|
356
|
+
{
|
|
357
|
+
LDouble coeff_k = vector_get_f(coeff, k);
|
|
358
|
+
LDouble mu_k = vector_get_f(mu, k);
|
|
359
|
+
LDouble sigma2_k = vector_get_f(sigma2, k);
|
|
360
|
+
LDouble x_mu_diff = xi - mu_k;
|
|
361
|
+
L += coeff_k * exp(- x_mu_diff * x_mu_diff / (2.0 * sigma2_k));
|
|
362
|
+
}
|
|
363
|
+
loglikelihood += log(L);
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
LDouble bic = 2 * loglikelihood - (3 * kouter - 1) * log_xcount;
|
|
367
|
+
|
|
368
|
+
if (kouter == kmin) {
|
|
369
|
+
max_bic = bic;
|
|
370
|
+
kopt = kmin;
|
|
371
|
+
} else {
|
|
372
|
+
if (bic > max_bic) {
|
|
373
|
+
max_bic = bic;
|
|
374
|
+
kopt = kouter;
|
|
375
|
+
}
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
return kopt;
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
uint32_t find_koptimal_lmm(State state)
|
|
382
|
+
{
|
|
383
|
+
uint32_t kmin = state.kmin;
|
|
384
|
+
uint32_t kmax = state.kmax;
|
|
385
|
+
uint32_t xcount = state.xcount;
|
|
386
|
+
|
|
387
|
+
if (kmin > kmax || xcount < 2) {
|
|
388
|
+
return (kmin < kmax) ? kmin : kmax;
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
Arena *arena = state.arena;
|
|
392
|
+
VectorF *xsorted = state.xsorted;
|
|
393
|
+
uint32_t kopt = kmin;
|
|
394
|
+
LDouble max_bic = 0.0;
|
|
395
|
+
LDouble log_xcount = log((LDouble) xcount);
|
|
396
|
+
VectorF *lambda = vector_create_f(arena, kmax);
|
|
397
|
+
VectorF *mu = vector_create_f(arena, kmax); /* median */
|
|
398
|
+
VectorF *scale = vector_create_f(arena, kmax); /* MAD (mean absolute deviation) */
|
|
399
|
+
VectorF *coeff = vector_create_f(arena, kmax);
|
|
400
|
+
VectorI *sizes = vector_create_i(arena, kmax);
|
|
401
|
+
|
|
402
|
+
for (uint32_t kouter = kmin; kouter <= kmax; ++kouter)
|
|
403
|
+
{
|
|
404
|
+
uint32_t ileft = 0;
|
|
405
|
+
uint32_t iright;
|
|
406
|
+
|
|
407
|
+
backtrack_sizes(state, sizes, kouter);
|
|
408
|
+
|
|
409
|
+
for (uint32_t k = 0; k < kouter; ++k)
|
|
410
|
+
{
|
|
411
|
+
uint32_t size = vector_get_i(sizes, k);
|
|
412
|
+
vector_set_f(lambda, k, size / (LDouble) xcount);
|
|
413
|
+
iright = ileft + size - 1;
|
|
414
|
+
|
|
415
|
+
uint32_t median_idx = (ileft + iright) / 2;
|
|
416
|
+
LDouble median;
|
|
417
|
+
if ((size % 2) == 1) {
|
|
418
|
+
median = vector_get_f(xsorted, median_idx);
|
|
419
|
+
} else {
|
|
420
|
+
median = (vector_get_f(xsorted, median_idx) + vector_get_f(xsorted, median_idx + 1)) / 2.0;
|
|
421
|
+
}
|
|
422
|
+
vector_set_f(mu, k, median);
|
|
423
|
+
|
|
424
|
+
LDouble mad = 0.0;
|
|
425
|
+
for (uint32_t i = ileft; i <= iright; ++i) {
|
|
426
|
+
LDouble xi = vector_get_f(xsorted, i);
|
|
427
|
+
mad += fabs(xi - median);
|
|
428
|
+
}
|
|
429
|
+
mad = mad / size;
|
|
430
|
+
vector_set_f(scale, k, mad);
|
|
431
|
+
|
|
432
|
+
/* Handle edge case: MAD = 0 (all points are the same) or size = 1 */
|
|
433
|
+
if (mad == 0 || size == 1) {
|
|
434
|
+
LDouble dmin;
|
|
435
|
+
|
|
436
|
+
if (ileft > 0 && iright < xcount - 1) {
|
|
437
|
+
LDouble left_diff = vector_get_diff_f(xsorted, ileft, ileft - 1);
|
|
438
|
+
LDouble right_diff = vector_get_diff_f(xsorted, iright + 1, iright);
|
|
439
|
+
|
|
440
|
+
dmin = (left_diff < right_diff) ? left_diff : right_diff;
|
|
441
|
+
} else if (ileft > 0) {
|
|
442
|
+
dmin = vector_get_diff_f(xsorted, ileft, ileft - 1);
|
|
443
|
+
} else {
|
|
444
|
+
dmin = vector_get_diff_f(xsorted, iright + 1, iright);
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
if (mad == 0) vector_set_f(scale, k, dmin / 6.0);
|
|
448
|
+
if (size == 1) vector_set_f(scale, k, dmin);
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
/* Laplace coefficient: lambda_k / (2 * b_k) */
|
|
452
|
+
LDouble lambda_k = vector_get_f(lambda, k);
|
|
453
|
+
LDouble scale_k = vector_get_f(scale, k);
|
|
454
|
+
vector_set_f(coeff, k, lambda_k / (2.0 * scale_k));
|
|
455
|
+
ileft = iright + 1;
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
LDouble loglikelihood = 0.0;
|
|
459
|
+
|
|
460
|
+
for (uint32_t i = 0; i < xcount; ++i)
|
|
461
|
+
{
|
|
462
|
+
LDouble L = 0.0;
|
|
463
|
+
LDouble xi = vector_get_f(xsorted, i);
|
|
464
|
+
|
|
465
|
+
for (uint32_t k = 0; k < kouter; ++k)
|
|
466
|
+
{
|
|
467
|
+
LDouble coeff_k = vector_get_f(coeff, k);
|
|
468
|
+
LDouble mu_k = vector_get_f(mu, k);
|
|
469
|
+
LDouble scale_k = vector_get_f(scale, k);
|
|
470
|
+
LDouble x_mu_abs = fabs(xi - mu_k);
|
|
471
|
+
/* Laplace PDF: (1/(2b)) * exp(-|x-μ|/b) */
|
|
472
|
+
L += coeff_k * exp(-x_mu_abs / scale_k);
|
|
473
|
+
}
|
|
474
|
+
loglikelihood += log(L);
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
/* BIC = 2*logL - (3k-1)*log(n) */
|
|
478
|
+
/* Parameters: k-1 mixing proportions + k medians + k scales = 3k-1 */
|
|
479
|
+
LDouble bic = 2 * loglikelihood - (3 * kouter - 1) * log_xcount;
|
|
480
|
+
|
|
481
|
+
if (kouter == kmin) {
|
|
482
|
+
max_bic = bic;
|
|
483
|
+
kopt = kmin;
|
|
484
|
+
} else {
|
|
485
|
+
if (bic > max_bic) {
|
|
486
|
+
max_bic = bic;
|
|
487
|
+
kopt = kouter;
|
|
488
|
+
}
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
return kopt;
|
|
492
|
+
}
|
|
493
|
+
|
|
283
494
|
VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
|
|
284
495
|
{
|
|
285
496
|
MatrixI *splits = state.splits;
|
|
@@ -287,12 +498,12 @@ VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
|
|
|
287
498
|
uint32_t right = xcount - 1;
|
|
288
499
|
uint32_t left = 0;
|
|
289
500
|
|
|
290
|
-
|
|
501
|
+
/* Common case works with `i` remaining unsigned and unconditional assignment of the next `left` and `right` */
|
|
291
502
|
for (uint32_t i = k - 1; i > 0; i--, right = left - 1) {
|
|
292
503
|
left = matrix_get_i(splits, i, right);
|
|
293
504
|
vector_set_i(sizes, i, right - left + 1);
|
|
294
505
|
}
|
|
295
|
-
|
|
506
|
+
/* Special case outside of the loop removing the need for conditionals */
|
|
296
507
|
left = matrix_get_i(splits, 0, right);
|
|
297
508
|
vector_set_i(sizes, 0, right - left + 1);
|
|
298
509
|
|
|
@@ -416,12 +627,12 @@ inline void fill_even_positions(State state, RowParams rparams, VectorI *split_c
|
|
|
416
627
|
|
|
417
628
|
inline void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
|
|
418
629
|
{
|
|
419
|
-
const uint32_t row
|
|
420
|
-
const uint32_t imin
|
|
421
|
-
const uint32_t imax
|
|
422
|
-
const uint32_t istep
|
|
423
|
-
MatrixF *const cost
|
|
424
|
-
MatrixI *const splits
|
|
630
|
+
const uint32_t row = rparams.row;
|
|
631
|
+
const uint32_t imin = rparams.imin;
|
|
632
|
+
const uint32_t imax = rparams.imax;
|
|
633
|
+
const uint32_t istep = rparams.istep;
|
|
634
|
+
MatrixF *const cost = state.cost;
|
|
635
|
+
MatrixI *const splits = state.splits;
|
|
425
636
|
FnDissim *const dissim = state.dissim;
|
|
426
637
|
|
|
427
638
|
uint32_t optimal_split_idx_prev = 0;
|
|
@@ -723,7 +934,7 @@ Arena *arena_create(size_t capacity) {
|
|
|
723
934
|
}
|
|
724
935
|
|
|
725
936
|
void *arena_alloc(Arena *arena, size_t size) {
|
|
726
|
-
size = (size +
|
|
937
|
+
size = (size + 0xf) & ~0xf;
|
|
727
938
|
|
|
728
939
|
if (arena->offset + size > arena->capacity) {
|
|
729
940
|
rb_raise(rb_eNoMemError, "Arena Insufficient Capacity");
|
data/lib/ckmeans/clusterer.rb
CHANGED
|
@@ -1,19 +1,37 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Ckmeans
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
# Optimal k-means clustering for univariate (1D) data using dynamic programming.
|
|
5
|
+
# Minimizes within-cluster sum of squared distances (L2 norm).
|
|
6
|
+
class Clusterer
|
|
7
|
+
# Creates a new Ckmeans clusterer.
|
|
8
|
+
#
|
|
9
|
+
# @param entries [Array<Numeric>] The data points to cluster
|
|
10
|
+
# @param kmin [Integer] Minimum number of clusters to consider
|
|
11
|
+
# @param kmax [Integer] Maximum number of clusters to consider (defaults to kmin for fixed K)
|
|
12
|
+
# @param kestimate [Symbol] Method for estimating optimal K:
|
|
13
|
+
# - :fast - Quick heuristic using implicit Gaussian assumption (best for large datasets)
|
|
14
|
+
# - :stable - Model-based estimation using Gaussian Mixture Model (better for duplicates/edge cases)
|
|
15
|
+
# - :gmm - Alias for :stable (Gaussian Mixture Model)
|
|
16
|
+
#
|
|
17
|
+
# @example Fixed number of clusters
|
|
18
|
+
# Ckmeans::Clusterer.new([1, 2, 3, 100, 101], 2).clusters
|
|
19
|
+
# # => [[1, 2, 3], [100, 101]]
|
|
20
|
+
#
|
|
21
|
+
# @example Automatic K selection with stable estimation
|
|
22
|
+
# Ckmeans::Clusterer.new([1, 1, 1, 5, 5, 5, 10, 10, 10], 1, 5, :stable).clusters
|
|
23
|
+
def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
|
|
6
24
|
@xcount = entries.size
|
|
7
25
|
|
|
8
26
|
raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
|
|
9
27
|
raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
|
|
10
28
|
|
|
11
|
-
@kmin
|
|
12
|
-
@unique_xcount
|
|
13
|
-
@kmax
|
|
14
|
-
@xsorted_original
|
|
15
|
-
@xsorted
|
|
16
|
-
@
|
|
29
|
+
@kmin = kmin
|
|
30
|
+
@unique_xcount = entries.uniq.size
|
|
31
|
+
@kmax = [@unique_xcount, kmax].min
|
|
32
|
+
@xsorted_original = entries.sort
|
|
33
|
+
@xsorted = @xsorted_original.map(&:to_f)
|
|
34
|
+
@use_gmm = %i[gmm stable].include?(kestimate)
|
|
17
35
|
end
|
|
18
36
|
|
|
19
37
|
def clusters
|
data/lib/ckmeans/version.rb
CHANGED
data/lib/ckmedian/clusterer.rb
CHANGED
|
@@ -1,8 +1,28 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Ckmedian
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
# Optimal k-median clustering for univariate (1D) data using dynamic programming.
|
|
5
|
+
# Minimizes within-cluster sum of absolute deviations (L1 norm).
|
|
6
|
+
# More robust to outliers than k-means.
|
|
7
|
+
class Clusterer
|
|
8
|
+
# Creates a new Ckmedian clusterer.
|
|
9
|
+
#
|
|
10
|
+
# @param entries [Array<Numeric>] The data points to cluster
|
|
11
|
+
# @param kmin [Integer] Minimum number of clusters to consider
|
|
12
|
+
# @param kmax [Integer] Maximum number of clusters to consider (defaults to kmin for fixed K)
|
|
13
|
+
# @param kestimate [Symbol] Method for estimating optimal K:
|
|
14
|
+
# - :fast - Quick heuristic using implicit Gaussian assumption (best for large datasets)
|
|
15
|
+
# - :stable - Model-based estimation using Laplace Mixture Model (better for outliers/bursts)
|
|
16
|
+
# - :lmm - Alias for :stable (Laplace Mixture Model)
|
|
17
|
+
#
|
|
18
|
+
# @example Fixed number of clusters
|
|
19
|
+
# Ckmedian::Clusterer.new([1, 2, 3, 100, 101], 2).clusters
|
|
20
|
+
# # => [[1, 2, 3], [100, 101]]
|
|
21
|
+
#
|
|
22
|
+
# @example Photo timeline clustering (robust to bursts and outliers)
|
|
23
|
+
# timestamps = photos.map(&:taken_at).map(&:to_i)
|
|
24
|
+
# Ckmedian::Clusterer.new(timestamps, 1, 20, :stable).clusters
|
|
25
|
+
def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
|
|
6
26
|
@xcount = entries.size
|
|
7
27
|
|
|
8
28
|
raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
|
|
@@ -13,6 +33,7 @@ module Ckmedian
|
|
|
13
33
|
@kmax = [@unique_xcount, kmax].min
|
|
14
34
|
@xsorted_original = entries.sort
|
|
15
35
|
@xsorted = @xsorted_original.map(&:to_f)
|
|
36
|
+
@use_lmm = %i[lmm stable].include?(kestimate)
|
|
16
37
|
end
|
|
17
38
|
|
|
18
39
|
def clusters
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ckmeans
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 2.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vlad Lebedev
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-
|
|
11
|
+
date: 2025-12-14 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Repeatable clustering of unidimensional data
|
|
14
14
|
email:
|