ckmeans 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +33 -14
- data/ext/ckmeans/extensions.c +127 -8
- data/lib/ckmeans/clusterer.rb +20 -2
- data/lib/ckmeans/version.rb +1 -1
- data/lib/ckmedian/clusterer.rb +23 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '08439cddf5410f952a06263d423297de219c86927dea2d0c4538916d9d0c70fd'
|
|
4
|
+
data.tar.gz: 94b1cece717f8538945208519c94046881f2d42c7ea9ba9bd453d063eef39878
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 955d8b19a2a33d46b6adebfa05c9460784f5838d1a5d6d8c82e447be2eacb95da3a2711aebacf96429bf36fafe4d46dfc98fe87025cce4373c734c1cc6a60fb9
|
|
7
|
+
data.tar.gz: 8887346eb2602c071923ade05a549a0eb3df44491538fb79155bd766429dde0084847231e1a4f2dcf6e5e3c14f6dca309f2399339abe378becd4b21d8bc70aa7
|
data/README.md
CHANGED
|
@@ -18,33 +18,52 @@ gem install ckmeans
|
|
|
18
18
|
|
|
19
19
|
## Usage
|
|
20
20
|
|
|
21
|
-
###
|
|
21
|
+
### Basic Clustering
|
|
22
22
|
|
|
23
23
|
```rb
|
|
24
|
-
# Fixed cluster count
|
|
25
|
-
Ckmeans::Clusterer(data,
|
|
26
|
-
Ckmedian::Clusterer(data,
|
|
24
|
+
# Fixed cluster count (K known in advance)
|
|
25
|
+
Ckmeans::Clusterer.new(data, 3).clusters
|
|
26
|
+
Ckmedian::Clusterer.new(data, 3).clusters
|
|
27
|
+
|
|
28
|
+
# Automatic K selection (tries K from kmin to kmax, picks optimal)
|
|
29
|
+
Ckmeans::Clusterer.new(data, 1, 10).clusters
|
|
30
|
+
Ckmedian::Clusterer.new(data, 1, 10).clusters
|
|
27
31
|
```
|
|
28
32
|
|
|
29
|
-
###
|
|
33
|
+
### Choosing Between Ckmeans and Ckmedian
|
|
34
|
+
|
|
35
|
+
- **Ckmeans** - Minimizes squared distances (L2). Good for normally distributed data.
|
|
36
|
+
- **Ckmedian** - Minimizes absolute distances (L1). More robust to outliers and data bursts.
|
|
30
37
|
|
|
31
38
|
```rb
|
|
32
|
-
|
|
33
|
-
|
|
39
|
+
# For clean numerical data
|
|
40
|
+
temperatures = [20.1, 20.2, 25.5, 25.6, 30.1, 30.2]
|
|
41
|
+
Ckmeans::Clusterer.new(temperatures, 1, 5).clusters
|
|
42
|
+
# => [[20.1, 20.2], [25.5, 25.6], [30.1, 30.2]]
|
|
43
|
+
|
|
44
|
+
# For data with outliers (e.g., photo timestamps with bursts)
|
|
45
|
+
timestamps = photos.map(&:taken_at).map(&:to_i)
|
|
46
|
+
Ckmedian::Clusterer.new(timestamps, 1, 20).clusters
|
|
34
47
|
```
|
|
35
48
|
|
|
36
|
-
###
|
|
49
|
+
### Stable Estimation (Recommended for Edge Cases)
|
|
37
50
|
|
|
38
|
-
For
|
|
39
|
-
For relatively small sets or sets with many duplicates use Gaussian Mixture Model (GMM)-based estimation.
|
|
40
|
-
It works slower but is more resilient for various data patterns like big numbers of duplicates or clusters with different
|
|
41
|
-
numbers of elements.
|
|
51
|
+
By default, both algorithms use a fast heuristic for estimating K. For datasets with many duplicates, tight clusters, or outliers, use `:stable` for more robust estimation:
|
|
42
52
|
|
|
43
53
|
```rb
|
|
44
|
-
|
|
45
|
-
|
|
54
|
+
# Stable estimation (uses statistical mixture models)
|
|
55
|
+
Ckmeans::Clusterer.new(data, 1, 10, :stable).clusters
|
|
56
|
+
Ckmedian::Clusterer.new(data, 1, 10, :stable).clusters
|
|
46
57
|
```
|
|
47
58
|
|
|
59
|
+
**When to use `:stable`:**
|
|
60
|
+
- Small to medium datasets (< 1000 points)
|
|
61
|
+
- Many duplicate values
|
|
62
|
+
- Clusters with very different sizes
|
|
63
|
+
- Photo/event timeline clustering (bursts and gaps)
|
|
64
|
+
|
|
65
|
+
**Expert users:** `:stable` is an alias for `:gmm` (Gaussian Mixture Model) in Ckmeans and `:lmm` (Laplace Mixture Model) in Ckmedian.
|
|
66
|
+
|
|
48
67
|
## License
|
|
49
68
|
|
|
50
69
|
The gem is available as open source under the terms of the [LGPL v3 License](https://opensource.org/license/lgpl-3-0).
|
data/ext/ckmeans/extensions.c
CHANGED
|
@@ -55,6 +55,8 @@ typedef struct RowParams {
|
|
|
55
55
|
uint32_t istep;
|
|
56
56
|
} RowParams;
|
|
57
57
|
|
|
58
|
+
typedef uint32_t (FnFindKOptimal)(State);
|
|
59
|
+
|
|
58
60
|
typedef struct {
|
|
59
61
|
LDouble mean;
|
|
60
62
|
LDouble variance;
|
|
@@ -62,7 +64,7 @@ typedef struct {
|
|
|
62
64
|
|
|
63
65
|
VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
|
|
64
66
|
VALUE rb_ckmedian_sorted_group_sizes(VALUE self);
|
|
65
|
-
VALUE rb_sorted_group_sizes(VALUE self, FnDissim*);
|
|
67
|
+
VALUE rb_sorted_group_sizes(VALUE self, FnDissim*, FnFindKOptimal*);
|
|
66
68
|
|
|
67
69
|
Arena *arena_create(size_t);
|
|
68
70
|
void *arena_alloc(Arena*, size_t);
|
|
@@ -100,6 +102,7 @@ SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
|
|
|
100
102
|
VectorI *backtrack_sizes(State, VectorI*, uint32_t);
|
|
101
103
|
uint32_t find_koptimal_fast(State);
|
|
102
104
|
uint32_t find_koptimal_gmm(State);
|
|
105
|
+
uint32_t find_koptimal_lmm(State);
|
|
103
106
|
|
|
104
107
|
void Init_extensions(void) {
|
|
105
108
|
VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
|
|
@@ -117,20 +120,23 @@ void Init_extensions(void) {
|
|
|
117
120
|
|
|
118
121
|
/*
 * Ruby-facing entry point for k-means (L2) clustering.
 *
 * Reads the receiver's @use_gmm instance variable to pick the K estimator:
 * find_koptimal_gmm when it is truthy, find_koptimal_fast otherwise. The
 * actual work is delegated to rb_sorted_group_sizes with the L2 dissimilarity.
 */
VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
{
    FnFindKOptimal *estimator = find_koptimal_fast;

    if (RTEST(rb_iv_get(self, "@use_gmm"))) {
        estimator = find_koptimal_gmm;
    }

    return rb_sorted_group_sizes(self, dissimilarity_l2, estimator);
}
|
|
122
127
|
|
|
123
128
|
/*
 * Ruby-facing entry point for k-median (L1) clustering.
 *
 * Reads the receiver's @use_lmm instance variable to pick the K estimator:
 * find_koptimal_lmm when it is truthy, find_koptimal_fast otherwise. The
 * actual work is delegated to rb_sorted_group_sizes with the L1 dissimilarity.
 */
VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
{
    FnFindKOptimal *estimator = find_koptimal_fast;

    if (RTEST(rb_iv_get(self, "@use_lmm"))) {
        estimator = find_koptimal_lmm;
    }

    return rb_sorted_group_sizes(self, dissimilarity_l1, estimator);
}
|
|
127
134
|
|
|
128
|
-
VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
|
|
135
|
+
VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria, FnFindKOptimal *find_koptimal)
|
|
129
136
|
{
|
|
130
137
|
uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
|
|
131
138
|
uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
|
|
132
139
|
uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
|
|
133
|
-
bool use_gmm = RTEST(rb_iv_get(self, "@use_gmm"));
|
|
134
140
|
VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
|
|
135
141
|
size_t capacity = sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
|
|
136
142
|
Arena *arena = arena_create(capacity);
|
|
@@ -185,7 +191,7 @@ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
|
|
|
185
191
|
fill_row(state, q, imin, xcount - 1);
|
|
186
192
|
}
|
|
187
193
|
|
|
188
|
-
uint32_t koptimal =
|
|
194
|
+
uint32_t koptimal = find_koptimal(state);
|
|
189
195
|
|
|
190
196
|
VectorI *sizes = vector_create_i(arena, koptimal);
|
|
191
197
|
backtrack_sizes(state, sizes, koptimal);
|
|
@@ -372,6 +378,119 @@ uint32_t find_koptimal_gmm(State state)
|
|
|
372
378
|
return kopt;
|
|
373
379
|
}
|
|
374
380
|
|
|
381
|
+
/*
 * Picks the cluster count in [kmin, kmax] by scoring each candidate with the
 * BIC of a Laplace mixture fitted to its segmentation.
 *
 * For every candidate k the segment sizes are recovered with
 * backtrack_sizes(). Each segment then contributes:
 *   - a mixing proportion  lambda_k = size / xcount
 *   - a location           mu_k     = median of the segment
 *   - a scale              b_k      = mean absolute deviation from that median
 *
 * The candidate with the largest BIC wins; ties keep the smaller k.
 *
 * Returns the smaller of kmin and kmax without fitting anything when the
 * range is empty, when it holds a single candidate, or when there are fewer
 * than two data points.
 */
uint32_t find_koptimal_lmm(State state)
{
    uint32_t kmin   = state.kmin;
    uint32_t kmax   = state.kmax;
    uint32_t xcount = state.xcount;

    /* With kmin == kmax the loop below could only ever return kmin, so skip the fitting. */
    if (kmin >= kmax || xcount < 2) {
        return (kmin < kmax) ? kmin : kmax;
    }

    Arena   *arena      = state.arena;
    VectorF *xsorted    = state.xsorted;
    uint32_t kopt       = kmin;
    LDouble  max_bic    = 0.0;
    LDouble  log_xcount = log((LDouble) xcount);
    VectorF *lambda     = vector_create_f(arena, kmax); /* mixing proportions */
    VectorF *mu         = vector_create_f(arena, kmax); /* segment medians */
    VectorF *scale      = vector_create_f(arena, kmax); /* mean absolute deviation from the median */
    VectorF *coeff      = vector_create_f(arena, kmax); /* lambda_k / (2 * scale_k) */
    VectorI *sizes      = vector_create_i(arena, kmax);

    for (uint32_t kouter = kmin; kouter <= kmax; ++kouter)
    {
        uint32_t ileft = 0;

        backtrack_sizes(state, sizes, kouter);

        for (uint32_t k = 0; k < kouter; ++k)
        {
            uint32_t size   = vector_get_i(sizes, k);
            uint32_t iright = ileft + size - 1;

            vector_set_f(lambda, k, size / (LDouble) xcount);

            /* Odd-sized segments have a middle element; even-sized ones average the two central elements. */
            uint32_t median_idx = (ileft + iright) / 2;
            LDouble median;
            if ((size % 2) == 1) {
                median = vector_get_f(xsorted, median_idx);
            } else {
                median = (vector_get_f(xsorted, median_idx) + vector_get_f(xsorted, median_idx + 1)) / 2.0;
            }
            vector_set_f(mu, k, median);

            LDouble mad = 0.0;
            for (uint32_t i = ileft; i <= iright; ++i) {
                LDouble xi = vector_get_f(xsorted, i);
                mad += fabs(xi - median);
            }
            mad = mad / size;
            vector_set_f(scale, k, mad);

            /*
             * Degenerate segment (zero spread or a single point): derive the
             * scale from the gap to the nearest neighbouring segment instead.
             */
            if (mad == 0 || size == 1) {
                bool has_left  = ileft > 0;
                bool has_right = iright < xcount - 1;
                LDouble dmin;

                if (has_left && has_right) {
                    LDouble left_diff  = vector_get_diff_f(xsorted, ileft, ileft - 1);
                    LDouble right_diff = vector_get_diff_f(xsorted, iright + 1, iright);

                    dmin = (left_diff < right_diff) ? left_diff : right_diff;
                } else if (has_left) {
                    dmin = vector_get_diff_f(xsorted, ileft, ileft - 1);
                } else if (has_right) {
                    dmin = vector_get_diff_f(xsorted, iright + 1, iright);
                } else {
                    /*
                     * The segment spans the whole array, so there is no
                     * neighbour; reading xsorted[iright + 1] here would run
                     * past the end of the data. Any positive scale keeps the
                     * likelihood finite; 1.0 is an arbitrary unit scale.
                     */
                    dmin = 1.0;
                }

                /* A single point takes the full gap; a zero-spread group takes a sixth of it. */
                /* NOTE(review): a zero gap would still yield a zero scale — confirm segments never share a boundary value. */
                vector_set_f(scale, k, (size == 1) ? dmin : dmin / 6.0);
            }

            /* Laplace coefficient: lambda_k / (2 * b_k) */
            LDouble lambda_k = vector_get_f(lambda, k);
            LDouble scale_k  = vector_get_f(scale, k);
            vector_set_f(coeff, k, lambda_k / (2.0 * scale_k));

            ileft = iright + 1;
        }

        LDouble loglikelihood = 0.0;

        for (uint32_t i = 0; i < xcount; ++i)
        {
            LDouble L  = 0.0;
            LDouble xi = vector_get_f(xsorted, i);

            for (uint32_t k = 0; k < kouter; ++k)
            {
                LDouble coeff_k  = vector_get_f(coeff, k);
                LDouble mu_k     = vector_get_f(mu, k);
                LDouble scale_k  = vector_get_f(scale, k);
                LDouble x_mu_abs = fabs(xi - mu_k);
                /* Laplace PDF: (1/(2b)) * exp(-|x-mu|/b), weighted by lambda_k via coeff_k */
                L += coeff_k * exp(-x_mu_abs / scale_k);
            }
            loglikelihood += log(L);
        }

        /* BIC = 2*logL - (3k-1)*log(n) */
        /* Parameters: k-1 mixing proportions + k medians + k scales = 3k-1 */
        LDouble bic = 2 * loglikelihood - (3 * kouter - 1) * log_xcount;

        /* The first candidate seeds the maximum; later ones must strictly beat it. */
        if (kouter == kmin) {
            max_bic = bic;
            kopt = kmin;
        } else if (bic > max_bic) {
            max_bic = bic;
            kopt = kouter;
        }
    }

    return kopt;
}
|
|
493
|
+
|
|
375
494
|
VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
|
|
376
495
|
{
|
|
377
496
|
MatrixI *splits = state.splits;
|
|
@@ -379,12 +498,12 @@ VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
|
|
|
379
498
|
uint32_t right = xcount - 1;
|
|
380
499
|
uint32_t left = 0;
|
|
381
500
|
|
|
382
|
-
|
|
501
|
+
/* Common case works with `i` remaining unsigned and unconditional assignment of the next `left` and `right` */
|
|
383
502
|
for (uint32_t i = k - 1; i > 0; i--, right = left - 1) {
|
|
384
503
|
left = matrix_get_i(splits, i, right);
|
|
385
504
|
vector_set_i(sizes, i, right - left + 1);
|
|
386
505
|
}
|
|
387
|
-
|
|
506
|
+
/* Special case outside of the loop removing the need for conditionals */
|
|
388
507
|
left = matrix_get_i(splits, 0, right);
|
|
389
508
|
vector_set_i(sizes, 0, right - left + 1);
|
|
390
509
|
|
data/lib/ckmeans/clusterer.rb
CHANGED
|
@@ -1,7 +1,25 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Ckmeans
|
|
4
|
-
|
|
4
|
+
# Optimal k-means clustering for univariate (1D) data using dynamic programming.
|
|
5
|
+
# Minimizes within-cluster sum of squared distances (L2 norm).
|
|
6
|
+
class Clusterer
|
|
7
|
+
# Creates a new Ckmeans clusterer.
|
|
8
|
+
#
|
|
9
|
+
# @param entries [Array<Numeric>] The data points to cluster
|
|
10
|
+
# @param kmin [Integer] Minimum number of clusters to consider
|
|
11
|
+
# @param kmax [Integer] Maximum number of clusters to consider (defaults to kmin for fixed K)
|
|
12
|
+
# @param kestimate [Symbol] Method for estimating optimal K:
|
|
13
|
+
# - :fast - Quick heuristic using implicit Gaussian assumption (best for large datasets)
|
|
14
|
+
# - :stable - Model-based estimation using Gaussian Mixture Model (better for duplicates/edge cases)
|
|
15
|
+
# - :gmm - Alias for :stable (Gaussian Mixture Model)
|
|
16
|
+
#
|
|
17
|
+
# @example Fixed number of clusters
|
|
18
|
+
# Ckmeans::Clusterer.new([1, 2, 3, 100, 101], 2).clusters
|
|
19
|
+
# # => [[1, 2, 3], [100, 101]]
|
|
20
|
+
#
|
|
21
|
+
# @example Automatic K selection with stable estimation
|
|
22
|
+
# Ckmeans::Clusterer.new([1, 1, 1, 5, 5, 5, 10, 10, 10], 1, 5, :stable).clusters
|
|
5
23
|
def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
|
|
6
24
|
@xcount = entries.size
|
|
7
25
|
|
|
@@ -13,7 +31,7 @@ module Ckmeans
|
|
|
13
31
|
@kmax = [@unique_xcount, kmax].min
|
|
14
32
|
@xsorted_original = entries.sort
|
|
15
33
|
@xsorted = @xsorted_original.map(&:to_f)
|
|
16
|
-
@use_gmm = kestimate
|
|
34
|
+
@use_gmm = %i[gmm stable].include?(kestimate)
|
|
17
35
|
end
|
|
18
36
|
|
|
19
37
|
def clusters
|
data/lib/ckmeans/version.rb
CHANGED
data/lib/ckmedian/clusterer.rb
CHANGED
|
@@ -1,8 +1,28 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Ckmedian
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
# Optimal k-median clustering for univariate (1D) data using dynamic programming.
|
|
5
|
+
# Minimizes within-cluster sum of absolute deviations (L1 norm).
|
|
6
|
+
# More robust to outliers than k-means.
|
|
7
|
+
class Clusterer
|
|
8
|
+
# Creates a new Ckmedian clusterer.
|
|
9
|
+
#
|
|
10
|
+
# @param entries [Array<Numeric>] The data points to cluster
|
|
11
|
+
# @param kmin [Integer] Minimum number of clusters to consider
|
|
12
|
+
# @param kmax [Integer] Maximum number of clusters to consider (defaults to kmin for fixed K)
|
|
13
|
+
# @param kestimate [Symbol] Method for estimating optimal K:
|
|
14
|
+
# - :fast - Quick heuristic using implicit Gaussian assumption (best for large datasets)
|
|
15
|
+
# - :stable - Model-based estimation using Laplace Mixture Model (better for outliers/bursts)
|
|
16
|
+
# - :lmm - Alias for :stable (Laplace Mixture Model)
|
|
17
|
+
#
|
|
18
|
+
# @example Fixed number of clusters
|
|
19
|
+
# Ckmedian::Clusterer.new([1, 2, 3, 100, 101], 2).clusters
|
|
20
|
+
# # => [[1, 2, 3], [100, 101]]
|
|
21
|
+
#
|
|
22
|
+
# @example Photo timeline clustering (robust to bursts and outliers)
|
|
23
|
+
# timestamps = photos.map(&:taken_at).map(&:to_i)
|
|
24
|
+
# Ckmedian::Clusterer.new(timestamps, 1, 20, :stable).clusters
|
|
25
|
+
def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
|
|
6
26
|
@xcount = entries.size
|
|
7
27
|
|
|
8
28
|
raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
|
|
@@ -13,6 +33,7 @@ module Ckmedian
|
|
|
13
33
|
@kmax = [@unique_xcount, kmax].min
|
|
14
34
|
@xsorted_original = entries.sort
|
|
15
35
|
@xsorted = @xsorted_original.map(&:to_f)
|
|
36
|
+
@use_lmm = %i[lmm stable].include?(kestimate)
|
|
16
37
|
end
|
|
17
38
|
|
|
18
39
|
def clusters
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ckmeans
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.
|
|
4
|
+
version: 2.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vlad Lebedev
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-
|
|
11
|
+
date: 2025-12-14 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Repeatable clustering of unidimensional data
|
|
14
14
|
email:
|