ckmeans 1.1.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +17 -3
- data/ext/ckmeans/extensions.c +124 -32
- data/lib/ckmeans/clusterer.rb +7 -7
- data/lib/ckmeans/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1d63d8f65d386bf27082e0a65b1ea82a7d150394b1424ab5c2c274e139f91482
+  data.tar.gz: 1f3c4e91fcc9f3bda3d83521cac164ff83e3e5095705cd15420c6278635fc266
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0101cd5f6d5ba925d8f37cc73416008ace4ffce7ea33a437e0189549ede4cbc23b7284de2fe28af181ddf08396b74225c67626e94ce015d54ac14fde17b53bda
+  data.tar.gz: abbcc012e9378ea1fbf15566fd47691bd4cecaaeaf95947c45414dfb7b304db87d803120749aab3ccbf806ab90dd554cce2461f340c348e4f1b820f47be421a2
data/README.md
CHANGED

@@ -18,17 +18,31 @@ gem install ckmeans
 
 ## Usage
 
+### Fixed Cluster Count
+
 ```rb
 # Fixed cluster count
 Ckmeans::Clusterer(data, kmin).clusters
 Ckmedian::Clusterer(data, kmin).clusters
+```
 
-
+### Estimate optimal cluster count within kmin and kmax
+
+```rb
 Ckmeans::Clusterer(data, kmin, kmax).clusters
 Ckmedian::Clusterer(data, kmin, kmax).clusters
+```
+
+### Fast & Stable Estimation of K
 
-
-
+For big collections without many duplicates, use regular estimation.
+For relatively small sets or sets with many duplicates use Gaussian Mixture Model (GMM)-based estimation.
+It works slower but is more resilient for various data patterns like big numbers of duplicates or clusters with different
+numbers of elements.
+
+```rb
+Ckmeans::Clusterer(data, kmin, kmax, :gmm).clusters
+Ckmedian::Clusterer(data, kmin, kmax, :gmm).clusters
 ```
 
 ## License
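The `:gmm` flag shown in the README maps onto the new `kestimate` argument of `Ckmeans::Clusterer#initialize` (see `data/lib/ckmeans/clusterer.rb` below), which defaults to `:fast`. A minimal usage sketch, assuming a plain `.new` call rather than the `Ckmeans::Clusterer(...)` shorthand the README uses:

```rb
require "ckmeans"

data = [1, 1, 2, 2, 2, 5, 9, 9, 10]

# Default fast estimation of k between kmin and kmax
fast = Ckmeans::Clusterer.new(data, 1, 4).clusters

# GMM/BIC-based estimation; slower, intended for duplicate-heavy or uneven data
gmm = Ckmeans::Clusterer.new(data, 1, 4, :gmm).clusters
```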
data/ext/ckmeans/extensions.c
CHANGED

@@ -39,7 +39,6 @@ typedef struct State {
     uint32_t xcount;
     uint32_t kmin;
     uint32_t kmax;
-    bool apply_deviation;
     Arena *arena;
     VectorF *xsorted;
     MatrixF *cost;
@@ -99,8 +98,8 @@ VectorI *prune_candidates(State, RowParams, VectorI*);
 void fill_even_positions(State, RowParams, VectorI*);
 SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
 VectorI *backtrack_sizes(State, VectorI*, uint32_t);
-uint32_t
-
+uint32_t find_koptimal_fast(State);
+uint32_t find_koptimal_gmm(State);
 
 void Init_extensions(void) {
     VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
@@ -128,13 +127,13 @@ VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
 
 VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
 {
-    uint32_t xcount
-    uint32_t kmin
-    uint32_t kmax
-    bool
-    VALUE rb_xsorted
-    size_t capacity
-    Arena *arena
+    uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
+    uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
+    uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
+    bool use_gmm = RTEST(rb_iv_get(self, "@use_gmm"));
+    VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
+    size_t capacity = sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
+    Arena *arena = arena_create(capacity);
 
     if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
 
@@ -150,17 +149,16 @@ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
     }
 
     State state = {
-        .arena
-        .xcount
-        .kmin
-        .kmax
-        .
-        .
-        .
-        .
-        .
-        .
-        .dissim = criteria
+        .arena = arena,
+        .xcount = xcount,
+        .kmin = kmin,
+        .kmax = kmax,
+        .xsorted = xsorted,
+        .cost = cost,
+        .splits = splits,
+        .xsum = xsum,
+        .xsumsq = xsumsq,
+        .dissim = criteria
     };
 
 
@@ -187,7 +185,7 @@ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
         fill_row(state, q, imin, xcount - 1);
     }
 
-    uint32_t koptimal =
+    uint32_t koptimal = use_gmm ? find_koptimal_gmm(state) : find_koptimal_fast(state);
 
     VectorI *sizes = vector_create_i(arena, koptimal);
     backtrack_sizes(state, sizes, koptimal);
@@ -209,7 +207,7 @@ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
     return response;
 }
 
-uint32_t
+uint32_t find_koptimal_fast(State state)
 {
     uint32_t kmin = state.kmin;
     uint32_t kmax = state.kmax;
@@ -256,8 +254,7 @@ uint32_t find_koptimal(State state)
                 loglikelihood += -(xi - mean) * (xi - mean) / (2.0 * variance);
             }
             loglikelihood += npoints * (
-
-                (0.5 * log(PIx2 * variance))
+                log(npoints / (LDouble) xcount) - (0.5 * log(PIx2 * variance))
             );
         } else {
             loglikelihood += npoints * log(1.0 / bin_width / xcount);
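As I read the changed line, the fast estimator's per-cluster log-likelihood term now includes the cluster's mixing proportion alongside the Gaussian normalization. In my own notation (n_k = `npoints`, N = `xcount`, μ and σ² = `mean` and `variance`), a cluster's contribution becomes approximately

$$
\sum_{i \in C_k} -\frac{(x_i - \mu)^2}{2\sigma^2} \;+\; n_k\!\left(\ln\frac{n_k}{N} - \tfrac{1}{2}\ln\!\left(2\pi\sigma^2\right)\right).
$$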
@@ -280,6 +277,101 @@ uint32_t find_koptimal(State state)
     return kopt;
 }
 
+uint32_t find_koptimal_gmm(State state)
+{
+    uint32_t kmin = state.kmin;
+    uint32_t kmax = state.kmax;
+    uint32_t xcount = state.xcount;
+
+    if (kmin > kmax || xcount < 2) {
+        return (kmin < kmax) ? kmin : kmax;
+    }
+
+    Arena *arena = state.arena;
+    VectorF *xsorted = state.xsorted;
+    uint32_t kopt = kmin;
+    LDouble max_bic = 0.0;
+    LDouble log_xcount = log((LDouble) xcount);
+    VectorF *lambda = vector_create_f(arena, kmax);
+    VectorF *mu = vector_create_f(arena, kmax);
+    VectorF *sigma2 = vector_create_f(arena, kmax);
+    VectorF *coeff = vector_create_f(arena, kmax);
+    VectorI *sizes = vector_create_i(arena, kmax);
+
+    for (uint32_t kouter = kmin; kouter <= kmax; ++kouter)
+    {
+        uint32_t ileft = 0;
+        uint32_t iright;
+
+        backtrack_sizes(state, sizes, kouter);
+
+        for (uint32_t k = 0; k < kouter; ++k)
+        {
+            uint32_t size = vector_get_i(sizes, k);
+            vector_set_f(lambda, k, size / (LDouble) xcount);
+            iright = ileft + size - 1;
+            SegmentStats stats = shifted_data_variance(xsorted, ileft, iright);
+
+            vector_set_f(mu, k, stats.mean);
+            vector_set_f(sigma2, k, stats.variance);
+
+            if (stats.variance == 0 || size == 1) {
+                LDouble dmin;
+
+                if (ileft > 0 && iright < xcount - 1) {
+                    LDouble left_diff = vector_get_diff_f(xsorted, ileft, ileft - 1);
+                    LDouble right_diff = vector_get_diff_f(xsorted, iright + 1, iright);
+
+                    dmin = (left_diff < right_diff) ? left_diff : right_diff;
+                } else if (ileft > 0) {
+                    dmin = vector_get_diff_f(xsorted, ileft, ileft - 1);
+                } else {
+                    dmin = vector_get_diff_f(xsorted, iright + 1, iright);
+                }
+
+                if (stats.variance == 0) vector_set_f(sigma2, k, dmin * dmin / 4.0 / 9.0);
+                if (size == 1) vector_set_f(sigma2, k, dmin * dmin);
+            }
+
+            LDouble lambda_k = vector_get_f(lambda, k);
+            LDouble sigma2_k = vector_get_f(sigma2, k);
+            vector_set_f(coeff, k, lambda_k / sqrt(PIx2 * sigma2_k));
+            ileft = iright + 1;
+        }
+
+        LDouble loglikelihood = 0.0;
+
+        for (uint32_t i = 0; i < xcount; ++i)
+        {
+            LDouble L = 0.0;
+            LDouble xi = vector_get_f(xsorted, i);
+
+            for (uint32_t k = 0; k < kouter; ++k)
+            {
+                LDouble coeff_k = vector_get_f(coeff, k);
+                LDouble mu_k = vector_get_f(mu, k);
+                LDouble sigma2_k = vector_get_f(sigma2, k);
+                LDouble x_mu_diff = xi - mu_k;
+                L += coeff_k * exp(- x_mu_diff * x_mu_diff / (2.0 * sigma2_k));
+            }
+            loglikelihood += log(L);
+        }
+
+        LDouble bic = 2 * loglikelihood - (3 * kouter - 1) * log_xcount;
+
+        if (kouter == kmin) {
+            max_bic = bic;
+            kopt = kmin;
+        } else {
+            if (bic > max_bic) {
+                max_bic = bic;
+                kopt = kouter;
+            }
+        }
+    }
+    return kopt;
+}
+
 VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
 {
     MatrixI *splits = state.splits;
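The selection rule in the added `find_koptimal_gmm` is a BIC comparison over the candidate cluster counts. In my own notation (matching `loglikelihood`, `kouter`, and `log_xcount` in the code):

$$
\mathrm{BIC}(k) = 2\ln\hat{L}_k - (3k - 1)\ln N,
$$

where 3k − 1 counts the free parameters of a k-component univariate Gaussian mixture (k means, k variances, k − 1 independent mixing weights), and the candidate k with the largest BIC over kmin..kmax is returned.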
@@ -416,12 +508,12 @@ inline void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
 
 inline void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
 {
-    const uint32_t row
-    const uint32_t imin
-    const uint32_t imax
-    const uint32_t istep
-    MatrixF *const cost
-    MatrixI *const splits
+    const uint32_t row = rparams.row;
+    const uint32_t imin = rparams.imin;
+    const uint32_t imax = rparams.imax;
+    const uint32_t istep = rparams.istep;
+    MatrixF *const cost = state.cost;
+    MatrixI *const splits = state.splits;
     FnDissim *const dissim = state.dissim;
 
     uint32_t optimal_split_idx_prev = 0;
@@ -723,7 +815,7 @@ Arena *arena_create(size_t capacity) {
 }
 
 void *arena_alloc(Arena *arena, size_t size) {
-    size = (size +
+    size = (size + 0xf) & ~0xf;
 
     if (arena->offset + size > arena->capacity) {
         rb_raise(rb_eNoMemError, "Arena Insufficient Capacity");
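The rounding expression completed here, `size = (size + 0xf) & ~0xf;`, pads every arena allocation up to the next multiple of 16 bytes: a request of 17 becomes (17 + 15) & ~15 = 32, while a request that is already aligned, such as 32, is left unchanged.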
data/lib/ckmeans/clusterer.rb
CHANGED

@@ -2,18 +2,18 @@
 
 module Ckmeans
   class Clusterer # rubocop:disable Style/Documentation
-    def initialize(entries, kmin, kmax = kmin, kestimate = :
+    def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
       @xcount = entries.size
 
       raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
       raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
 
-      @kmin
-      @unique_xcount
-      @kmax
-      @xsorted_original
-      @xsorted
-      @
+      @kmin = kmin
+      @unique_xcount = entries.uniq.size
+      @kmax = [@unique_xcount, kmax].min
+      @xsorted_original = entries.sort
+      @xsorted = @xsorted_original.map(&:to_f)
+      @use_gmm = kestimate == :gmm
     end
 
     def clusters
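The initializer caps the requested `kmax` at the number of distinct values (`@kmax = [@unique_xcount, kmax].min`) and enables GMM-based estimation only when `kestimate == :gmm`. A small sketch of the capping behaviour, again assuming a direct `.new` call:

```rb
data = [1, 1, 1, 2, 2, 9]                      # only 3 distinct values
clusterer = Ckmeans::Clusterer.new(data, 1, 5)
# Internally @kmax becomes [3, 5].min = 3, so at most 3 clusters are considered.
```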
data/lib/ckmeans/version.rb
CHANGED
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ckmeans
 version: !ruby/object:Gem::Version
-  version:
+  version: 2.0.0
 platform: ruby
 authors:
 - Vlad Lebedev
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-
+date: 2025-06-09 00:00:00.000000000 Z
 dependencies: []
 description: Repeatable clustering of unidimensional data
 email: