ckmeans 1.0.4 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/README.md +10 -3
- data/ext/ckmeans/extensions.c +87 -20
- data/lib/ckmeans/clusterer.rb +0 -2
- data/lib/ckmeans/version.rb +1 -1
- data/lib/ckmeans.rb +2 -0
- data/lib/ckmedian/clusterer.rb +29 -0
- metadata +7 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 508f78311a643e1fa8e693e4abf1cdf6df4eb06ff09756fa534ff4a514d0f34f
|
|
4
|
+
data.tar.gz: 4ef313387c2e45df4a8afde58e429093023a555a32f4af395a8b79c048a9d98d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ae0f1aff4bd6a78da04123d3728234012d0692ec22396b9529b245c8fa473343314508f053ee02ac876131b243704316948f476840d3c495d4e72eba68e095fd
|
|
7
|
+
data.tar.gz: ab95cfdacac4d9204887d4d5c5a7b85aafa3c869ec4b7a851ae994d8f15ddf096e99cb2d1691e6120a7a6bbe51cbde0524c93ad09b5811b7561c43c60a49256c
|
data/.ruby-version
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
3.2.
|
|
1
|
+
3.2.8
|
data/README.md
CHANGED
|
@@ -19,9 +19,16 @@ gem install ckmeans
|
|
|
19
19
|
## Usage
|
|
20
20
|
|
|
21
21
|
```rb
|
|
22
|
-
|
|
23
|
-
Ckmeans::Clusterer(data, kmin
|
|
24
|
-
|
|
22
|
+
# Fixed cluster count
|
|
23
|
+
Ckmeans::Clusterer(data, kmin).clusters
|
|
24
|
+
Ckmedian::Clusterer(data, kmin).clusters
|
|
25
|
+
|
|
26
|
+
# Estimate optimal cluster count within kmin and kmax
|
|
27
|
+
Ckmeans::Clusterer(data, kmin, kmax).clusters
|
|
28
|
+
Ckmedian::Clusterer(data, kmin, kmax).clusters
|
|
29
|
+
|
|
30
|
+
# Adjust the Bayesian Information Criterion to favor more, smaller clusters (Ckmeans only)
|
|
31
|
+
Ckmeans::Clusterer(data, kmin, kmax, :sensitive).clusters
|
|
25
32
|
```
|
|
26
33
|
|
|
27
34
|
## License
|
data/ext/ckmeans/extensions.c
CHANGED
|
@@ -33,6 +33,8 @@ typedef struct VectorI {
|
|
|
33
33
|
uint32_t *values;
|
|
34
34
|
} VectorI;
|
|
35
35
|
|
|
36
|
+
typedef LDouble (FnDissim)(uint32_t, uint32_t, VectorF*, VectorF*);
|
|
37
|
+
|
|
36
38
|
typedef struct State {
|
|
37
39
|
uint32_t xcount;
|
|
38
40
|
uint32_t kmin;
|
|
@@ -44,6 +46,7 @@ typedef struct State {
|
|
|
44
46
|
MatrixI *splits;
|
|
45
47
|
VectorF *xsum;
|
|
46
48
|
VectorF *xsumsq;
|
|
49
|
+
FnDissim *dissim;
|
|
47
50
|
} State;
|
|
48
51
|
|
|
49
52
|
typedef struct RowParams {
|
|
@@ -59,6 +62,8 @@ typedef struct {
|
|
|
59
62
|
} SegmentStats;
|
|
60
63
|
|
|
61
64
|
VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
|
|
65
|
+
VALUE rb_ckmedian_sorted_group_sizes(VALUE self);
|
|
66
|
+
VALUE rb_sorted_group_sizes(VALUE self, FnDissim*);
|
|
62
67
|
|
|
63
68
|
Arena *arena_create(size_t);
|
|
64
69
|
void *arena_alloc(Arena*, size_t);
|
|
@@ -85,7 +90,8 @@ uint32_t vector_get_i(VectorI*, uint32_t offset);
|
|
|
85
90
|
void vector_downsize_i(VectorI*, uint32_t);
|
|
86
91
|
void vector_inspect_i(VectorI*);
|
|
87
92
|
|
|
88
|
-
LDouble
|
|
93
|
+
LDouble dissimilarity_l2(uint32_t, uint32_t, VectorF*, VectorF*);
|
|
94
|
+
LDouble dissimilarity_l1(uint32_t, uint32_t, VectorF*, VectorF*);
|
|
89
95
|
void fill_row(State, uint32_t, uint32_t, uint32_t);
|
|
90
96
|
void smawk(State, RowParams, VectorI*);
|
|
91
97
|
void find_min_from_candidates(State, RowParams, VectorI*);
|
|
@@ -95,11 +101,15 @@ SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
|
|
|
95
101
|
VectorI *backtrack_sizes(State, VectorI*, uint32_t);
|
|
96
102
|
uint32_t find_koptimal(State);
|
|
97
103
|
|
|
104
|
+
|
|
98
105
|
void Init_extensions(void) {
|
|
99
|
-
VALUE ckmeans_module
|
|
100
|
-
VALUE
|
|
106
|
+
VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
|
|
107
|
+
VALUE ckmedian_module = rb_const_get(rb_cObject, rb_intern("Ckmedian"));
|
|
108
|
+
VALUE ckmeans_clusterer = rb_const_get(ckmeans_module, rb_intern("Clusterer"));
|
|
109
|
+
VALUE ckmedian_clusterer = rb_const_get(ckmedian_module, rb_intern("Clusterer"));
|
|
101
110
|
|
|
102
|
-
rb_define_private_method(
|
|
111
|
+
rb_define_private_method(ckmeans_clusterer, "sorted_group_sizes", rb_ckmeans_sorted_group_sizes, 0);
|
|
112
|
+
rb_define_private_method(ckmedian_clusterer, "sorted_group_sizes", rb_ckmedian_sorted_group_sizes, 0);
|
|
103
113
|
}
|
|
104
114
|
|
|
105
115
|
# define ARENA_MIN_CAPACITY 100
|
|
@@ -107,6 +117,16 @@ void Init_extensions(void) {
|
|
|
107
117
|
# define PIx2 (M_PI * 2.0)
|
|
108
118
|
|
|
109
119
|
VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
|
|
120
|
+
{
|
|
121
|
+
return rb_sorted_group_sizes(self, dissimilarity_l2);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
|
|
125
|
+
{
|
|
126
|
+
return rb_sorted_group_sizes(self, dissimilarity_l1);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
|
|
110
130
|
{
|
|
111
131
|
uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
|
|
112
132
|
uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
|
|
@@ -139,7 +159,8 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
|
|
|
139
159
|
.cost = cost,
|
|
140
160
|
.splits = splits,
|
|
141
161
|
.xsum = xsum,
|
|
142
|
-
.xsumsq = xsumsq
|
|
162
|
+
.xsumsq = xsumsq,
|
|
163
|
+
.dissim = criteria
|
|
143
164
|
};
|
|
144
165
|
|
|
145
166
|
|
|
@@ -157,7 +178,7 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
|
|
|
157
178
|
|
|
158
179
|
vector_set_f(xsum, i, xsum_prev + diff);
|
|
159
180
|
vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
|
|
160
|
-
matrix_set_f(cost, 0, i,
|
|
181
|
+
matrix_set_f(cost, 0, i, criteria(0, i, xsum, xsumsq));
|
|
161
182
|
matrix_set_i(splits, 0, i, 0);
|
|
162
183
|
}
|
|
163
184
|
|
|
@@ -336,7 +357,7 @@ void smawk(State state, RowParams rparams, VectorI *split_candidates)
|
|
|
336
357
|
}
|
|
337
358
|
}
|
|
338
359
|
|
|
339
|
-
void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
|
|
360
|
+
inline void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
|
|
340
361
|
{
|
|
341
362
|
uint32_t row = rparams.row;
|
|
342
363
|
uint32_t imin = rparams.imin;
|
|
@@ -345,9 +366,10 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
|
|
|
345
366
|
uint32_t n = split_candidates->size;
|
|
346
367
|
uint32_t istepx2 = istep * 2;
|
|
347
368
|
uint32_t jl = vector_get_i(split_candidates, 0);
|
|
348
|
-
VectorF *xsum = state.xsum;
|
|
349
|
-
VectorF *xsumsq = state.xsumsq;
|
|
350
|
-
MatrixI *splits = state.splits;
|
|
369
|
+
VectorF *const xsum = state.xsum;
|
|
370
|
+
VectorF *const xsumsq = state.xsumsq;
|
|
371
|
+
MatrixI *const splits = state.splits;
|
|
372
|
+
FnDissim *const dissim = state.dissim;
|
|
351
373
|
|
|
352
374
|
for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
|
|
353
375
|
while (vector_get_i(split_candidates, r) < jl) r++;
|
|
@@ -356,7 +378,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
|
|
|
356
378
|
uint32_t cost_base_row = row - 1;
|
|
357
379
|
uint32_t cost_base_col = rcandidate - 1;
|
|
358
380
|
LDouble cost =
|
|
359
|
-
matrix_get_f(state.cost, cost_base_row, cost_base_col) +
|
|
381
|
+
matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissim(rcandidate, i, xsum, xsumsq);
|
|
360
382
|
|
|
361
383
|
matrix_set_f(state.cost, row, i, cost);
|
|
362
384
|
matrix_set_i(state.splits, row, i, rcandidate);
|
|
@@ -367,7 +389,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
|
|
|
367
389
|
: vector_get_i(split_candidates, n - 1);
|
|
368
390
|
|
|
369
391
|
uint32_t jmax = jh < i ? jh : i;
|
|
370
|
-
LDouble sjimin =
|
|
392
|
+
LDouble sjimin = dissim(jmax, i, xsum, xsumsq);
|
|
371
393
|
|
|
372
394
|
for (++r; r < n && vector_get_i(split_candidates, r) <= jmax; r++) {
|
|
373
395
|
uint32_t jabs = vector_get_i(split_candidates, r);
|
|
@@ -376,7 +398,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
|
|
|
376
398
|
if (jabs < matrix_get_i(splits, row - 1, i)) continue;
|
|
377
399
|
|
|
378
400
|
LDouble cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
|
|
379
|
-
LDouble sj = cost_base +
|
|
401
|
+
LDouble sj = cost_base + dissim(jabs, i, xsum, xsumsq);
|
|
380
402
|
LDouble cost_prev = matrix_get_f(state.cost, row, i);
|
|
381
403
|
|
|
382
404
|
if (sj <= cost_prev) {
|
|
@@ -392,7 +414,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
|
|
|
392
414
|
}
|
|
393
415
|
}
|
|
394
416
|
|
|
395
|
-
void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
|
|
417
|
+
inline void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
|
|
396
418
|
{
|
|
397
419
|
const uint32_t row = rparams.row;
|
|
398
420
|
const uint32_t imin = rparams.imin;
|
|
@@ -400,6 +422,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
|
|
|
400
422
|
const uint32_t istep = rparams.istep;
|
|
401
423
|
MatrixF *const cost = state.cost;
|
|
402
424
|
MatrixI *const splits = state.splits;
|
|
425
|
+
FnDissim *const dissim = state.dissim;
|
|
403
426
|
|
|
404
427
|
uint32_t optimal_split_idx_prev = 0;
|
|
405
428
|
|
|
@@ -408,7 +431,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
|
|
|
408
431
|
const uint32_t optimal_split_idx = optimal_split_idx_prev;
|
|
409
432
|
const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
|
|
410
433
|
const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
|
|
411
|
-
const LDouble added_cost =
|
|
434
|
+
const LDouble added_cost = dissim(optimal_split, i, state.xsum, state.xsumsq);
|
|
412
435
|
|
|
413
436
|
matrix_set_f(cost, row, i, cost_prev + added_cost);
|
|
414
437
|
matrix_set_i(splits, row, i, optimal_split);
|
|
@@ -421,7 +444,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
|
|
|
421
444
|
if (split > i) break;
|
|
422
445
|
|
|
423
446
|
LDouble split_cost =
|
|
424
|
-
matrix_get_f(cost, row - 1, split - 1) +
|
|
447
|
+
matrix_get_f(cost, row - 1, split - 1) + dissim(split, i, state.xsum, state.xsumsq);
|
|
425
448
|
|
|
426
449
|
if (split_cost > matrix_get_f(cost, row, i)) continue;
|
|
427
450
|
|
|
@@ -432,7 +455,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
|
|
|
432
455
|
}
|
|
433
456
|
}
|
|
434
457
|
|
|
435
|
-
VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
|
|
458
|
+
inline VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
|
|
436
459
|
{
|
|
437
460
|
uint32_t imin = rparams.imin;
|
|
438
461
|
uint32_t row = rparams.row;
|
|
@@ -445,6 +468,7 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
|
|
|
445
468
|
uint32_t left = 0;
|
|
446
469
|
uint32_t right = 0;
|
|
447
470
|
VectorI *pruned = vector_dup_i(split_candidates, state.arena);
|
|
471
|
+
FnDissim *const dissim = state.dissim;
|
|
448
472
|
|
|
449
473
|
while (m > n)
|
|
450
474
|
{
|
|
@@ -452,9 +476,9 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
|
|
|
452
476
|
uint32_t j = vector_get_i(pruned, right);
|
|
453
477
|
uint32_t jnext = vector_get_i(pruned, right + 1);
|
|
454
478
|
LDouble sl =
|
|
455
|
-
matrix_get_f(state.cost, row - 1, j - 1) +
|
|
479
|
+
matrix_get_f(state.cost, row - 1, j - 1) + dissim(j, i, state.xsum, state.xsumsq);
|
|
456
480
|
LDouble snext =
|
|
457
|
-
matrix_get_f(state.cost, row - 1, jnext - 1) +
|
|
481
|
+
matrix_get_f(state.cost, row - 1, jnext - 1) + dissim(jnext, i, state.xsum, state.xsumsq);
|
|
458
482
|
|
|
459
483
|
if ((sl < snext) && (left < n - 1)) {
|
|
460
484
|
vector_set_i(pruned, left, j);
|
|
@@ -484,7 +508,8 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
|
|
|
484
508
|
return pruned;
|
|
485
509
|
}
|
|
486
510
|
|
|
487
|
-
|
|
511
|
+
/* L2 aka Euclidean aka Mean dissimilarity criteria */
|
|
512
|
+
inline LDouble dissimilarity_l2(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq) {
|
|
488
513
|
LDouble sji = 0.0;
|
|
489
514
|
|
|
490
515
|
if (j >= i) return sji;
|
|
@@ -501,6 +526,48 @@ inline LDouble dissimilarity(uint32_t j, uint32_t i, VectorF *restrict xsum, Vec
|
|
|
501
526
|
return (sji > 0) ? sji : 0.0;
|
|
502
527
|
}
|
|
503
528
|
|
|
529
|
+
/* L1 aka Manhattan aka Median dissimilarity criteria */
|
|
530
|
+
inline LDouble dissimilarity_l1(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict _xsumsq)
|
|
531
|
+
{
|
|
532
|
+
LDouble sji = 0.0;
|
|
533
|
+
|
|
534
|
+
if (j >= i) return sji;
|
|
535
|
+
|
|
536
|
+
if (j > 0) {
|
|
537
|
+
uint32_t median_idx = (i + j) >> 1;
|
|
538
|
+
|
|
539
|
+
if (((i - j + 1) % 2) == 1) {
|
|
540
|
+
sji =
|
|
541
|
+
- vector_get_f(xsum, median_idx - 1)
|
|
542
|
+
+ vector_get_f(xsum, j - 1)
|
|
543
|
+
+ vector_get_f(xsum, i)
|
|
544
|
+
- vector_get_f(xsum, median_idx);
|
|
545
|
+
} else {
|
|
546
|
+
sji =
|
|
547
|
+
- vector_get_f(xsum, median_idx)
|
|
548
|
+
+ vector_get_f(xsum, j - 1)
|
|
549
|
+
+ vector_get_f(xsum, i)
|
|
550
|
+
- vector_get_f(xsum, median_idx);
|
|
551
|
+
}
|
|
552
|
+
} else { // j == 0
|
|
553
|
+
uint32_t median_idx = i >> 1;
|
|
554
|
+
|
|
555
|
+
if (((i + 1) % 2) == 1) {
|
|
556
|
+
sji =
|
|
557
|
+
- vector_get_f(xsum, median_idx - 1)
|
|
558
|
+
+ vector_get_f(xsum, i)
|
|
559
|
+
- vector_get_f(xsum, median_idx);
|
|
560
|
+
} else {
|
|
561
|
+
sji =
|
|
562
|
+
- vector_get_f(xsum, median_idx)
|
|
563
|
+
+ vector_get_f(xsum, i)
|
|
564
|
+
- vector_get_f(xsum, median_idx);
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
return (sji < 0) ? 0.0 : sji;
|
|
569
|
+
}
|
|
570
|
+
|
|
504
571
|
inline VectorF *vector_create_f(Arena *arena, uint32_t size) {
|
|
505
572
|
VectorF *v;
|
|
506
573
|
|
data/lib/ckmeans/clusterer.rb
CHANGED
data/lib/ckmeans/version.rb
CHANGED
data/lib/ckmeans.rb
CHANGED
data/lib/ckmedian/clusterer.rb
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ckmedian
|
|
4
|
+
class Clusterer # rubocop:disable Style/Documentation
|
|
5
|
+
def initialize(entries, kmin, kmax = kmin)
|
|
6
|
+
@xcount = entries.size
|
|
7
|
+
|
|
8
|
+
raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
|
|
9
|
+
raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
|
|
10
|
+
|
|
11
|
+
@kmin = kmin
|
|
12
|
+
@unique_xcount = entries.uniq.size
|
|
13
|
+
@kmax = [@unique_xcount, kmax].min
|
|
14
|
+
@xsorted_original = entries.sort
|
|
15
|
+
@xsorted = @xsorted_original.map(&:to_f)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def clusters
|
|
19
|
+
@clusters ||=
|
|
20
|
+
if @unique_xcount <= 1
|
|
21
|
+
[@xsorted_original]
|
|
22
|
+
else
|
|
23
|
+
sorted_group_sizes.each_with_object([]) do |size, groups|
|
|
24
|
+
groups << @xsorted_original.shift(size)
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
metadata
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ckmeans
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.4
|
|
4
|
+
version: 1.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vlad Lebedev
|
|
8
|
+
autorequire:
|
|
8
9
|
bindir: exe
|
|
9
10
|
cert_chain: []
|
|
10
|
-
date: 2025-05-
|
|
11
|
+
date: 2025-05-23 00:00:00.000000000 Z
|
|
11
12
|
dependencies: []
|
|
12
13
|
description: Repeatable clustering of unidimensional data
|
|
13
14
|
email:
|
|
@@ -32,6 +33,7 @@ files:
|
|
|
32
33
|
- lib/ckmeans.rb
|
|
33
34
|
- lib/ckmeans/clusterer.rb
|
|
34
35
|
- lib/ckmeans/version.rb
|
|
36
|
+
- lib/ckmedian/clusterer.rb
|
|
35
37
|
- sig/ckmeans.rbs
|
|
36
38
|
homepage: https://github.com/vlebedeff/rb-ckmeans
|
|
37
39
|
licenses:
|
|
@@ -41,6 +43,7 @@ metadata:
|
|
|
41
43
|
homepage_uri: https://github.com/vlebedeff/rb-ckmeans
|
|
42
44
|
source_code_uri: https://github.com/vlebedeff/rb-ckmeans
|
|
43
45
|
changelog_uri: https://github.com/vlebedeff/rb-ckmeans/blob/main/CHANGELOG.md
|
|
46
|
+
post_install_message:
|
|
44
47
|
rdoc_options: []
|
|
45
48
|
require_paths:
|
|
46
49
|
- lib
|
|
@@ -55,7 +58,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
55
58
|
- !ruby/object:Gem::Version
|
|
56
59
|
version: '0'
|
|
57
60
|
requirements: []
|
|
58
|
-
rubygems_version: 3.
|
|
61
|
+
rubygems_version: 3.4.19
|
|
62
|
+
signing_key:
|
|
59
63
|
specification_version: 4
|
|
60
64
|
summary: Ruby implementation of Ckmeans.1d.dp
|
|
61
65
|
test_files: []
|