ckmeans 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +3 -39
- data/ext/ckmeans/extensions.c +45 -74
- data/lib/ckmeans/clusterer.rb +1 -297
- data/lib/ckmeans/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: be110daab8039e8a76ccbc68808120caf3aa7b189e107d6bced6d3519e8d917c
|
|
4
|
+
data.tar.gz: 6647bce619e675a4e24f4a17ec8e5aee23280cd97d89b29433da56e122f3c932
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 54c6292bbae43afdbb4c618983c9a602ce68fec83239cf3a2f77c42fb8544fb45801d71910796db49cdf872a6151f616a711bce9c1fe537a202cae476bbb995b
|
|
7
|
+
data.tar.gz: 82d39fb2a5870a92ab579f5342de64e69a170c1d9ef31e9d5e121fb06902d3a3dba1b6a5cd4b56b432524e991e53221b78ebee7c88bbbbea868ee63ccb256d0e
|
data/.rubocop_todo.yml
CHANGED
|
@@ -1,49 +1,13 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2025-04-
|
|
3
|
+
# on 2025-04-24 06:16:37 UTC using RuboCop version 1.75.1.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
|
8
8
|
|
|
9
|
-
# Offense count:
|
|
10
|
-
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes.
|
|
11
|
-
Metrics/AbcSize:
|
|
12
|
-
Max: 95
|
|
13
|
-
|
|
14
|
-
# Offense count: 2
|
|
9
|
+
# Offense count: 1
|
|
15
10
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
16
11
|
# AllowedMethods: refine
|
|
17
12
|
Metrics/BlockLength:
|
|
18
|
-
Max:
|
|
19
|
-
|
|
20
|
-
# Offense count: 3
|
|
21
|
-
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
|
22
|
-
Metrics/CyclomaticComplexity:
|
|
23
|
-
Max: 10
|
|
24
|
-
|
|
25
|
-
# Offense count: 6
|
|
26
|
-
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
27
|
-
Metrics/MethodLength:
|
|
28
|
-
Max: 48
|
|
29
|
-
|
|
30
|
-
# Offense count: 3
|
|
31
|
-
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
|
32
|
-
Metrics/PerceivedComplexity:
|
|
33
|
-
Max: 13
|
|
34
|
-
|
|
35
|
-
# Offense count: 12
|
|
36
|
-
# Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
|
|
37
|
-
# AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
|
|
38
|
-
Naming/MethodParameterName:
|
|
39
|
-
Exclude:
|
|
40
|
-
- 'lib/ckmeans/clusterer.rb'
|
|
41
|
-
|
|
42
|
-
# Offense count: 5
|
|
43
|
-
# This cop supports unsafe autocorrection (--autocorrect-all).
|
|
44
|
-
# Configuration parameters: EnforcedStyle, AllowedMethods, AllowedPatterns.
|
|
45
|
-
# SupportedStyles: predicate, comparison
|
|
46
|
-
Style/NumericPredicate:
|
|
47
|
-
Exclude:
|
|
48
|
-
- 'spec/**/*'
|
|
49
|
-
- 'lib/ckmeans/clusterer.rb'
|
|
13
|
+
Max: 26
|
data/ext/ckmeans/extensions.c
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#include <stdio.h>
|
|
2
|
-
#include <assert.h>
|
|
3
2
|
#include <math.h>
|
|
3
|
+
#include <string.h>
|
|
4
4
|
#include "ruby.h"
|
|
5
5
|
|
|
6
6
|
typedef struct Arena {
|
|
@@ -60,7 +60,6 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
|
|
|
60
60
|
|
|
61
61
|
Arena *arena_create(uint32_t);
|
|
62
62
|
void *arena_alloc(Arena*, uint32_t);
|
|
63
|
-
void arena_rewind(Arena*);
|
|
64
63
|
void arena_destroy(Arena*);
|
|
65
64
|
|
|
66
65
|
MatrixF *matrix_create_f(Arena*, uint32_t, uint32_t);
|
|
@@ -117,14 +116,11 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
|
|
|
117
116
|
bool apply_deviation = RTEST(rb_apply_bic_deviation);
|
|
118
117
|
Arena *arena = arena_create(sizeof(int) * xcount * kmax * ALLOCATION_FACTOR);
|
|
119
118
|
|
|
120
|
-
if (arena == NULL)
|
|
121
|
-
return Qnil;
|
|
122
|
-
}
|
|
119
|
+
if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
|
|
123
120
|
|
|
124
121
|
MatrixF *cost = matrix_create_f(arena, kmax, xcount);
|
|
125
122
|
MatrixI *splits = matrix_create_i(arena, kmax, xcount);
|
|
126
123
|
VectorF *xsorted = vector_create_f(arena, xcount);
|
|
127
|
-
/* TODO: pack sums into one vector of pairs */
|
|
128
124
|
VectorF *xsum = vector_create_f(arena, xcount);
|
|
129
125
|
VectorF *xsumsq = vector_create_f(arena, xcount);
|
|
130
126
|
|
|
@@ -193,11 +189,11 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
|
|
|
193
189
|
|
|
194
190
|
uint32_t find_koptimal(State state)
|
|
195
191
|
{
|
|
196
|
-
uint32_t kmin
|
|
197
|
-
uint32_t kmax
|
|
198
|
-
uint32_t xcount
|
|
199
|
-
uint32_t kopt
|
|
200
|
-
uint32_t xindex_max
|
|
192
|
+
uint32_t kmin = state.kmin;
|
|
193
|
+
uint32_t kmax = state.kmax;
|
|
194
|
+
uint32_t xcount = state.xcount;
|
|
195
|
+
uint32_t kopt = kmin;
|
|
196
|
+
uint32_t xindex_max = state.xcount - 1;
|
|
201
197
|
VectorF *xsorted = state.xsorted;
|
|
202
198
|
long double x0 = vector_get_f(xsorted, 0);
|
|
203
199
|
long double xn = vector_get_f(xsorted, xindex_max);
|
|
@@ -274,6 +270,7 @@ VectorI *backtrack_sizes(State state, uint32_t k)
|
|
|
274
270
|
left = matrix_get_i(splits, i, right);
|
|
275
271
|
vector_set_i(sizes, i, right - left + 1);
|
|
276
272
|
}
|
|
273
|
+
// Special case outside of the loop removing the need for conditionals
|
|
277
274
|
left = matrix_get_i(splits, 0, right);
|
|
278
275
|
vector_set_i(sizes, 0, right - left + 1);
|
|
279
276
|
|
|
@@ -282,7 +279,7 @@ VectorI *backtrack_sizes(State state, uint32_t k)
|
|
|
282
279
|
|
|
283
280
|
SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t right)
|
|
284
281
|
{
|
|
285
|
-
const uint32_t n
|
|
282
|
+
const uint32_t n = right - left + 1;
|
|
286
283
|
long double sum = 0.0;
|
|
287
284
|
long double sumsq = 0.0;
|
|
288
285
|
SegmentStats stats = { .mean = 0.0, .variance = 0.0 };
|
|
@@ -306,7 +303,8 @@ SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t rig
|
|
|
306
303
|
return stats;
|
|
307
304
|
}
|
|
308
305
|
|
|
309
|
-
void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax)
|
|
306
|
+
void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax)
|
|
307
|
+
{
|
|
310
308
|
uint32_t size = imax - q + 1;
|
|
311
309
|
VectorI *split_candidates = vector_create_i(state.arena, size);
|
|
312
310
|
for (uint32_t i = 0; i < size; i++) {
|
|
@@ -316,7 +314,8 @@ void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax) {
|
|
|
316
314
|
smawk(state, rparams, split_candidates);
|
|
317
315
|
}
|
|
318
316
|
|
|
319
|
-
void smawk(State state, RowParams rparams, VectorI *split_candidates)
|
|
317
|
+
void smawk(State state, RowParams rparams, VectorI *split_candidates)
|
|
318
|
+
{
|
|
320
319
|
const uint32_t imin = rparams.imin;
|
|
321
320
|
const uint32_t imax = rparams.imax;
|
|
322
321
|
const uint32_t istep = rparams.istep;
|
|
@@ -326,9 +325,9 @@ void smawk(State state, RowParams rparams, VectorI *split_candidates) {
|
|
|
326
325
|
} else {
|
|
327
326
|
VectorI *odd_candidates = prune_candidates(state, rparams, split_candidates);
|
|
328
327
|
/* printf("PRUNED\t"); vector_inspect_i(odd_candidates); */
|
|
329
|
-
uint32_t istepx2
|
|
330
|
-
uint32_t imin_odd
|
|
331
|
-
uint32_t imax_odd
|
|
328
|
+
uint32_t istepx2 = istep * 2;
|
|
329
|
+
uint32_t imin_odd = imin + istep;
|
|
330
|
+
uint32_t imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2);
|
|
332
331
|
RowParams rparams_odd = { .row = rparams.row, .imin = imin_odd, .imax = imax_odd, .istep = istepx2 };
|
|
333
332
|
|
|
334
333
|
smawk(state, rparams_odd, odd_candidates);
|
|
@@ -345,9 +344,9 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
|
|
|
345
344
|
uint32_t n = split_candidates->nvalues;
|
|
346
345
|
uint32_t istepx2 = istep * 2;
|
|
347
346
|
uint32_t jl = vector_get_i(split_candidates, 0);
|
|
348
|
-
VectorF *xsum
|
|
349
|
-
VectorF *xsumsq
|
|
350
|
-
MatrixI *splits
|
|
347
|
+
VectorF *xsum = state.xsum;
|
|
348
|
+
VectorF *xsumsq = state.xsumsq;
|
|
349
|
+
MatrixI *splits = state.splits;
|
|
351
350
|
|
|
352
351
|
for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
|
|
353
352
|
while (vector_get_i(split_candidates, r) < jl) r++;
|
|
@@ -394,10 +393,10 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
|
|
|
394
393
|
|
|
395
394
|
void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
|
|
396
395
|
{
|
|
397
|
-
const uint32_t row
|
|
398
|
-
const uint32_t imin
|
|
399
|
-
const uint32_t imax
|
|
400
|
-
const uint32_t istep
|
|
396
|
+
const uint32_t row = rparams.row;
|
|
397
|
+
const uint32_t imin = rparams.imin;
|
|
398
|
+
const uint32_t imax = rparams.imax;
|
|
399
|
+
const uint32_t istep = rparams.istep;
|
|
401
400
|
MatrixF *const cost = state.cost;
|
|
402
401
|
MatrixI *const splits = state.splits;
|
|
403
402
|
|
|
@@ -408,7 +407,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
|
|
|
408
407
|
const uint32_t optimal_split_idx = optimal_split_idx_prev;
|
|
409
408
|
const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
|
|
410
409
|
const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
|
|
411
|
-
const long double added_cost
|
|
410
|
+
const long double added_cost = dissimilarity(optimal_split, i, state.xsum, state.xsumsq);
|
|
412
411
|
|
|
413
412
|
matrix_set_f(cost, row, i, cost_prev + added_cost);
|
|
414
413
|
matrix_set_i(splits, row, i, optimal_split);
|
|
@@ -434,39 +433,39 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
|
|
|
434
433
|
|
|
435
434
|
VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
|
|
436
435
|
{
|
|
437
|
-
uint32_t
|
|
438
|
-
uint32_t
|
|
436
|
+
uint32_t imin = rparams.imin;
|
|
437
|
+
uint32_t row = rparams.row;
|
|
438
|
+
uint32_t istep = rparams.istep;
|
|
439
|
+
uint32_t n = ((rparams.imax - imin) / istep) + 1;
|
|
440
|
+
uint32_t m = split_candidates->nvalues;
|
|
439
441
|
|
|
440
442
|
if (n >= m) return split_candidates;
|
|
441
443
|
|
|
442
|
-
|
|
444
|
+
uint32_t left = 0;
|
|
443
445
|
uint32_t right = 0;
|
|
444
446
|
VectorI *pruned = vector_dup_i(split_candidates, state.arena);
|
|
445
447
|
|
|
446
448
|
while (m > n)
|
|
447
449
|
{
|
|
448
|
-
uint32_t
|
|
449
|
-
uint32_t
|
|
450
|
-
uint32_t
|
|
451
|
-
uint32_t jnext = vector_get_i(pruned, right + 1);
|
|
450
|
+
uint32_t i = imin + left * istep;
|
|
451
|
+
uint32_t j = vector_get_i(pruned, right);
|
|
452
|
+
uint32_t jnext = vector_get_i(pruned, right + 1);
|
|
452
453
|
long double sl =
|
|
453
|
-
matrix_get_f(state.cost,
|
|
454
|
+
matrix_get_f(state.cost, row - 1, j - 1) + dissimilarity(j, i, state.xsum, state.xsumsq);
|
|
454
455
|
long double snext =
|
|
455
|
-
matrix_get_f(state.cost,
|
|
456
|
+
matrix_get_f(state.cost, row - 1, jnext - 1) + dissimilarity(jnext, i, state.xsum, state.xsumsq);
|
|
456
457
|
|
|
457
|
-
if ((sl < snext) && (
|
|
458
|
+
if ((sl < snext) && (left < n - 1)) {
|
|
459
|
+
vector_set_i(pruned, left, j);
|
|
458
460
|
left++;
|
|
459
461
|
right++;
|
|
460
|
-
|
|
461
|
-
} else if ((sl < snext) && (p == n - 1)) {
|
|
462
|
+
} else if ((sl < snext) && (left == n - 1)) {
|
|
462
463
|
right++;
|
|
463
464
|
m--;
|
|
464
465
|
vector_set_i(pruned, right, j);
|
|
465
466
|
} else {
|
|
466
|
-
if (
|
|
467
|
-
|
|
468
|
-
vector_set_i(pruned, right, vector_get_i(pruned, left));
|
|
469
|
-
left--;
|
|
467
|
+
if (left > 0) {
|
|
468
|
+
vector_set_i(pruned, right, vector_get_i(pruned, --left));
|
|
470
469
|
} else {
|
|
471
470
|
right++;
|
|
472
471
|
}
|
|
@@ -475,8 +474,7 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
|
|
|
475
474
|
}
|
|
476
475
|
}
|
|
477
476
|
|
|
478
|
-
for (uint32_t i = left
|
|
479
|
-
/* TODO: extract `vector_setcpy_T` */
|
|
477
|
+
for (uint32_t i = left; i < m; i++) {
|
|
480
478
|
vector_set_i(pruned, i, vector_get_i(pruned, right++));
|
|
481
479
|
}
|
|
482
480
|
|
|
@@ -491,10 +489,9 @@ long double dissimilarity(uint32_t j, uint32_t i, VectorF *xsum, VectorF *xsumsq
|
|
|
491
489
|
if (j >= i) return sji;
|
|
492
490
|
|
|
493
491
|
if (j > 0) {
|
|
494
|
-
|
|
495
|
-
long double segment_sum = vector_get_diff_f(xsum, i, j - 1);
|
|
492
|
+
long double segment_diff = vector_get_diff_f(xsum, i, j - 1);
|
|
496
493
|
uint32_t segment_size = i - j + 1;
|
|
497
|
-
sji
|
|
494
|
+
sji = vector_get_diff_f(xsumsq, i, j - 1) - (segment_diff * segment_diff / segment_size);
|
|
498
495
|
} else {
|
|
499
496
|
long double xsumi = vector_get_f(xsum, i);
|
|
500
497
|
sji = vector_get_f(xsumsq, i) - (xsumi * xsumi / (i + 1));
|
|
@@ -527,29 +524,20 @@ VectorI *vector_dup_i(VectorI *v, Arena *arena)
|
|
|
527
524
|
{
|
|
528
525
|
VectorI *vdup = vector_create_i(arena, v->nvalues);
|
|
529
526
|
|
|
530
|
-
|
|
531
|
-
for (uint32_t i = 0; i < v->nvalues; i++) {
|
|
532
|
-
vector_set_i(vdup, i, vector_get_i(v, i));
|
|
533
|
-
}
|
|
527
|
+
memcpy(vdup->values, v->values, sizeof(*(v->values)) * v->nvalues);
|
|
534
528
|
|
|
535
529
|
return vdup;
|
|
536
530
|
}
|
|
537
531
|
|
|
538
532
|
void vector_set_f(VectorF *v, uint32_t offset, long double value) {
|
|
539
|
-
assert(offset < v->nvalues && "[vector_set_f] element index should be less than nvalues");
|
|
540
|
-
|
|
541
533
|
*(v->values + offset) = value;
|
|
542
534
|
}
|
|
543
535
|
|
|
544
536
|
void vector_set_i(VectorI *v, uint32_t offset, uint32_t value) {
|
|
545
|
-
assert(offset < v->nvalues && "[vector_set_i] element index should be less than nvalues");
|
|
546
|
-
|
|
547
537
|
*(v->values + offset) = value;
|
|
548
538
|
}
|
|
549
539
|
|
|
550
540
|
uint32_t vector_get_i(VectorI *v, uint32_t offset) {
|
|
551
|
-
assert(offset < v->nvalues && "[vector_get_i] element index should be less than nvalues");
|
|
552
|
-
|
|
553
541
|
return *(v->values + offset);
|
|
554
542
|
}
|
|
555
543
|
|
|
@@ -564,15 +552,10 @@ void vector_inspect_i(VectorI *v) {
|
|
|
564
552
|
}
|
|
565
553
|
|
|
566
554
|
long double vector_get_f(VectorF *v, uint32_t offset) {
|
|
567
|
-
assert(offset < v->nvalues && "[vector_get_f] element index should be less than nvalues");
|
|
568
|
-
|
|
569
555
|
return *(v->values + offset);
|
|
570
556
|
}
|
|
571
557
|
|
|
572
558
|
long double vector_get_diff_f(VectorF *v, uint32_t i, uint32_t j) {
|
|
573
|
-
assert(i < v->nvalues && "[vector_get_diff_f] i should be less than nvalues");
|
|
574
|
-
assert(j < v->nvalues && "[vector_get_diff_f] j should be less than nvalues");
|
|
575
|
-
|
|
576
559
|
return *(v->values + i) - *(v->values + j);
|
|
577
560
|
}
|
|
578
561
|
|
|
@@ -605,17 +588,11 @@ MatrixI *matrix_create_i(Arena *arena, uint32_t nrows, uint32_t ncols) {
|
|
|
605
588
|
}
|
|
606
589
|
|
|
607
590
|
void matrix_set_f(MatrixF *m, uint32_t i, uint32_t j, long double value) {
|
|
608
|
-
assert(i < m->nrows && "[matrix_set_f] row offset should be less than nrows");
|
|
609
|
-
assert(j < m->cols && "[matrix_set_f] col offset should be less than ncols");
|
|
610
|
-
|
|
611
591
|
uint32_t offset = i * m->ncols + j;
|
|
612
592
|
*(m->values + offset) = value;
|
|
613
593
|
}
|
|
614
594
|
|
|
615
595
|
long double matrix_get_f(MatrixF *m, uint32_t i, uint32_t j) {
|
|
616
|
-
assert(i < m->nrows && "[matrix_get_f] row offset should be less than nrows");
|
|
617
|
-
assert(j < m->cols && "[matrix_get_f] col offset should be less than ncols");
|
|
618
|
-
|
|
619
596
|
uint32_t offset = i * m->ncols + j;
|
|
620
597
|
return *(m->values + offset);
|
|
621
598
|
}
|
|
@@ -640,17 +617,11 @@ void matrix_inspect_i(MatrixI *m) {
|
|
|
640
617
|
}
|
|
641
618
|
|
|
642
619
|
void matrix_set_i(MatrixI *m, uint32_t i, uint32_t j, uint32_t value) {
|
|
643
|
-
assert(i < m->nrows && "[matrix_set_i] row offset should be less than nrows");
|
|
644
|
-
assert(j < m->cols && "[matrix_set_i] col offset should be less than ncols");
|
|
645
|
-
|
|
646
620
|
uint32_t offset = i * m->ncols + j;
|
|
647
621
|
*(m->values + offset) = value;
|
|
648
622
|
}
|
|
649
623
|
|
|
650
624
|
uint32_t matrix_get_i(MatrixI *m, uint32_t i, uint32_t j) {
|
|
651
|
-
assert(i < m->nrows && "[matrix_get_i] row offset should be less than nrows");
|
|
652
|
-
assert(j < m->cols && "[matrix_get_i] col offset should be less than ncols");
|
|
653
|
-
|
|
654
625
|
uint32_t offset = i * m->ncols + j;
|
|
655
626
|
return *(m->values + offset);
|
|
656
627
|
}
|
|
@@ -687,7 +658,7 @@ void *arena_alloc(Arena *arena, uint32_t size) {
|
|
|
687
658
|
size = (size + 7) & ~7;
|
|
688
659
|
|
|
689
660
|
if (arena->offset + size > arena->capacity) {
|
|
690
|
-
|
|
661
|
+
rb_raise(rb_eNoMemError, "Arena Insufficient Capacity");
|
|
691
662
|
return NULL;
|
|
692
663
|
}
|
|
693
664
|
|
data/lib/ckmeans/clusterer.rb
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Ckmeans
|
|
4
|
-
class Clusterer # rubocop:disable Style/Documentation
|
|
5
|
-
PI_DOUBLE = Math::PI * 2
|
|
6
|
-
|
|
4
|
+
class Clusterer # rubocop:disable Style/Documentation
|
|
7
5
|
def initialize(entries, kmin, kmax = kmin, kestimate = :regular)
|
|
8
6
|
@xcount = entries.size
|
|
9
7
|
|
|
@@ -26,301 +24,7 @@ module Ckmeans
|
|
|
26
24
|
sorted_group_sizes.each_with_object([]) do |size, groups|
|
|
27
25
|
groups << @xsorted_original.shift(size)
|
|
28
26
|
end
|
|
29
|
-
|
|
30
|
-
=begin # rubocop:disable Style/BlockComments
|
|
31
|
-
@cost = Array.new(kmax) { Array.new(xcount) { 0.0 } }
|
|
32
|
-
@splits = Array.new(kmax) { Array.new(xcount) { 0 } }
|
|
33
|
-
@xsum = Array.new(xcount)
|
|
34
|
-
@xsumsq = Array.new(xcount)
|
|
35
|
-
|
|
36
|
-
shift = xsorted[xcount / 2]
|
|
37
|
-
xsum[0] = xsorted[0].to_f - shift
|
|
38
|
-
xsumsq[0] = xsum[0]**2
|
|
39
|
-
|
|
40
|
-
1.upto(xcount - 1) do |i|
|
|
41
|
-
xf = xsorted[i].to_f
|
|
42
|
-
xsum[i] = xsum[i - 1] + xf - shift
|
|
43
|
-
xsumsq[i] = xsumsq[i - 1] + ((xf - shift) * (xf - shift))
|
|
44
|
-
cost[0][i] = dissim(0, i)
|
|
45
|
-
splits[0][i] = 0
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
kmax_idx = kmax - 1
|
|
49
|
-
1.upto(kmax_idx) do |q|
|
|
50
|
-
imin = q < kmax_idx ? [1, q].max : xcount - 1
|
|
51
|
-
fill_row(q, imin, xcount - 1)
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
kopt = koptimal
|
|
55
|
-
|
|
56
|
-
puts "RB COST\n", cost.map(&:inspect)
|
|
57
|
-
puts "RB SPLITS\n", splits.map(&:inspect)
|
|
58
|
-
puts "RB K OPTIMAL: #{kopt}"
|
|
59
|
-
|
|
60
|
-
backtrack(kopt).each_with_object(Array.new(kopt)) do |(q, left, right), res|
|
|
61
|
-
res[q] = xsorted[left..right]
|
|
62
|
-
end
|
|
63
|
-
=end
|
|
64
|
-
end
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
private
|
|
68
|
-
|
|
69
|
-
attr_reader :cost, :splits, :xsum, :xsumsq, :xcount, :xsorted, :kmin, :kmax
|
|
70
|
-
|
|
71
|
-
def koptimal # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
72
|
-
kopt = kmin
|
|
73
|
-
n = xcount
|
|
74
|
-
max_bic = 0.0
|
|
75
|
-
adjustment = kestimate == :sensitive ? 0.0 : 1.0 # Deviation from BIC formula to favor smaller clusters
|
|
76
|
-
|
|
77
|
-
kmin.upto(kmax) do |k|
|
|
78
|
-
sizes = backtrack(k).each_with_object(Array.new(k)) { |(q, left, right), sz| sz[q] = right - left + 1 }
|
|
79
|
-
|
|
80
|
-
index_left = 0
|
|
81
|
-
index_right = nil
|
|
82
|
-
loglikelihood = 0.0
|
|
83
|
-
bin_left = nil
|
|
84
|
-
bin_right = nil
|
|
85
|
-
|
|
86
|
-
k.times do |kb|
|
|
87
|
-
num_points_in_bin = sizes[kb]
|
|
88
|
-
index_right = index_left + num_points_in_bin - 1
|
|
89
|
-
|
|
90
|
-
if xsorted[index_left] < xsorted[index_right]
|
|
91
|
-
bin_left = xsorted[index_left]
|
|
92
|
-
bin_right = xsorted[index_right]
|
|
93
|
-
elsif xsorted[index_left] == xsorted[index_right]
|
|
94
|
-
bin_left = index_left == 0 ? xsorted[0] : (xsorted[index_left - 1] + xsorted[index_left]) / 2.0
|
|
95
|
-
bin_right = index_right < n - 1 ? (xsorted[index_right] + xsorted[index_right + 1]) / 2.0 : xsorted[n - 1]
|
|
96
|
-
else
|
|
97
|
-
raise "ERROR: binLeft > binRight"
|
|
98
|
-
end
|
|
99
|
-
|
|
100
|
-
bin_width = bin_right.to_f - bin_left
|
|
101
|
-
|
|
102
|
-
mean, variance = shifted_data_variance(index_left, index_right)
|
|
103
|
-
|
|
104
|
-
if variance > 0
|
|
105
|
-
(index_left..index_right).each do |i|
|
|
106
|
-
loglikelihood += -(xsorted[i] - mean) * (xsorted[i] - mean) / (2.0 * variance)
|
|
107
|
-
end
|
|
108
|
-
loglikelihood +=
|
|
109
|
-
num_points_in_bin *
|
|
110
|
-
((Math.log(num_points_in_bin / n.to_f) * adjustment) - (0.5 * Math.log(PI_DOUBLE * variance)))
|
|
111
|
-
else
|
|
112
|
-
loglikelihood += num_points_in_bin * Math.log(1.0 / bin_width / n)
|
|
113
|
-
end
|
|
114
|
-
|
|
115
|
-
index_left = index_right + 1
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
bic = (2.0 * loglikelihood) - (((3 * k) - 1) * Math.log(n.to_f))
|
|
119
|
-
|
|
120
|
-
if k == kmin
|
|
121
|
-
max_bic = bic
|
|
122
|
-
kopt = kmin
|
|
123
|
-
elsif bic > max_bic
|
|
124
|
-
max_bic = bic
|
|
125
|
-
kopt = k
|
|
126
|
-
end
|
|
127
|
-
end
|
|
128
|
-
|
|
129
|
-
kopt
|
|
130
|
-
end
|
|
131
|
-
|
|
132
|
-
def shifted_data_variance(ileft, iright)
|
|
133
|
-
sum = 0.0
|
|
134
|
-
sumsq = 0.0
|
|
135
|
-
mean = 0.0
|
|
136
|
-
variance = 0.0
|
|
137
|
-
n = iright - ileft + 1
|
|
138
|
-
|
|
139
|
-
if iright >= ileft
|
|
140
|
-
median = xsorted[(ileft + iright) / 2].to_f
|
|
141
|
-
|
|
142
|
-
ileft.upto(iright) do |i|
|
|
143
|
-
sumi = xsorted[i] - median
|
|
144
|
-
sum += sumi
|
|
145
|
-
sumsq += sumi**2
|
|
146
|
-
end
|
|
147
|
-
|
|
148
|
-
mean = (sum / n) + median
|
|
149
|
-
variance = (sumsq - (sum * sum / n)) / (n - 1) if n > 1
|
|
150
|
-
end
|
|
151
|
-
|
|
152
|
-
[mean, variance]
|
|
153
|
-
end
|
|
154
|
-
|
|
155
|
-
def backtrack(k)
|
|
156
|
-
return to_enum(__method__, k) unless block_given?
|
|
157
|
-
|
|
158
|
-
right = xcount - 1
|
|
159
|
-
left = nil
|
|
160
|
-
|
|
161
|
-
(k - 1).downto(0) do |q|
|
|
162
|
-
left = splits[q][right]
|
|
163
|
-
|
|
164
|
-
yield q, left, right
|
|
165
|
-
|
|
166
|
-
right = left - 1 if q > 0
|
|
167
|
-
end
|
|
168
|
-
end
|
|
169
|
-
|
|
170
|
-
def dissim(j, i)
|
|
171
|
-
return 0.0 if j >= i
|
|
172
|
-
|
|
173
|
-
sji =
|
|
174
|
-
if j > 0
|
|
175
|
-
segment_sum = xsum[i] - xsum[j - 1]
|
|
176
|
-
segment_size = i - j + 1
|
|
177
|
-
xsumsq[i] - xsumsq[j - 1] - (segment_sum * segment_sum / segment_size)
|
|
178
|
-
else
|
|
179
|
-
xsumsq[i] - (xsum[i] * xsum[i] / (i + 1))
|
|
180
|
-
end
|
|
181
|
-
|
|
182
|
-
[0, sji].max
|
|
183
|
-
end
|
|
184
|
-
|
|
185
|
-
def fill_row(q, imin, imax)
|
|
186
|
-
size = imax - q + 1
|
|
187
|
-
|
|
188
|
-
js = Array.new(size) { |i| q + i }
|
|
189
|
-
smawk(imin, imax, 1, q, js)
|
|
190
|
-
end
|
|
191
|
-
|
|
192
|
-
def smawk(imin, imax, istep, q, js)
|
|
193
|
-
if (imax - imin) <= (0 * istep)
|
|
194
|
-
find_min_from_candidates(q, imin, imax, istep, js)
|
|
195
|
-
else
|
|
196
|
-
js_odd = prune_candidates(imin, imax, istep, q, js)
|
|
197
|
-
# puts "Pruned: #{js_odd.inspect}"
|
|
198
|
-
istepx2 = istep * 2
|
|
199
|
-
imin_odd = imin + istep
|
|
200
|
-
imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2)
|
|
201
|
-
smawk(imin_odd, imax_odd, istepx2, q, js_odd)
|
|
202
|
-
fill_even_positions(imin, imax, istep, q, js)
|
|
203
|
-
end
|
|
204
|
-
end
|
|
205
|
-
|
|
206
|
-
def find_min_from_candidates(q, imin, imax, istep, js)
|
|
207
|
-
optimal_split_index_prev = 0
|
|
208
|
-
|
|
209
|
-
(imin..imax).step(istep) do |i|
|
|
210
|
-
optimal_split_index = optimal_split_index_prev
|
|
211
|
-
optimal_split = js[optimal_split_index]
|
|
212
|
-
cost[q][i] = cost[q - 1][optimal_split - 1] + dissim(optimal_split, i)
|
|
213
|
-
splits[q][i] = optimal_split
|
|
214
|
-
|
|
215
|
-
((optimal_split_index + 1)...js.size).each do |split_index|
|
|
216
|
-
jabs = js[split_index]
|
|
217
|
-
|
|
218
|
-
next if jabs < splits[q - 1][i]
|
|
219
|
-
break if jabs > i
|
|
220
|
-
|
|
221
|
-
sj = cost[q - 1][jabs - 1] + dissim(jabs, i)
|
|
222
|
-
|
|
223
|
-
next unless sj <= cost[q][i]
|
|
224
|
-
|
|
225
|
-
cost[q][i] = sj
|
|
226
|
-
splits[q][i] = js[split_index]
|
|
227
|
-
optimal_split_index_prev = split_index
|
|
228
|
-
end
|
|
229
|
-
end
|
|
230
|
-
end
|
|
231
|
-
|
|
232
|
-
def prune_candidates(imin, imax, istep, q, js)
|
|
233
|
-
n = ((imax - imin) / istep) + 1
|
|
234
|
-
m = js.size
|
|
235
|
-
|
|
236
|
-
return js if n >= m
|
|
237
|
-
|
|
238
|
-
pruned = js.dup
|
|
239
|
-
left = -1
|
|
240
|
-
right = 0
|
|
241
|
-
|
|
242
|
-
while m > n
|
|
243
|
-
p = left + 1
|
|
244
|
-
i = imin + (p * istep)
|
|
245
|
-
j = pruned[right]
|
|
246
|
-
jnext = pruned[right + 1]
|
|
247
|
-
sl = cost[q - 1][j - 1] + dissim(j, i)
|
|
248
|
-
snext = cost[q - 1][jnext - 1] + dissim(jnext, i)
|
|
249
|
-
|
|
250
|
-
if (sl < snext) && (p < n - 1)
|
|
251
|
-
left += 1
|
|
252
|
-
pruned[left] = j
|
|
253
|
-
right += 1
|
|
254
|
-
elsif (sl < snext) && (p == n - 1)
|
|
255
|
-
right += 1
|
|
256
|
-
pruned[right] = j
|
|
257
|
-
m -= 1
|
|
258
|
-
else
|
|
259
|
-
if p > 0
|
|
260
|
-
pruned[right] = pruned[left]
|
|
261
|
-
left -= 1
|
|
262
|
-
else
|
|
263
|
-
right += 1
|
|
264
|
-
end
|
|
265
|
-
|
|
266
|
-
m -= 1
|
|
267
|
-
end
|
|
268
|
-
end
|
|
269
|
-
|
|
270
|
-
((left + 1)...m).each do |r|
|
|
271
|
-
pruned[r] = pruned[right]
|
|
272
|
-
right += 1
|
|
273
|
-
end
|
|
274
|
-
|
|
275
|
-
pruned.slice!(m..-1) if pruned.size > m
|
|
276
|
-
pruned
|
|
277
|
-
end
|
|
278
|
-
|
|
279
|
-
def fill_even_positions(imin, imax, istep, q, js)
|
|
280
|
-
n = js.size
|
|
281
|
-
istepx2 = istep * 2
|
|
282
|
-
jl = js[0]
|
|
283
|
-
|
|
284
|
-
i = imin
|
|
285
|
-
r = 0
|
|
286
|
-
while i <= imax
|
|
287
|
-
r += 1 while js[r] < jl
|
|
288
|
-
|
|
289
|
-
cost[q][i] = cost[q - 1][js[r] - 1] + dissim(js[r], i)
|
|
290
|
-
splits[q][i] = js[r]
|
|
291
|
-
jh = (i + istep) <= imax ? splits[q][i + istep] : js[n - 1]
|
|
292
|
-
jmax = [jh, i].min
|
|
293
|
-
sjimin = dissim(jmax, i)
|
|
294
|
-
|
|
295
|
-
r += 1
|
|
296
|
-
while r < n && js[r] <= jmax
|
|
297
|
-
jabs = js[r]
|
|
298
|
-
|
|
299
|
-
break if jabs > i
|
|
300
|
-
|
|
301
|
-
if jabs < splits[q - 1][i]
|
|
302
|
-
r += 1
|
|
303
|
-
next
|
|
304
|
-
end
|
|
305
|
-
|
|
306
|
-
cost_base = cost[q - 1][jabs - 1]
|
|
307
|
-
sj = cost_base + dissim(jabs, i)
|
|
308
|
-
|
|
309
|
-
if sj <= cost[q][i]
|
|
310
|
-
cost[q][i] = sj
|
|
311
|
-
splits[q][i] = jabs
|
|
312
|
-
elsif cost_base + sjimin > cost[q][i]
|
|
313
|
-
break
|
|
314
|
-
end
|
|
315
|
-
|
|
316
|
-
r += 1
|
|
317
27
|
end
|
|
318
|
-
|
|
319
|
-
r -= 1
|
|
320
|
-
jl = jh
|
|
321
|
-
|
|
322
|
-
i += istepx2
|
|
323
|
-
end
|
|
324
28
|
end
|
|
325
29
|
end
|
|
326
30
|
end
|
data/lib/ckmeans/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ckmeans
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.
|
|
4
|
+
version: 1.0.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vlad Lebedev
|
|
8
8
|
bindir: exe
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2025-04-
|
|
10
|
+
date: 2025-04-24 00:00:00.000000000 Z
|
|
11
11
|
dependencies: []
|
|
12
12
|
description: Repeatable clustering of unidimensional data
|
|
13
13
|
email:
|