ckmeans 1.0.0 → 1.0.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d7f334215d673498dd97a243407c4606a9d361d8aa07cc497f57f6bf0667f1d5
4
- data.tar.gz: e0def5eeb06821d9b9c1c75b1808b90d27c3b5bfc1f950509de2b388e5f5a291
3
+ metadata.gz: be110daab8039e8a76ccbc68808120caf3aa7b189e107d6bced6d3519e8d917c
4
+ data.tar.gz: 6647bce619e675a4e24f4a17ec8e5aee23280cd97d89b29433da56e122f3c932
5
5
  SHA512:
6
- metadata.gz: 1bea370847934e340bb5f23ec721bc21b26e515d5cf3136870779ea49eb8f0d8d1e1ca134a0dd59caac4fed070b9751bc2612c78974e5ef4bcf99997f1f72e58
7
- data.tar.gz: 37b184e51ed7172765bfedac3671a0bfa7c59a6d2a783ef37fdd90525636f9ad158717b05cee3be24f63f4eec1cbc3ef85b67214000cbbc18a334db067a1e7d5
6
+ metadata.gz: 54c6292bbae43afdbb4c618983c9a602ce68fec83239cf3a2f77c42fb8544fb45801d71910796db49cdf872a6151f616a711bce9c1fe537a202cae476bbb995b
7
+ data.tar.gz: 82d39fb2a5870a92ab579f5342de64e69a170c1d9ef31e9d5e121fb06902d3a3dba1b6a5cd4b56b432524e991e53221b78ebee7c88bbbbea868ee63ccb256d0e
data/.rubocop_todo.yml CHANGED
@@ -1,49 +1,13 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2025-04-17 07:09:28 UTC using RuboCop version 1.75.1.
3
+ # on 2025-04-24 06:16:37 UTC using RuboCop version 1.75.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
- # Offense count: 7
10
- # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes.
11
- Metrics/AbcSize:
12
- Max: 95
13
-
14
- # Offense count: 2
9
+ # Offense count: 1
15
10
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
16
11
  # AllowedMethods: refine
17
12
  Metrics/BlockLength:
18
- Max: 41
19
-
20
- # Offense count: 3
21
- # Configuration parameters: AllowedMethods, AllowedPatterns.
22
- Metrics/CyclomaticComplexity:
23
- Max: 10
24
-
25
- # Offense count: 6
26
- # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
27
- Metrics/MethodLength:
28
- Max: 48
29
-
30
- # Offense count: 3
31
- # Configuration parameters: AllowedMethods, AllowedPatterns.
32
- Metrics/PerceivedComplexity:
33
- Max: 13
34
-
35
- # Offense count: 12
36
- # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
37
- # AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
38
- Naming/MethodParameterName:
39
- Exclude:
40
- - 'lib/ckmeans/clusterer.rb'
41
-
42
- # Offense count: 5
43
- # This cop supports unsafe autocorrection (--autocorrect-all).
44
- # Configuration parameters: EnforcedStyle, AllowedMethods, AllowedPatterns.
45
- # SupportedStyles: predicate, comparison
46
- Style/NumericPredicate:
47
- Exclude:
48
- - 'spec/**/*'
49
- - 'lib/ckmeans/clusterer.rb'
13
+ Max: 26
@@ -1,6 +1,6 @@
1
1
  #include <stdio.h>
2
- #include <assert.h>
3
2
  #include <math.h>
3
+ #include <string.h>
4
4
  #include "ruby.h"
5
5
 
6
6
  typedef struct Arena {
@@ -60,7 +60,6 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
60
60
 
61
61
  Arena *arena_create(uint32_t);
62
62
  void *arena_alloc(Arena*, uint32_t);
63
- void arena_rewind(Arena*);
64
63
  void arena_destroy(Arena*);
65
64
 
66
65
  MatrixF *matrix_create_f(Arena*, uint32_t, uint32_t);
@@ -117,14 +116,11 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
117
116
  bool apply_deviation = RTEST(rb_apply_bic_deviation);
118
117
  Arena *arena = arena_create(sizeof(int) * xcount * kmax * ALLOCATION_FACTOR);
119
118
 
120
- if (arena == NULL) {
121
- return Qnil;
122
- }
119
+ if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
123
120
 
124
121
  MatrixF *cost = matrix_create_f(arena, kmax, xcount);
125
122
  MatrixI *splits = matrix_create_i(arena, kmax, xcount);
126
123
  VectorF *xsorted = vector_create_f(arena, xcount);
127
- /* TODO: pack sums into one vector of pairs */
128
124
  VectorF *xsum = vector_create_f(arena, xcount);
129
125
  VectorF *xsumsq = vector_create_f(arena, xcount);
130
126
 
@@ -193,11 +189,11 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
193
189
 
194
190
  uint32_t find_koptimal(State state)
195
191
  {
196
- uint32_t kmin = state.kmin;
197
- uint32_t kmax = state.kmax;
198
- uint32_t xcount = state.xcount;
199
- uint32_t kopt = kmin;
200
- uint32_t xindex_max = state.xcount - 1;
192
+ uint32_t kmin = state.kmin;
193
+ uint32_t kmax = state.kmax;
194
+ uint32_t xcount = state.xcount;
195
+ uint32_t kopt = kmin;
196
+ uint32_t xindex_max = state.xcount - 1;
201
197
  VectorF *xsorted = state.xsorted;
202
198
  long double x0 = vector_get_f(xsorted, 0);
203
199
  long double xn = vector_get_f(xsorted, xindex_max);
@@ -274,6 +270,7 @@ VectorI *backtrack_sizes(State state, uint32_t k)
274
270
  left = matrix_get_i(splits, i, right);
275
271
  vector_set_i(sizes, i, right - left + 1);
276
272
  }
273
+ // Special case outside of the loop removing the need for conditionals
277
274
  left = matrix_get_i(splits, 0, right);
278
275
  vector_set_i(sizes, 0, right - left + 1);
279
276
 
@@ -282,7 +279,7 @@ VectorI *backtrack_sizes(State state, uint32_t k)
282
279
 
283
280
  SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t right)
284
281
  {
285
- const uint32_t n = right - left + 1;
282
+ const uint32_t n = right - left + 1;
286
283
  long double sum = 0.0;
287
284
  long double sumsq = 0.0;
288
285
  SegmentStats stats = { .mean = 0.0, .variance = 0.0 };
@@ -306,7 +303,8 @@ SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t rig
306
303
  return stats;
307
304
  }
308
305
 
309
- void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax) {
306
+ void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax)
307
+ {
310
308
  uint32_t size = imax - q + 1;
311
309
  VectorI *split_candidates = vector_create_i(state.arena, size);
312
310
  for (uint32_t i = 0; i < size; i++) {
@@ -316,7 +314,8 @@ void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax) {
316
314
  smawk(state, rparams, split_candidates);
317
315
  }
318
316
 
319
- void smawk(State state, RowParams rparams, VectorI *split_candidates) {
317
+ void smawk(State state, RowParams rparams, VectorI *split_candidates)
318
+ {
320
319
  const uint32_t imin = rparams.imin;
321
320
  const uint32_t imax = rparams.imax;
322
321
  const uint32_t istep = rparams.istep;
@@ -326,9 +325,9 @@ void smawk(State state, RowParams rparams, VectorI *split_candidates) {
326
325
  } else {
327
326
  VectorI *odd_candidates = prune_candidates(state, rparams, split_candidates);
328
327
  /* printf("PRUNED\t"); vector_inspect_i(odd_candidates); */
329
- uint32_t istepx2 = istep * 2;
330
- uint32_t imin_odd = imin + istep;
331
- uint32_t imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2);
328
+ uint32_t istepx2 = istep * 2;
329
+ uint32_t imin_odd = imin + istep;
330
+ uint32_t imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2);
332
331
  RowParams rparams_odd = { .row = rparams.row, .imin = imin_odd, .imax = imax_odd, .istep = istepx2 };
333
332
 
334
333
  smawk(state, rparams_odd, odd_candidates);
@@ -345,9 +344,9 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
345
344
  uint32_t n = split_candidates->nvalues;
346
345
  uint32_t istepx2 = istep * 2;
347
346
  uint32_t jl = vector_get_i(split_candidates, 0);
348
- VectorF *xsum = state.xsum;
349
- VectorF *xsumsq = state.xsumsq;
350
- MatrixI *splits = state.splits;
347
+ VectorF *xsum = state.xsum;
348
+ VectorF *xsumsq = state.xsumsq;
349
+ MatrixI *splits = state.splits;
351
350
 
352
351
  for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
353
352
  while (vector_get_i(split_candidates, r) < jl) r++;
@@ -394,10 +393,10 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
394
393
 
395
394
  void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
396
395
  {
397
- const uint32_t row = rparams.row;
398
- const uint32_t imin = rparams.imin;
399
- const uint32_t imax = rparams.imax;
400
- const uint32_t istep = rparams.istep;
396
+ const uint32_t row = rparams.row;
397
+ const uint32_t imin = rparams.imin;
398
+ const uint32_t imax = rparams.imax;
399
+ const uint32_t istep = rparams.istep;
401
400
  MatrixF *const cost = state.cost;
402
401
  MatrixI *const splits = state.splits;
403
402
 
@@ -408,7 +407,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
408
407
  const uint32_t optimal_split_idx = optimal_split_idx_prev;
409
408
  const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
410
409
  const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
411
- const long double added_cost = dissimilarity(optimal_split, i, state.xsum, state.xsumsq);
410
+ const long double added_cost = dissimilarity(optimal_split, i, state.xsum, state.xsumsq);
412
411
 
413
412
  matrix_set_f(cost, row, i, cost_prev + added_cost);
414
413
  matrix_set_i(splits, row, i, optimal_split);
@@ -434,39 +433,39 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
434
433
 
435
434
  VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
436
435
  {
437
- uint32_t n = ((rparams.imax - rparams.imin) / rparams.istep) + 1;
438
- uint32_t m = split_candidates->nvalues;
436
+ uint32_t imin = rparams.imin;
437
+ uint32_t row = rparams.row;
438
+ uint32_t istep = rparams.istep;
439
+ uint32_t n = ((rparams.imax - imin) / istep) + 1;
440
+ uint32_t m = split_candidates->nvalues;
439
441
 
440
442
  if (n >= m) return split_candidates;
441
443
 
442
- uint32_t left = -1;
443
- uint32_t right = 0;
444
+ uint32_t left = 0;
445
+ uint32_t right = 0;
444
446
  VectorI *pruned = vector_dup_i(split_candidates, state.arena);
445
447
 
446
448
  while (m > n)
447
449
  {
448
- uint32_t p = left + 1;
449
- uint32_t i = rparams.imin + p * rparams.istep;
450
- uint32_t j = vector_get_i(pruned, right);
451
- uint32_t jnext = vector_get_i(pruned, right + 1);
450
+ uint32_t i = imin + left * istep;
451
+ uint32_t j = vector_get_i(pruned, right);
452
+ uint32_t jnext = vector_get_i(pruned, right + 1);
452
453
  long double sl =
453
- matrix_get_f(state.cost, rparams.row - 1, j - 1) + dissimilarity(j, i, state.xsum, state.xsumsq);
454
+ matrix_get_f(state.cost, row - 1, j - 1) + dissimilarity(j, i, state.xsum, state.xsumsq);
454
455
  long double snext =
455
- matrix_get_f(state.cost, rparams.row - 1, jnext - 1) + dissimilarity(jnext, i, state.xsum, state.xsumsq);
456
+ matrix_get_f(state.cost, row - 1, jnext - 1) + dissimilarity(jnext, i, state.xsum, state.xsumsq);
456
457
 
457
- if ((sl < snext) && (p < n - 1)) {
458
+ if ((sl < snext) && (left < n - 1)) {
459
+ vector_set_i(pruned, left, j);
458
460
  left++;
459
461
  right++;
460
- vector_set_i(pruned, left, j);
461
- } else if ((sl < snext) && (p == n - 1)) {
462
+ } else if ((sl < snext) && (left == n - 1)) {
462
463
  right++;
463
464
  m--;
464
465
  vector_set_i(pruned, right, j);
465
466
  } else {
466
- if (p > 0) {
467
- /* TODO: extract `vector_setcpy_T` */
468
- vector_set_i(pruned, right, vector_get_i(pruned, left));
469
- left--;
467
+ if (left > 0) {
468
+ vector_set_i(pruned, right, vector_get_i(pruned, --left));
470
469
  } else {
471
470
  right++;
472
471
  }
@@ -475,8 +474,7 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
475
474
  }
476
475
  }
477
476
 
478
- for (uint32_t i = left + 1; i < m; i++) {
479
- /* TODO: extract `vector_setcpy_T` */
477
+ for (uint32_t i = left; i < m; i++) {
480
478
  vector_set_i(pruned, i, vector_get_i(pruned, right++));
481
479
  }
482
480
 
@@ -491,10 +489,9 @@ long double dissimilarity(uint32_t j, uint32_t i, VectorF *xsum, VectorF *xsumsq
491
489
  if (j >= i) return sji;
492
490
 
493
491
  if (j > 0) {
494
- /* TODO: looks more like `segment_delta` */
495
- long double segment_sum = vector_get_diff_f(xsum, i, j - 1);
492
+ long double segment_diff = vector_get_diff_f(xsum, i, j - 1);
496
493
  uint32_t segment_size = i - j + 1;
497
- sji = vector_get_diff_f(xsumsq, i, j - 1) - (segment_sum * segment_sum / segment_size);
494
+ sji = vector_get_diff_f(xsumsq, i, j - 1) - (segment_diff * segment_diff / segment_size);
498
495
  } else {
499
496
  long double xsumi = vector_get_f(xsum, i);
500
497
  sji = vector_get_f(xsumsq, i) - (xsumi * xsumi / (i + 1));
@@ -527,29 +524,20 @@ VectorI *vector_dup_i(VectorI *v, Arena *arena)
527
524
  {
528
525
  VectorI *vdup = vector_create_i(arena, v->nvalues);
529
526
 
530
- /* TODO: use one memcpy call */
531
- for (uint32_t i = 0; i < v->nvalues; i++) {
532
- vector_set_i(vdup, i, vector_get_i(v, i));
533
- }
527
+ memcpy(vdup->values, v->values, sizeof(*(v->values)) * v->nvalues);
534
528
 
535
529
  return vdup;
536
530
  }
537
531
 
538
532
  void vector_set_f(VectorF *v, uint32_t offset, long double value) {
539
- assert(offset < v->nvalues && "[vector_set_f] element index should be less than nvalues");
540
-
541
533
  *(v->values + offset) = value;
542
534
  }
543
535
 
544
536
  void vector_set_i(VectorI *v, uint32_t offset, uint32_t value) {
545
- assert(offset < v->nvalues && "[vector_set_i] element index should be less than nvalues");
546
-
547
537
  *(v->values + offset) = value;
548
538
  }
549
539
 
550
540
  uint32_t vector_get_i(VectorI *v, uint32_t offset) {
551
- assert(offset < v->nvalues && "[vector_get_i] element index should be less than nvalues");
552
-
553
541
  return *(v->values + offset);
554
542
  }
555
543
 
@@ -564,15 +552,10 @@ void vector_inspect_i(VectorI *v) {
564
552
  }
565
553
 
566
554
  long double vector_get_f(VectorF *v, uint32_t offset) {
567
- assert(offset < v->nvalues && "[vector_get_f] element index should be less than nvalues");
568
-
569
555
  return *(v->values + offset);
570
556
  }
571
557
 
572
558
  long double vector_get_diff_f(VectorF *v, uint32_t i, uint32_t j) {
573
- assert(i < v->nvalues && "[vector_get_diff_f] i should be less than nvalues");
574
- assert(j < v->nvalues && "[vector_get_diff_f] j should be less than nvalues");
575
-
576
559
  return *(v->values + i) - *(v->values + j);
577
560
  }
578
561
 
@@ -605,17 +588,11 @@ MatrixI *matrix_create_i(Arena *arena, uint32_t nrows, uint32_t ncols) {
605
588
  }
606
589
 
607
590
  void matrix_set_f(MatrixF *m, uint32_t i, uint32_t j, long double value) {
608
- assert(i < m->nrows && "[matrix_set_f] row offset should be less than nrows");
609
- assert(j < m->cols && "[matrix_set_f] col offset should be less than ncols");
610
-
611
591
  uint32_t offset = i * m->ncols + j;
612
592
  *(m->values + offset) = value;
613
593
  }
614
594
 
615
595
  long double matrix_get_f(MatrixF *m, uint32_t i, uint32_t j) {
616
- assert(i < m->nrows && "[matrix_get_f] row offset should be less than nrows");
617
- assert(j < m->cols && "[matrix_get_f] col offset should be less than ncols");
618
-
619
596
  uint32_t offset = i * m->ncols + j;
620
597
  return *(m->values + offset);
621
598
  }
@@ -640,17 +617,11 @@ void matrix_inspect_i(MatrixI *m) {
640
617
  }
641
618
 
642
619
  void matrix_set_i(MatrixI *m, uint32_t i, uint32_t j, uint32_t value) {
643
- assert(i < m->nrows && "[matrix_set_i] row offset should be less than nrows");
644
- assert(j < m->cols && "[matrix_set_i] col offset should be less than ncols");
645
-
646
620
  uint32_t offset = i * m->ncols + j;
647
621
  *(m->values + offset) = value;
648
622
  }
649
623
 
650
624
  uint32_t matrix_get_i(MatrixI *m, uint32_t i, uint32_t j) {
651
- assert(i < m->nrows && "[matrix_get_i] row offset should be less than nrows");
652
- assert(j < m->cols && "[matrix_get_i] col offset should be less than ncols");
653
-
654
625
  uint32_t offset = i * m->ncols + j;
655
626
  return *(m->values + offset);
656
627
  }
@@ -687,7 +658,7 @@ void *arena_alloc(Arena *arena, uint32_t size) {
687
658
  size = (size + 7) & ~7;
688
659
 
689
660
  if (arena->offset + size > arena->capacity) {
690
- printf("Arena Out Of Memory\n");
661
+ rb_raise(rb_eNoMemError, "Arena Insufficient Capacity");
691
662
  return NULL;
692
663
  }
693
664
 
@@ -1,9 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- class Clusterer # rubocop:disable Style/Documentation, Metrics/ClassLength
5
- PI_DOUBLE = Math::PI * 2
6
-
4
+ class Clusterer # rubocop:disable Style/Documentation
7
5
  def initialize(entries, kmin, kmax = kmin, kestimate = :regular)
8
6
  @xcount = entries.size
9
7
 
@@ -26,301 +24,7 @@ module Ckmeans
26
24
  sorted_group_sizes.each_with_object([]) do |size, groups|
27
25
  groups << @xsorted_original.shift(size)
28
26
  end
29
-
30
- =begin # rubocop:disable Style/BlockComments
31
- @cost = Array.new(kmax) { Array.new(xcount) { 0.0 } }
32
- @splits = Array.new(kmax) { Array.new(xcount) { 0 } }
33
- @xsum = Array.new(xcount)
34
- @xsumsq = Array.new(xcount)
35
-
36
- shift = xsorted[xcount / 2]
37
- xsum[0] = xsorted[0].to_f - shift
38
- xsumsq[0] = xsum[0]**2
39
-
40
- 1.upto(xcount - 1) do |i|
41
- xf = xsorted[i].to_f
42
- xsum[i] = xsum[i - 1] + xf - shift
43
- xsumsq[i] = xsumsq[i - 1] + ((xf - shift) * (xf - shift))
44
- cost[0][i] = dissim(0, i)
45
- splits[0][i] = 0
46
- end
47
-
48
- kmax_idx = kmax - 1
49
- 1.upto(kmax_idx) do |q|
50
- imin = q < kmax_idx ? [1, q].max : xcount - 1
51
- fill_row(q, imin, xcount - 1)
52
- end
53
-
54
- kopt = koptimal
55
-
56
- puts "RB COST\n", cost.map(&:inspect)
57
- puts "RB SPLITS\n", splits.map(&:inspect)
58
- puts "RB K OPTIMAL: #{kopt}"
59
-
60
- backtrack(kopt).each_with_object(Array.new(kopt)) do |(q, left, right), res|
61
- res[q] = xsorted[left..right]
62
- end
63
- =end
64
- end
65
- end
66
-
67
- private
68
-
69
- attr_reader :cost, :splits, :xsum, :xsumsq, :xcount, :xsorted, :kmin, :kmax
70
-
71
- def koptimal # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
72
- kopt = kmin
73
- n = xcount
74
- max_bic = 0.0
75
- adjustment = kestimate == :sensitive ? 0.0 : 1.0 # Deviation from BIC formula to favor smaller clusters
76
-
77
- kmin.upto(kmax) do |k|
78
- sizes = backtrack(k).each_with_object(Array.new(k)) { |(q, left, right), sz| sz[q] = right - left + 1 }
79
-
80
- index_left = 0
81
- index_right = nil
82
- loglikelihood = 0.0
83
- bin_left = nil
84
- bin_right = nil
85
-
86
- k.times do |kb|
87
- num_points_in_bin = sizes[kb]
88
- index_right = index_left + num_points_in_bin - 1
89
-
90
- if xsorted[index_left] < xsorted[index_right]
91
- bin_left = xsorted[index_left]
92
- bin_right = xsorted[index_right]
93
- elsif xsorted[index_left] == xsorted[index_right]
94
- bin_left = index_left == 0 ? xsorted[0] : (xsorted[index_left - 1] + xsorted[index_left]) / 2.0
95
- bin_right = index_right < n - 1 ? (xsorted[index_right] + xsorted[index_right + 1]) / 2.0 : xsorted[n - 1]
96
- else
97
- raise "ERROR: binLeft > binRight"
98
- end
99
-
100
- bin_width = bin_right.to_f - bin_left
101
-
102
- mean, variance = shifted_data_variance(index_left, index_right)
103
-
104
- if variance > 0
105
- (index_left..index_right).each do |i|
106
- loglikelihood += -(xsorted[i] - mean) * (xsorted[i] - mean) / (2.0 * variance)
107
- end
108
- loglikelihood +=
109
- num_points_in_bin *
110
- ((Math.log(num_points_in_bin / n.to_f) * adjustment) - (0.5 * Math.log(PI_DOUBLE * variance)))
111
- else
112
- loglikelihood += num_points_in_bin * Math.log(1.0 / bin_width / n)
113
- end
114
-
115
- index_left = index_right + 1
116
- end
117
-
118
- bic = (2.0 * loglikelihood) - (((3 * k) - 1) * Math.log(n.to_f))
119
-
120
- if k == kmin
121
- max_bic = bic
122
- kopt = kmin
123
- elsif bic > max_bic
124
- max_bic = bic
125
- kopt = k
126
- end
127
- end
128
-
129
- kopt
130
- end
131
-
132
- def shifted_data_variance(ileft, iright)
133
- sum = 0.0
134
- sumsq = 0.0
135
- mean = 0.0
136
- variance = 0.0
137
- n = iright - ileft + 1
138
-
139
- if iright >= ileft
140
- median = xsorted[(ileft + iright) / 2].to_f
141
-
142
- ileft.upto(iright) do |i|
143
- sumi = xsorted[i] - median
144
- sum += sumi
145
- sumsq += sumi**2
146
- end
147
-
148
- mean = (sum / n) + median
149
- variance = (sumsq - (sum * sum / n)) / (n - 1) if n > 1
150
- end
151
-
152
- [mean, variance]
153
- end
154
-
155
- def backtrack(k)
156
- return to_enum(__method__, k) unless block_given?
157
-
158
- right = xcount - 1
159
- left = nil
160
-
161
- (k - 1).downto(0) do |q|
162
- left = splits[q][right]
163
-
164
- yield q, left, right
165
-
166
- right = left - 1 if q > 0
167
- end
168
- end
169
-
170
- def dissim(j, i)
171
- return 0.0 if j >= i
172
-
173
- sji =
174
- if j > 0
175
- segment_sum = xsum[i] - xsum[j - 1]
176
- segment_size = i - j + 1
177
- xsumsq[i] - xsumsq[j - 1] - (segment_sum * segment_sum / segment_size)
178
- else
179
- xsumsq[i] - (xsum[i] * xsum[i] / (i + 1))
180
- end
181
-
182
- [0, sji].max
183
- end
184
-
185
- def fill_row(q, imin, imax)
186
- size = imax - q + 1
187
-
188
- js = Array.new(size) { |i| q + i }
189
- smawk(imin, imax, 1, q, js)
190
- end
191
-
192
- def smawk(imin, imax, istep, q, js)
193
- if (imax - imin) <= (0 * istep)
194
- find_min_from_candidates(q, imin, imax, istep, js)
195
- else
196
- js_odd = prune_candidates(imin, imax, istep, q, js)
197
- # puts "Pruned: #{js_odd.inspect}"
198
- istepx2 = istep * 2
199
- imin_odd = imin + istep
200
- imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2)
201
- smawk(imin_odd, imax_odd, istepx2, q, js_odd)
202
- fill_even_positions(imin, imax, istep, q, js)
203
- end
204
- end
205
-
206
- def find_min_from_candidates(q, imin, imax, istep, js)
207
- optimal_split_index_prev = 0
208
-
209
- (imin..imax).step(istep) do |i|
210
- optimal_split_index = optimal_split_index_prev
211
- optimal_split = js[optimal_split_index]
212
- cost[q][i] = cost[q - 1][optimal_split - 1] + dissim(optimal_split, i)
213
- splits[q][i] = optimal_split
214
-
215
- ((optimal_split_index + 1)...js.size).each do |split_index|
216
- jabs = js[split_index]
217
-
218
- next if jabs < splits[q - 1][i]
219
- break if jabs > i
220
-
221
- sj = cost[q - 1][jabs - 1] + dissim(jabs, i)
222
-
223
- next unless sj <= cost[q][i]
224
-
225
- cost[q][i] = sj
226
- splits[q][i] = js[split_index]
227
- optimal_split_index_prev = split_index
228
- end
229
- end
230
- end
231
-
232
- def prune_candidates(imin, imax, istep, q, js)
233
- n = ((imax - imin) / istep) + 1
234
- m = js.size
235
-
236
- return js if n >= m
237
-
238
- pruned = js.dup
239
- left = -1
240
- right = 0
241
-
242
- while m > n
243
- p = left + 1
244
- i = imin + (p * istep)
245
- j = pruned[right]
246
- jnext = pruned[right + 1]
247
- sl = cost[q - 1][j - 1] + dissim(j, i)
248
- snext = cost[q - 1][jnext - 1] + dissim(jnext, i)
249
-
250
- if (sl < snext) && (p < n - 1)
251
- left += 1
252
- pruned[left] = j
253
- right += 1
254
- elsif (sl < snext) && (p == n - 1)
255
- right += 1
256
- pruned[right] = j
257
- m -= 1
258
- else
259
- if p > 0
260
- pruned[right] = pruned[left]
261
- left -= 1
262
- else
263
- right += 1
264
- end
265
-
266
- m -= 1
267
- end
268
- end
269
-
270
- ((left + 1)...m).each do |r|
271
- pruned[r] = pruned[right]
272
- right += 1
273
- end
274
-
275
- pruned.slice!(m..-1) if pruned.size > m
276
- pruned
277
- end
278
-
279
- def fill_even_positions(imin, imax, istep, q, js)
280
- n = js.size
281
- istepx2 = istep * 2
282
- jl = js[0]
283
-
284
- i = imin
285
- r = 0
286
- while i <= imax
287
- r += 1 while js[r] < jl
288
-
289
- cost[q][i] = cost[q - 1][js[r] - 1] + dissim(js[r], i)
290
- splits[q][i] = js[r]
291
- jh = (i + istep) <= imax ? splits[q][i + istep] : js[n - 1]
292
- jmax = [jh, i].min
293
- sjimin = dissim(jmax, i)
294
-
295
- r += 1
296
- while r < n && js[r] <= jmax
297
- jabs = js[r]
298
-
299
- break if jabs > i
300
-
301
- if jabs < splits[q - 1][i]
302
- r += 1
303
- next
304
- end
305
-
306
- cost_base = cost[q - 1][jabs - 1]
307
- sj = cost_base + dissim(jabs, i)
308
-
309
- if sj <= cost[q][i]
310
- cost[q][i] = sj
311
- splits[q][i] = jabs
312
- elsif cost_base + sjimin > cost[q][i]
313
- break
314
- end
315
-
316
- r += 1
317
27
  end
318
-
319
- r -= 1
320
- jl = jh
321
-
322
- i += istepx2
323
- end
324
28
  end
325
29
  end
326
30
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- VERSION = "1.0.0"
4
+ VERSION = "1.0.2"
5
5
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ckmeans
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Lebedev
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2025-04-22 00:00:00.000000000 Z
10
+ date: 2025-04-24 00:00:00.000000000 Z
11
11
  dependencies: []
12
12
  description: Repeatable clustering of unidimensional data
13
13
  email: