ckmeans 1.0.4 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 17dd59ae47e814d5cf0b45665856a52e33e1af22c90722955750004405633a4e
4
- data.tar.gz: 4278bb18d8a987ac71fd7ea179055ab6d2c15292d772b7d9df1dd8c4adde011b
3
+ metadata.gz: 508f78311a643e1fa8e693e4abf1cdf6df4eb06ff09756fa534ff4a514d0f34f
4
+ data.tar.gz: 4ef313387c2e45df4a8afde58e429093023a555a32f4af395a8b79c048a9d98d
5
5
  SHA512:
6
- metadata.gz: 7e3d19cfbfbebb0b26bf1ffdd7c99998a898ccf123359994e339147735b819f3f16fc73c2ac202a3fbe3c4f1c13c747e7181d01d56770be5404ca6354533b23d
7
- data.tar.gz: 2be82db12f8d9da2cafb03713440f3083d2ffd7fd7f6917ad8e98d1c864b1d97f99e9a0771afe6aaaff502fee86d81e9221b8d689a388817b060fc7ce1917a87
6
+ metadata.gz: ae0f1aff4bd6a78da04123d3728234012d0692ec22396b9529b245c8fa473343314508f053ee02ac876131b243704316948f476840d3c495d4e72eba68e095fd
7
+ data.tar.gz: ab95cfdacac4d9204887d4d5c5a7b85aafa3c869ec4b7a851ae994d8f15ddf096e99cb2d1691e6120a7a6bbe51cbde0524c93ad09b5811b7561c43c60a49256c
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 3.2.2
1
+ 3.2.8
data/README.md CHANGED
@@ -19,9 +19,16 @@ gem install ckmeans
19
19
  ## Usage
20
20
 
21
21
  ```rb
22
- Ckmeans::Clusterer(data, kmin).clusters # fixed cluster count
23
- Ckmeans::Clusterer(data, kmin, kmax).clusters # estimate optimal cluster count within kmin and kmax
24
- Ckmeans::Clusterer(data, kmin, kmax, :sensitive).clusters # Adjust Bayesian Information Criteria favoring more smaller clusters
22
+ # Fixed cluster count
23
+ Ckmeans::Clusterer(data, kmin).clusters
24
+ Ckmedian::Clusterer(data, kmin).clusters
25
+
26
+ # Estimate optimal cluster count within kmin and kmax
27
+ Ckmeans::Clusterer(data, kmin, kmax).clusters
28
+ Ckmedian::Clusterer(data, kmin, kmax).clusters
29
+
30
+ # Adjust the Bayesian Information Criterion to favor more, smaller clusters (Ckmeans only)
31
+ Ckmeans::Clusterer(data, kmin, kmax, :sensitive).clusters
25
32
  ```
26
33
 
27
34
  ## License
@@ -33,6 +33,8 @@ typedef struct VectorI {
33
33
  uint32_t *values;
34
34
  } VectorI;
35
35
 
36
+ typedef LDouble (FnDissim)(uint32_t, uint32_t, VectorF*, VectorF*);
37
+
36
38
  typedef struct State {
37
39
  uint32_t xcount;
38
40
  uint32_t kmin;
@@ -44,6 +46,7 @@ typedef struct State {
44
46
  MatrixI *splits;
45
47
  VectorF *xsum;
46
48
  VectorF *xsumsq;
49
+ FnDissim *dissim;
47
50
  } State;
48
51
 
49
52
  typedef struct RowParams {
@@ -59,6 +62,8 @@ typedef struct {
59
62
  } SegmentStats;
60
63
 
61
64
  VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
65
+ VALUE rb_ckmedian_sorted_group_sizes(VALUE self);
66
+ VALUE rb_sorted_group_sizes(VALUE self, FnDissim*);
62
67
 
63
68
  Arena *arena_create(size_t);
64
69
  void *arena_alloc(Arena*, size_t);
@@ -85,7 +90,8 @@ uint32_t vector_get_i(VectorI*, uint32_t offset);
85
90
  void vector_downsize_i(VectorI*, uint32_t);
86
91
  void vector_inspect_i(VectorI*);
87
92
 
88
- LDouble dissimilarity(uint32_t, uint32_t, VectorF*, VectorF*);
93
+ LDouble dissimilarity_l2(uint32_t, uint32_t, VectorF*, VectorF*);
94
+ LDouble dissimilarity_l1(uint32_t, uint32_t, VectorF*, VectorF*);
89
95
  void fill_row(State, uint32_t, uint32_t, uint32_t);
90
96
  void smawk(State, RowParams, VectorI*);
91
97
  void find_min_from_candidates(State, RowParams, VectorI*);
@@ -95,11 +101,15 @@ SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
95
101
  VectorI *backtrack_sizes(State, VectorI*, uint32_t);
96
102
  uint32_t find_koptimal(State);
97
103
 
104
+
98
105
  void Init_extensions(void) {
99
- VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
100
- VALUE clusterer_class = rb_const_get(ckmeans_module, rb_intern("Clusterer"));
106
+ VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
107
+ VALUE ckmedian_module = rb_const_get(rb_cObject, rb_intern("Ckmedian"));
108
+ VALUE ckmeans_clusterer = rb_const_get(ckmeans_module, rb_intern("Clusterer"));
109
+ VALUE ckmedian_clusterer = rb_const_get(ckmedian_module, rb_intern("Clusterer"));
101
110
 
102
- rb_define_private_method(clusterer_class, "sorted_group_sizes", rb_ckmeans_sorted_group_sizes, 0);
111
+ rb_define_private_method(ckmeans_clusterer, "sorted_group_sizes", rb_ckmeans_sorted_group_sizes, 0);
112
+ rb_define_private_method(ckmedian_clusterer, "sorted_group_sizes", rb_ckmedian_sorted_group_sizes, 0);
103
113
  }
104
114
 
105
115
  # define ARENA_MIN_CAPACITY 100
@@ -107,6 +117,16 @@ void Init_extensions(void) {
107
117
  # define PIx2 (M_PI * 2.0)
108
118
 
109
119
  VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
120
+ {
121
+ return rb_sorted_group_sizes(self, dissimilarity_l2);
122
+ }
123
+
124
+ VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
125
+ {
126
+ return rb_sorted_group_sizes(self, dissimilarity_l1);
127
+ }
128
+
129
+ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
110
130
  {
111
131
  uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
112
132
  uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
@@ -139,7 +159,8 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
139
159
  .cost = cost,
140
160
  .splits = splits,
141
161
  .xsum = xsum,
142
- .xsumsq = xsumsq
162
+ .xsumsq = xsumsq,
163
+ .dissim = criteria
143
164
  };
144
165
 
145
166
 
@@ -157,7 +178,7 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
157
178
 
158
179
  vector_set_f(xsum, i, xsum_prev + diff);
159
180
  vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
160
- matrix_set_f(cost, 0, i, dissimilarity(0, i, xsum, xsumsq));
181
+ matrix_set_f(cost, 0, i, criteria(0, i, xsum, xsumsq));
161
182
  matrix_set_i(splits, 0, i, 0);
162
183
  }
163
184
 
@@ -336,7 +357,7 @@ void smawk(State state, RowParams rparams, VectorI *split_candidates)
336
357
  }
337
358
  }
338
359
 
339
- void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
360
+ inline void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
340
361
  {
341
362
  uint32_t row = rparams.row;
342
363
  uint32_t imin = rparams.imin;
@@ -345,9 +366,10 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
345
366
  uint32_t n = split_candidates->size;
346
367
  uint32_t istepx2 = istep * 2;
347
368
  uint32_t jl = vector_get_i(split_candidates, 0);
348
- VectorF *xsum = state.xsum;
349
- VectorF *xsumsq = state.xsumsq;
350
- MatrixI *splits = state.splits;
369
+ VectorF *const xsum = state.xsum;
370
+ VectorF *const xsumsq = state.xsumsq;
371
+ MatrixI *const splits = state.splits;
372
+ FnDissim *const dissim = state.dissim;
351
373
 
352
374
  for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
353
375
  while (vector_get_i(split_candidates, r) < jl) r++;
@@ -356,7 +378,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
356
378
  uint32_t cost_base_row = row - 1;
357
379
  uint32_t cost_base_col = rcandidate - 1;
358
380
  LDouble cost =
359
- matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissimilarity(rcandidate, i, xsum, xsumsq);
381
+ matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissim(rcandidate, i, xsum, xsumsq);
360
382
 
361
383
  matrix_set_f(state.cost, row, i, cost);
362
384
  matrix_set_i(state.splits, row, i, rcandidate);
@@ -367,7 +389,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
367
389
  : vector_get_i(split_candidates, n - 1);
368
390
 
369
391
  uint32_t jmax = jh < i ? jh : i;
370
- LDouble sjimin = dissimilarity(jmax, i, xsum, xsumsq);
392
+ LDouble sjimin = dissim(jmax, i, xsum, xsumsq);
371
393
 
372
394
  for (++r; r < n && vector_get_i(split_candidates, r) <= jmax; r++) {
373
395
  uint32_t jabs = vector_get_i(split_candidates, r);
@@ -376,7 +398,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
376
398
  if (jabs < matrix_get_i(splits, row - 1, i)) continue;
377
399
 
378
400
  LDouble cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
379
- LDouble sj = cost_base + dissimilarity(jabs, i, xsum, xsumsq);
401
+ LDouble sj = cost_base + dissim(jabs, i, xsum, xsumsq);
380
402
  LDouble cost_prev = matrix_get_f(state.cost, row, i);
381
403
 
382
404
  if (sj <= cost_prev) {
@@ -392,7 +414,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
392
414
  }
393
415
  }
394
416
 
395
- void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
417
+ inline void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
396
418
  {
397
419
  const uint32_t row = rparams.row;
398
420
  const uint32_t imin = rparams.imin;
@@ -400,6 +422,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
400
422
  const uint32_t istep = rparams.istep;
401
423
  MatrixF *const cost = state.cost;
402
424
  MatrixI *const splits = state.splits;
425
+ FnDissim *const dissim = state.dissim;
403
426
 
404
427
  uint32_t optimal_split_idx_prev = 0;
405
428
 
@@ -408,7 +431,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
408
431
  const uint32_t optimal_split_idx = optimal_split_idx_prev;
409
432
  const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
410
433
  const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
411
- const LDouble added_cost = dissimilarity(optimal_split, i, state.xsum, state.xsumsq);
434
+ const LDouble added_cost = dissim(optimal_split, i, state.xsum, state.xsumsq);
412
435
 
413
436
  matrix_set_f(cost, row, i, cost_prev + added_cost);
414
437
  matrix_set_i(splits, row, i, optimal_split);
@@ -421,7 +444,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
421
444
  if (split > i) break;
422
445
 
423
446
  LDouble split_cost =
424
- matrix_get_f(cost, row - 1, split - 1) + dissimilarity(split, i, state.xsum, state.xsumsq);
447
+ matrix_get_f(cost, row - 1, split - 1) + dissim(split, i, state.xsum, state.xsumsq);
425
448
 
426
449
  if (split_cost > matrix_get_f(cost, row, i)) continue;
427
450
 
@@ -432,7 +455,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
432
455
  }
433
456
  }
434
457
 
435
- VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
458
+ inline VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
436
459
  {
437
460
  uint32_t imin = rparams.imin;
438
461
  uint32_t row = rparams.row;
@@ -445,6 +468,7 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
445
468
  uint32_t left = 0;
446
469
  uint32_t right = 0;
447
470
  VectorI *pruned = vector_dup_i(split_candidates, state.arena);
471
+ FnDissim *const dissim = state.dissim;
448
472
 
449
473
  while (m > n)
450
474
  {
@@ -452,9 +476,9 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
452
476
  uint32_t j = vector_get_i(pruned, right);
453
477
  uint32_t jnext = vector_get_i(pruned, right + 1);
454
478
  LDouble sl =
455
- matrix_get_f(state.cost, row - 1, j - 1) + dissimilarity(j, i, state.xsum, state.xsumsq);
479
+ matrix_get_f(state.cost, row - 1, j - 1) + dissim(j, i, state.xsum, state.xsumsq);
456
480
  LDouble snext =
457
- matrix_get_f(state.cost, row - 1, jnext - 1) + dissimilarity(jnext, i, state.xsum, state.xsumsq);
481
+ matrix_get_f(state.cost, row - 1, jnext - 1) + dissim(jnext, i, state.xsum, state.xsumsq);
458
482
 
459
483
  if ((sl < snext) && (left < n - 1)) {
460
484
  vector_set_i(pruned, left, j);
@@ -484,7 +508,8 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
484
508
  return pruned;
485
509
  }
486
510
 
487
- inline LDouble dissimilarity(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq) {
511
+ /* L2 aka Euclidean aka Mean dissimilarity criterion */
512
+ inline LDouble dissimilarity_l2(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq) {
488
513
  LDouble sji = 0.0;
489
514
 
490
515
  if (j >= i) return sji;
@@ -501,6 +526,48 @@ inline LDouble dissimilarity(uint32_t j, uint32_t i, VectorF *restrict xsum, Vec
501
526
  return (sji > 0) ? sji : 0.0;
502
527
  }
503
528
 
529
+ /* L1 aka Manhattan aka Median dissimilarity criterion */
530
+ inline LDouble dissimilarity_l1(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict _xsumsq)
531
+ {
532
+ LDouble sji = 0.0;
533
+
534
+ if (j >= i) return sji;
535
+
536
+ if (j > 0) {
537
+ uint32_t median_idx = (i + j) >> 1;
538
+
539
+ if (((i - j + 1) % 2) == 1) {
540
+ sji =
541
+ - vector_get_f(xsum, median_idx - 1)
542
+ + vector_get_f(xsum, j - 1)
543
+ + vector_get_f(xsum, i)
544
+ - vector_get_f(xsum, median_idx);
545
+ } else {
546
+ sji =
547
+ - vector_get_f(xsum, median_idx)
548
+ + vector_get_f(xsum, j - 1)
549
+ + vector_get_f(xsum, i)
550
+ - vector_get_f(xsum, median_idx);
551
+ }
552
+ } else { // j == 0
553
+ uint32_t median_idx = i >> 1;
554
+
555
+ if (((i + 1) % 2) == 1) {
556
+ sji =
557
+ - vector_get_f(xsum, median_idx - 1)
558
+ + vector_get_f(xsum, i)
559
+ - vector_get_f(xsum, median_idx);
560
+ } else {
561
+ sji =
562
+ - vector_get_f(xsum, median_idx)
563
+ + vector_get_f(xsum, i)
564
+ - vector_get_f(xsum, median_idx);
565
+ }
566
+ }
567
+
568
+ return (sji < 0) ? 0.0 : sji;
569
+ }
570
+
504
571
  inline VectorF *vector_create_f(Arena *arena, uint32_t size) {
505
572
  VectorF *v;
506
573
 
@@ -28,5 +28,3 @@ module Ckmeans
28
28
  end
29
29
  end
30
30
  end
31
-
32
- require "ckmeans/extensions"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- VERSION = "1.0.4"
4
+ VERSION = "1.1.0"
5
5
  end
data/lib/ckmeans.rb CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  require_relative "ckmeans/version"
4
4
  require_relative "ckmeans/clusterer"
5
+ require_relative "ckmedian/clusterer"
6
+ require "ckmeans/extensions"
5
7
 
6
8
  module Ckmeans
7
9
  class Error < StandardError; end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ckmedian
4
+ class Clusterer # rubocop:disable Style/Documentation
5
+ def initialize(entries, kmin, kmax = kmin)
6
+ @xcount = entries.size
7
+
8
+ raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
9
+ raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
10
+
11
+ @kmin = kmin
12
+ @unique_xcount = entries.uniq.size
13
+ @kmax = [@unique_xcount, kmax].min
14
+ @xsorted_original = entries.sort
15
+ @xsorted = @xsorted_original.map(&:to_f)
16
+ end
17
+
18
+ def clusters
19
+ @clusters ||=
20
+ if @unique_xcount <= 1
21
+ [@xsorted_original]
22
+ else
23
+ sorted_group_sizes.each_with_object([]) do |size, groups|
24
+ groups << @xsorted_original.shift(size)
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
metadata CHANGED
@@ -1,13 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ckmeans
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Lebedev
8
+ autorequire:
8
9
  bindir: exe
9
10
  cert_chain: []
10
- date: 2025-05-01 00:00:00.000000000 Z
11
+ date: 2025-05-23 00:00:00.000000000 Z
11
12
  dependencies: []
12
13
  description: Repeatable clustering of unidimensional data
13
14
  email:
@@ -32,6 +33,7 @@ files:
32
33
  - lib/ckmeans.rb
33
34
  - lib/ckmeans/clusterer.rb
34
35
  - lib/ckmeans/version.rb
36
+ - lib/ckmedian/clusterer.rb
35
37
  - sig/ckmeans.rbs
36
38
  homepage: https://github.com/vlebedeff/rb-ckmeans
37
39
  licenses:
@@ -41,6 +43,7 @@ metadata:
41
43
  homepage_uri: https://github.com/vlebedeff/rb-ckmeans
42
44
  source_code_uri: https://github.com/vlebedeff/rb-ckmeans
43
45
  changelog_uri: https://github.com/vlebedeff/rb-ckmeans/blob/main/CHANGELOG.md
46
+ post_install_message:
44
47
  rdoc_options: []
45
48
  require_paths:
46
49
  - lib
@@ -55,7 +58,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
55
58
  - !ruby/object:Gem::Version
56
59
  version: '0'
57
60
  requirements: []
58
- rubygems_version: 3.6.5
61
+ rubygems_version: 3.4.19
62
+ signing_key:
59
63
  specification_version: 4
60
64
  summary: Ruby implementation of Ckmeans.1d.dp
61
65
  test_files: []