ckmeans 1.0.3 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5ad7e8c24dd367d5e6a6dd66abc529ae92079cf99d1c781a7646c929547b0e62
4
- data.tar.gz: 2e338ca878eba2d250ca61fff2ea8bee44ec8387b37e12b31600edf9da2b7130
3
+ metadata.gz: 508f78311a643e1fa8e693e4abf1cdf6df4eb06ff09756fa534ff4a514d0f34f
4
+ data.tar.gz: 4ef313387c2e45df4a8afde58e429093023a555a32f4af395a8b79c048a9d98d
5
5
  SHA512:
6
- metadata.gz: 8c59e1e159cc9cada8afed9e016a5d8956cfe909bb7b7d82c8d155f388fdf1924a49072d37e52065fa643a539da3a192767eddb38da95b2c2524bcc7d0a39ebd
7
- data.tar.gz: f2b535377d441bc1f2ee309a5466c8231b425aa0dd9b0512aa36257defa12b3b645694ae953b2b5e3b6997c50bde796e8fa1c2f8f10d4055b1cc9cb6abcf1353
6
+ metadata.gz: ae0f1aff4bd6a78da04123d3728234012d0692ec22396b9529b245c8fa473343314508f053ee02ac876131b243704316948f476840d3c495d4e72eba68e095fd
7
+ data.tar.gz: ab95cfdacac4d9204887d4d5c5a7b85aafa3c869ec4b7a851ae994d8f15ddf096e99cb2d1691e6120a7a6bbe51cbde0524c93ad09b5811b7561c43c60a49256c
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 3.2.2
1
+ 3.2.8
data/CHANGELOG.md CHANGED
@@ -1,9 +1,22 @@
1
1
  ## [Unreleased]
2
2
 
3
- ## [1.0.1] - 2025-04-24
3
+ ## [1.0.4] - 2025-05-01
4
4
 
5
- - https://github.com/vlebedeff/rb-ckmeans/pull/9
6
- - https://github.com/vlebedeff/rb-ckmeans/pull/8
5
+ - Simpler capacity size expression ([#14](https://github.com/vlebedeff/rb-ckmeans/pull/14))
6
+
7
+ ## [1.0.3] - 2025-05-01
8
+
9
+ - More frugal memory allocation ([#11](https://github.com/vlebedeff/rb-ckmeans/pull/11))
10
+ - Use `rb_iv_get` for brevity
11
+ - Various optimizations ([#10](https://github.com/vlebedeff/rb-ckmeans/pull/10))
12
+ - Extract `LDouble` type definition
13
+ - Remove `ruby-prof` gem
14
+ - Rename `nvalues` to `size`
15
+
16
+ ## [1.0.2] - 2025-04-24
17
+
18
+ - Polish & Housekeeping ([#9](https://github.com/vlebedeff/rb-ckmeans/pull/9))
19
+ - Fix int variable sign ([#8](https://github.com/vlebedeff/rb-ckmeans/pull/8))
7
20
 
8
21
  ## [1.0.0] - 2025-04-22
9
22
 
data/README.md CHANGED
@@ -19,9 +19,16 @@ gem install ckmeans
19
19
  ## Usage
20
20
 
21
21
  ```rb
22
- Ckmeans::Clusterer(data, kmin).clusters # fixed cluster count
23
- Ckmeans::Clusterer(data, kmin, kmax).clusters # estimate optimal cluster count within kmin and kmax
24
- Ckmeans::Clusterer(data, kmin, kmax, :sensitive).clusters # Adjust Bayesian Information Criteria favoring more smaller clusters
22
+ # Fixed cluster count
23
+ Ckmeans::Clusterer(data, kmin).clusters
24
+ Ckmedian::Clusterer(data, kmin).clusters
25
+
26
+ # Estimate optimal cluster count within kmin and kmax
27
+ Ckmeans::Clusterer(data, kmin, kmax).clusters
28
+ Ckmedian::Clusterer(data, kmin, kmax).clusters
29
+
30
+ # Adjust the Bayesian Information Criterion to favor more, smaller clusters (Ckmeans only)
31
+ Ckmeans::Clusterer(data, kmin, kmax, :sensitive).clusters
25
32
  ```
26
33
 
27
34
  ## License
@@ -33,6 +33,8 @@ typedef struct VectorI {
33
33
  uint32_t *values;
34
34
  } VectorI;
35
35
 
36
+ typedef LDouble (FnDissim)(uint32_t, uint32_t, VectorF*, VectorF*);
37
+
36
38
  typedef struct State {
37
39
  uint32_t xcount;
38
40
  uint32_t kmin;
@@ -44,6 +46,7 @@ typedef struct State {
44
46
  MatrixI *splits;
45
47
  VectorF *xsum;
46
48
  VectorF *xsumsq;
49
+ FnDissim *dissim;
47
50
  } State;
48
51
 
49
52
  typedef struct RowParams {
@@ -59,6 +62,8 @@ typedef struct {
59
62
  } SegmentStats;
60
63
 
61
64
  VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
65
+ VALUE rb_ckmedian_sorted_group_sizes(VALUE self);
66
+ VALUE rb_sorted_group_sizes(VALUE self, FnDissim*);
62
67
 
63
68
  Arena *arena_create(size_t);
64
69
  void *arena_alloc(Arena*, size_t);
@@ -85,7 +90,8 @@ uint32_t vector_get_i(VectorI*, uint32_t offset);
85
90
  void vector_downsize_i(VectorI*, uint32_t);
86
91
  void vector_inspect_i(VectorI*);
87
92
 
88
- LDouble dissimilarity(uint32_t, uint32_t, VectorF*, VectorF*);
93
+ LDouble dissimilarity_l2(uint32_t, uint32_t, VectorF*, VectorF*);
94
+ LDouble dissimilarity_l1(uint32_t, uint32_t, VectorF*, VectorF*);
89
95
  void fill_row(State, uint32_t, uint32_t, uint32_t);
90
96
  void smawk(State, RowParams, VectorI*);
91
97
  void find_min_from_candidates(State, RowParams, VectorI*);
@@ -95,30 +101,40 @@ SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
95
101
  VectorI *backtrack_sizes(State, VectorI*, uint32_t);
96
102
  uint32_t find_koptimal(State);
97
103
 
104
+
98
105
  void Init_extensions(void) {
99
- VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
100
- VALUE clusterer_class = rb_const_get(ckmeans_module, rb_intern("Clusterer"));
106
+ VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
107
+ VALUE ckmedian_module = rb_const_get(rb_cObject, rb_intern("Ckmedian"));
108
+ VALUE ckmeans_clusterer = rb_const_get(ckmeans_module, rb_intern("Clusterer"));
109
+ VALUE ckmedian_clusterer = rb_const_get(ckmedian_module, rb_intern("Clusterer"));
101
110
 
102
- rb_define_private_method(clusterer_class, "sorted_group_sizes", rb_ckmeans_sorted_group_sizes, 0);
111
+ rb_define_private_method(ckmeans_clusterer, "sorted_group_sizes", rb_ckmeans_sorted_group_sizes, 0);
112
+ rb_define_private_method(ckmedian_clusterer, "sorted_group_sizes", rb_ckmedian_sorted_group_sizes, 0);
103
113
  }
104
114
 
105
115
  # define ARENA_MIN_CAPACITY 100
116
+ # define ALLOCATION_FACTOR 3
106
117
  # define PIx2 (M_PI * 2.0)
107
118
 
108
119
  VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
120
+ {
121
+ return rb_sorted_group_sizes(self, dissimilarity_l2);
122
+ }
123
+
124
+ VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
125
+ {
126
+ return rb_sorted_group_sizes(self, dissimilarity_l1);
127
+ }
128
+
129
+ VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
109
130
  {
110
131
  uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
111
132
  uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
112
133
  uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
113
134
  bool apply_deviation = RTEST(rb_iv_get(self, "@apply_bic_deviation"));
114
135
  VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
115
-
116
- Arena *arena =
117
- arena_create(
118
- sizeof(LDouble) * xcount * (kmax + 4) +
119
- sizeof(uint32_t) * xcount * kmax * 5 +
120
- ARENA_MIN_CAPACITY
121
- );
136
+ size_t capacity = sizeof(LDouble) * (xcount + 1) * (kmax + 1) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
137
+ Arena *arena = arena_create(capacity);
122
138
 
123
139
  if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
124
140
 
@@ -143,7 +159,8 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
143
159
  .cost = cost,
144
160
  .splits = splits,
145
161
  .xsum = xsum,
146
- .xsumsq = xsumsq
162
+ .xsumsq = xsumsq,
163
+ .dissim = criteria
147
164
  };
148
165
 
149
166
 
@@ -161,7 +178,7 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
161
178
 
162
179
  vector_set_f(xsum, i, xsum_prev + diff);
163
180
  vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
164
- matrix_set_f(cost, 0, i, dissimilarity(0, i, xsum, xsumsq));
181
+ matrix_set_f(cost, 0, i, criteria(0, i, xsum, xsumsq));
165
182
  matrix_set_i(splits, 0, i, 0);
166
183
  }
167
184
 
@@ -340,7 +357,7 @@ void smawk(State state, RowParams rparams, VectorI *split_candidates)
340
357
  }
341
358
  }
342
359
 
343
- void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
360
+ inline void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
344
361
  {
345
362
  uint32_t row = rparams.row;
346
363
  uint32_t imin = rparams.imin;
@@ -349,9 +366,10 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
349
366
  uint32_t n = split_candidates->size;
350
367
  uint32_t istepx2 = istep * 2;
351
368
  uint32_t jl = vector_get_i(split_candidates, 0);
352
- VectorF *xsum = state.xsum;
353
- VectorF *xsumsq = state.xsumsq;
354
- MatrixI *splits = state.splits;
369
+ VectorF *const xsum = state.xsum;
370
+ VectorF *const xsumsq = state.xsumsq;
371
+ MatrixI *const splits = state.splits;
372
+ FnDissim *const dissim = state.dissim;
355
373
 
356
374
  for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
357
375
  while (vector_get_i(split_candidates, r) < jl) r++;
@@ -360,7 +378,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
360
378
  uint32_t cost_base_row = row - 1;
361
379
  uint32_t cost_base_col = rcandidate - 1;
362
380
  LDouble cost =
363
- matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissimilarity(rcandidate, i, xsum, xsumsq);
381
+ matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissim(rcandidate, i, xsum, xsumsq);
364
382
 
365
383
  matrix_set_f(state.cost, row, i, cost);
366
384
  matrix_set_i(state.splits, row, i, rcandidate);
@@ -371,7 +389,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
371
389
  : vector_get_i(split_candidates, n - 1);
372
390
 
373
391
  uint32_t jmax = jh < i ? jh : i;
374
- LDouble sjimin = dissimilarity(jmax, i, xsum, xsumsq);
392
+ LDouble sjimin = dissim(jmax, i, xsum, xsumsq);
375
393
 
376
394
  for (++r; r < n && vector_get_i(split_candidates, r) <= jmax; r++) {
377
395
  uint32_t jabs = vector_get_i(split_candidates, r);
@@ -380,7 +398,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
380
398
  if (jabs < matrix_get_i(splits, row - 1, i)) continue;
381
399
 
382
400
  LDouble cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
383
- LDouble sj = cost_base + dissimilarity(jabs, i, xsum, xsumsq);
401
+ LDouble sj = cost_base + dissim(jabs, i, xsum, xsumsq);
384
402
  LDouble cost_prev = matrix_get_f(state.cost, row, i);
385
403
 
386
404
  if (sj <= cost_prev) {
@@ -396,7 +414,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
396
414
  }
397
415
  }
398
416
 
399
- void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
417
+ inline void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
400
418
  {
401
419
  const uint32_t row = rparams.row;
402
420
  const uint32_t imin = rparams.imin;
@@ -404,6 +422,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
404
422
  const uint32_t istep = rparams.istep;
405
423
  MatrixF *const cost = state.cost;
406
424
  MatrixI *const splits = state.splits;
425
+ FnDissim *const dissim = state.dissim;
407
426
 
408
427
  uint32_t optimal_split_idx_prev = 0;
409
428
 
@@ -412,7 +431,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
412
431
  const uint32_t optimal_split_idx = optimal_split_idx_prev;
413
432
  const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
414
433
  const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
415
- const LDouble added_cost = dissimilarity(optimal_split, i, state.xsum, state.xsumsq);
434
+ const LDouble added_cost = dissim(optimal_split, i, state.xsum, state.xsumsq);
416
435
 
417
436
  matrix_set_f(cost, row, i, cost_prev + added_cost);
418
437
  matrix_set_i(splits, row, i, optimal_split);
@@ -425,7 +444,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
425
444
  if (split > i) break;
426
445
 
427
446
  LDouble split_cost =
428
- matrix_get_f(cost, row - 1, split - 1) + dissimilarity(split, i, state.xsum, state.xsumsq);
447
+ matrix_get_f(cost, row - 1, split - 1) + dissim(split, i, state.xsum, state.xsumsq);
429
448
 
430
449
  if (split_cost > matrix_get_f(cost, row, i)) continue;
431
450
 
@@ -436,7 +455,7 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
436
455
  }
437
456
  }
438
457
 
439
- VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
458
+ inline VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
440
459
  {
441
460
  uint32_t imin = rparams.imin;
442
461
  uint32_t row = rparams.row;
@@ -449,6 +468,7 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
449
468
  uint32_t left = 0;
450
469
  uint32_t right = 0;
451
470
  VectorI *pruned = vector_dup_i(split_candidates, state.arena);
471
+ FnDissim *const dissim = state.dissim;
452
472
 
453
473
  while (m > n)
454
474
  {
@@ -456,9 +476,9 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
456
476
  uint32_t j = vector_get_i(pruned, right);
457
477
  uint32_t jnext = vector_get_i(pruned, right + 1);
458
478
  LDouble sl =
459
- matrix_get_f(state.cost, row - 1, j - 1) + dissimilarity(j, i, state.xsum, state.xsumsq);
479
+ matrix_get_f(state.cost, row - 1, j - 1) + dissim(j, i, state.xsum, state.xsumsq);
460
480
  LDouble snext =
461
- matrix_get_f(state.cost, row - 1, jnext - 1) + dissimilarity(jnext, i, state.xsum, state.xsumsq);
481
+ matrix_get_f(state.cost, row - 1, jnext - 1) + dissim(jnext, i, state.xsum, state.xsumsq);
462
482
 
463
483
  if ((sl < snext) && (left < n - 1)) {
464
484
  vector_set_i(pruned, left, j);
@@ -488,7 +508,8 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
488
508
  return pruned;
489
509
  }
490
510
 
491
- inline LDouble dissimilarity(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq) {
511
+ /* L2 (aka Euclidean, aka mean) dissimilarity criterion */
512
+ inline LDouble dissimilarity_l2(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq) {
492
513
  LDouble sji = 0.0;
493
514
 
494
515
  if (j >= i) return sji;
@@ -505,6 +526,48 @@ inline LDouble dissimilarity(uint32_t j, uint32_t i, VectorF *restrict xsum, Vec
505
526
  return (sji > 0) ? sji : 0.0;
506
527
  }
507
528
 
529
+ /* L1 (aka Manhattan, aka median) dissimilarity criterion; reads only xsum, the _xsumsq argument is unused */
530
+ inline LDouble dissimilarity_l1(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict _xsumsq)
531
+ {
532
+ LDouble sji = 0.0;
533
+
534
+ if (j >= i) return sji;
535
+
536
+ if (j > 0) {
537
+ uint32_t median_idx = (i + j) >> 1;
538
+
539
+ if (((i - j + 1) % 2) == 1) {
540
+ sji =
541
+ - vector_get_f(xsum, median_idx - 1)
542
+ + vector_get_f(xsum, j - 1)
543
+ + vector_get_f(xsum, i)
544
+ - vector_get_f(xsum, median_idx);
545
+ } else {
546
+ sji =
547
+ - vector_get_f(xsum, median_idx)
548
+ + vector_get_f(xsum, j - 1)
549
+ + vector_get_f(xsum, i)
550
+ - vector_get_f(xsum, median_idx);
551
+ }
552
+ } else { // j == 0
553
+ uint32_t median_idx = i >> 1;
554
+
555
+ if (((i + 1) % 2) == 1) {
556
+ sji =
557
+ - vector_get_f(xsum, median_idx - 1)
558
+ + vector_get_f(xsum, i)
559
+ - vector_get_f(xsum, median_idx);
560
+ } else {
561
+ sji =
562
+ - vector_get_f(xsum, median_idx)
563
+ + vector_get_f(xsum, i)
564
+ - vector_get_f(xsum, median_idx);
565
+ }
566
+ }
567
+
568
+ return (sji < 0) ? 0.0 : sji;
569
+ }
570
+
508
571
  inline VectorF *vector_create_f(Arena *arena, uint32_t size) {
509
572
  VectorF *v;
510
573
 
@@ -28,5 +28,3 @@ module Ckmeans
28
28
  end
29
29
  end
30
30
  end
31
-
32
- require "ckmeans/extensions"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- VERSION = "1.0.3"
4
+ VERSION = "1.1.0"
5
5
  end
data/lib/ckmeans.rb CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  require_relative "ckmeans/version"
4
4
  require_relative "ckmeans/clusterer"
5
+ require_relative "ckmedian/clusterer"
6
+ require "ckmeans/extensions"
5
7
 
6
8
  module Ckmeans
7
9
  class Error < StandardError; end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ckmedian
4
+ class Clusterer # rubocop:disable Style/Documentation
5
+ def initialize(entries, kmin, kmax = kmin)
6
+ @xcount = entries.size
7
+
8
+ raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
9
+ raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
10
+
11
+ @kmin = kmin
12
+ @unique_xcount = entries.uniq.size
13
+ @kmax = [@unique_xcount, kmax].min
14
+ @xsorted_original = entries.sort
15
+ @xsorted = @xsorted_original.map(&:to_f)
16
+ end
17
+
18
+ def clusters
19
+ @clusters ||=
20
+ if @unique_xcount <= 1
21
+ [@xsorted_original]
22
+ else
23
+ sorted_group_sizes.each_with_object([]) do |size, groups|
24
+ groups << @xsorted_original.shift(size)
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
metadata CHANGED
@@ -1,13 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ckmeans
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Lebedev
8
+ autorequire:
8
9
  bindir: exe
9
10
  cert_chain: []
10
- date: 2025-05-01 00:00:00.000000000 Z
11
+ date: 2025-05-23 00:00:00.000000000 Z
11
12
  dependencies: []
12
13
  description: Repeatable clustering of unidimensional data
13
14
  email:
@@ -32,6 +33,7 @@ files:
32
33
  - lib/ckmeans.rb
33
34
  - lib/ckmeans/clusterer.rb
34
35
  - lib/ckmeans/version.rb
36
+ - lib/ckmedian/clusterer.rb
35
37
  - sig/ckmeans.rbs
36
38
  homepage: https://github.com/vlebedeff/rb-ckmeans
37
39
  licenses:
@@ -41,6 +43,7 @@ metadata:
41
43
  homepage_uri: https://github.com/vlebedeff/rb-ckmeans
42
44
  source_code_uri: https://github.com/vlebedeff/rb-ckmeans
43
45
  changelog_uri: https://github.com/vlebedeff/rb-ckmeans/blob/main/CHANGELOG.md
46
+ post_install_message:
44
47
  rdoc_options: []
45
48
  require_paths:
46
49
  - lib
@@ -55,7 +58,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
55
58
  - !ruby/object:Gem::Version
56
59
  version: '0'
57
60
  requirements: []
58
- rubygems_version: 3.6.5
61
+ rubygems_version: 3.4.19
62
+ signing_key:
59
63
  specification_version: 4
60
64
  summary: Ruby implementation of Ckmeans.1d.dp
61
65
  test_files: []