ckmeans 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3646d5dfdbc85e6168c9b315a52d7b345432f1b964ab96f1dd18bd04c62f867c
4
- data.tar.gz: 562d6aaeff0d81b1ff804886870ed09df1d0115867f9127b8a9b8283be638e11
3
+ metadata.gz: 5ad7e8c24dd367d5e6a6dd66abc529ae92079cf99d1c781a7646c929547b0e62
4
+ data.tar.gz: 2e338ca878eba2d250ca61fff2ea8bee44ec8387b37e12b31600edf9da2b7130
5
5
  SHA512:
6
- metadata.gz: bca50713fcd779e6d2fb7bd37b44cd7ed16a4879736139f531a37f99666f4c797bae5b27b574e26fb6fffdd616ded97a397b29ac1d6e8e268e63eae073051de1
7
- data.tar.gz: 167e1865fb1707b054a13f0dcabfcd8bde2ed19811886fb312a54d129b136a8c4663b961e220097fbede697f02a58c1e72de3ec0640c33aadba6b59e404c0acb
6
+ metadata.gz: 8c59e1e159cc9cada8afed9e016a5d8956cfe909bb7b7d82c8d155f388fdf1924a49072d37e52065fa643a539da3a192767eddb38da95b2c2524bcc7d0a39ebd
7
+ data.tar.gz: f2b535377d441bc1f2ee309a5466c8231b425aa0dd9b0512aa36257defa12b3b645694ae953b2b5e3b6997c50bde796e8fa1c2f8f10d4055b1cc9cb6abcf1353
data/.dockerignore ADDED
@@ -0,0 +1,13 @@
1
+ tmp
2
+
3
+ # Ignore compiled extension files
4
+ *.bundle
5
+ *.so
6
+ *.o
7
+ *.dll
8
+
9
+ # Ignore generated makefiles and compilation artifacts
10
+ ext/**/Makefile
11
+ ext/**/mkmf.log
12
+ ext/**/*.log
13
+ ext/**/tmp/
data/.rubocop_todo.yml CHANGED
@@ -1,49 +1,13 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2025-04-17 07:09:28 UTC using RuboCop version 1.75.1.
3
+ # on 2025-04-24 06:16:37 UTC using RuboCop version 1.75.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
- # Offense count: 7
10
- # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes.
11
- Metrics/AbcSize:
12
- Max: 95
13
-
14
- # Offense count: 2
9
+ # Offense count: 1
15
10
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
16
11
  # AllowedMethods: refine
17
12
  Metrics/BlockLength:
18
- Max: 41
19
-
20
- # Offense count: 3
21
- # Configuration parameters: AllowedMethods, AllowedPatterns.
22
- Metrics/CyclomaticComplexity:
23
- Max: 10
24
-
25
- # Offense count: 6
26
- # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
27
- Metrics/MethodLength:
28
- Max: 48
29
-
30
- # Offense count: 3
31
- # Configuration parameters: AllowedMethods, AllowedPatterns.
32
- Metrics/PerceivedComplexity:
33
- Max: 13
34
-
35
- # Offense count: 12
36
- # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
37
- # AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
38
- Naming/MethodParameterName:
39
- Exclude:
40
- - 'lib/ckmeans/clusterer.rb'
41
-
42
- # Offense count: 5
43
- # This cop supports unsafe autocorrection (--autocorrect-all).
44
- # Configuration parameters: EnforcedStyle, AllowedMethods, AllowedPatterns.
45
- # SupportedStyles: predicate, comparison
46
- Style/NumericPredicate:
47
- Exclude:
48
- - 'spec/**/*'
49
- - 'lib/ckmeans/clusterer.rb'
13
+ Max: 26
data/CHANGELOG.md CHANGED
@@ -1,5 +1,14 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [1.0.1] - 2025-04-24
4
+
5
+ - https://github.com/vlebedeff/rb-ckmeans/pull/9
6
+ - https://github.com/vlebedeff/rb-ckmeans/pull/8
7
+
8
+ ## [1.0.0] - 2025-04-22
9
+
10
+ - https://github.com/vlebedeff/rb-ckmeans/pull/6
11
+
3
12
  ## [0.1.2] - 2025-03-31
4
13
 
5
14
  - https://github.com/vlebedeff/rb-ckmeans/pull/3
data/Dockerfile ADDED
@@ -0,0 +1,11 @@
1
+ FROM public.ecr.aws/docker/library/ruby:3.2.2
2
+
3
+ RUN apt-get update && apt-get install -y build-essential ruby-dev
4
+
5
+ RUN gem install bundler -v 2.6.5
6
+
7
+ WORKDIR /opt/rb-ckmeans
8
+ COPY . .
9
+ RUN bundle install -j 12
10
+
11
+ ENTRYPOINT ["bundle", "exec"]
@@ -1,18 +1,20 @@
1
1
  #include <stdio.h>
2
- #include <assert.h>
3
2
  #include <math.h>
3
+ #include <string.h>
4
4
  #include "ruby.h"
5
5
 
6
+ typedef long double LDouble;
7
+
6
8
  typedef struct Arena {
7
- uint32_t capacity;
8
- uint32_t offset;
9
+ size_t capacity;
10
+ size_t offset;
9
11
  uint8_t *buffer;
10
12
  } Arena;
11
13
 
12
14
  typedef struct MatrixF {
13
15
  uint32_t ncols;
14
16
  uint32_t nrows;
15
- long double *values;
17
+ LDouble *values;
16
18
  } MatrixF;
17
19
 
18
20
  typedef struct MatrixI {
@@ -22,12 +24,12 @@ typedef struct MatrixI {
22
24
  } MatrixI;
23
25
 
24
26
  typedef struct VectorF {
25
- uint32_t nvalues;
26
- long double *values;
27
+ uint32_t size;
28
+ LDouble *values;
27
29
  } VectorF;
28
30
 
29
31
  typedef struct VectorI {
30
- uint32_t nvalues;
32
+ uint32_t size;
31
33
  uint32_t *values;
32
34
  } VectorI;
33
35
 
@@ -52,84 +54,82 @@ typedef struct RowParams {
52
54
  } RowParams;
53
55
 
54
56
  typedef struct {
55
- long double mean;
56
- long double variance;
57
+ LDouble mean;
58
+ LDouble variance;
57
59
  } SegmentStats;
58
60
 
59
- VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
60
-
61
- Arena *arena_create(uint32_t);
62
- void *arena_alloc(Arena*, uint32_t);
63
- void arena_rewind(Arena*);
64
- void arena_destroy(Arena*);
65
-
66
- MatrixF *matrix_create_f(Arena*, uint32_t, uint32_t);
67
- MatrixI *matrix_create_i(Arena*, uint32_t, uint32_t);
68
- void matrix_set_f(MatrixF*, uint32_t, uint32_t, long double value);
69
- long double matrix_get_f(MatrixF*, uint32_t, uint32_t);
70
- void matrix_inspect_f(MatrixF*);
71
- void matrix_set_i(MatrixI*, uint32_t, uint32_t, uint32_t value);
72
- uint32_t matrix_get_i(MatrixI*, uint32_t, uint32_t);
73
- void matrix_inspect_i(MatrixI*);
74
-
75
- VectorF *vector_create_f(Arena*, uint32_t);
76
- void vector_set_f(VectorF*, uint32_t offset, long double value);
77
- long double vector_get_f(VectorF*, uint32_t offset);
78
- long double vector_get_diff_f(VectorF*, uint32_t, uint32_t);
79
- void vector_inspect_f(VectorF*);
80
- VectorI *vector_create_i(Arena*, uint32_t);
81
- VectorI *vector_dup_i(VectorI*, Arena*);
82
- void vector_set_i(VectorI*, uint32_t offset, uint32_t value);
83
- uint32_t vector_get_i(VectorI*, uint32_t offset);
84
- void vector_downsize_i(VectorI*, uint32_t);
85
- void vector_inspect_i(VectorI*);
86
-
87
- long double dissimilarity(uint32_t, uint32_t, VectorF*, VectorF*);
61
+ VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
62
+
63
+ Arena *arena_create(size_t);
64
+ void *arena_alloc(Arena*, size_t);
65
+ void arena_destroy(Arena*);
66
+
67
+ MatrixF *matrix_create_f(Arena*, uint32_t, uint32_t);
68
+ MatrixI *matrix_create_i(Arena*, uint32_t, uint32_t);
69
+ void matrix_set_f(MatrixF*, uint32_t, uint32_t, LDouble value);
70
+ LDouble matrix_get_f(MatrixF*, uint32_t, uint32_t);
71
+ void matrix_inspect_f(MatrixF*);
72
+ void matrix_set_i(MatrixI*, uint32_t, uint32_t, uint32_t value);
73
+ uint32_t matrix_get_i(MatrixI*, uint32_t, uint32_t);
74
+ void matrix_inspect_i(MatrixI*);
75
+
76
+ VectorF *vector_create_f(Arena*, uint32_t);
77
+ void vector_set_f(VectorF*, uint32_t offset, LDouble value);
78
+ LDouble vector_get_f(VectorF*, uint32_t offset);
79
+ LDouble vector_get_diff_f(VectorF*, uint32_t, uint32_t);
80
+ void vector_inspect_f(VectorF*);
81
+ VectorI *vector_create_i(Arena*, uint32_t);
82
+ VectorI *vector_dup_i(VectorI*, Arena*);
83
+ void vector_set_i(VectorI*, uint32_t offset, uint32_t value);
84
+ uint32_t vector_get_i(VectorI*, uint32_t offset);
85
+ void vector_downsize_i(VectorI*, uint32_t);
86
+ void vector_inspect_i(VectorI*);
87
+
88
+ LDouble dissimilarity(uint32_t, uint32_t, VectorF*, VectorF*);
88
89
  void fill_row(State, uint32_t, uint32_t, uint32_t);
89
90
  void smawk(State, RowParams, VectorI*);
90
91
  void find_min_from_candidates(State, RowParams, VectorI*);
91
- VectorI *prune_candidates(State, RowParams, VectorI*);
92
+ VectorI *prune_candidates(State, RowParams, VectorI*);
92
93
  void fill_even_positions(State, RowParams, VectorI*);
93
94
  SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
94
- VectorI *backtrack_sizes(State, uint32_t);
95
+ VectorI *backtrack_sizes(State, VectorI*, uint32_t);
95
96
  uint32_t find_koptimal(State);
96
97
 
97
98
  void Init_extensions(void) {
98
- VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
99
+ VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
99
100
  VALUE clusterer_class = rb_const_get(ckmeans_module, rb_intern("Clusterer"));
100
101
 
101
102
  rb_define_private_method(clusterer_class, "sorted_group_sizes", rb_ckmeans_sorted_group_sizes, 0);
102
103
  }
103
104
 
104
- # define ARENA_MIN_CAPACITY 1024
105
- # define ALLOCATION_FACTOR 20
105
+ # define ARENA_MIN_CAPACITY 100
106
106
  # define PIx2 (M_PI * 2.0)
107
107
 
108
- VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
109
- VALUE rb_xcount = rb_ivar_get(self, rb_intern("@xcount"));
110
- VALUE rb_kmin = rb_ivar_get(self, rb_intern("@kmin"));
111
- VALUE rb_kmax = rb_ivar_get(self, rb_intern("@kmax"));
112
- VALUE rb_xsorted = rb_ivar_get(self, rb_intern("@xsorted"));
113
- VALUE rb_apply_bic_deviation = rb_ivar_get(self, rb_intern("@apply_bic_deviation"));
114
- uint32_t xcount = NUM2UINT(rb_xcount);
115
- uint32_t kmin = NUM2UINT(rb_kmin);
116
- uint32_t kmax = NUM2UINT(rb_kmax);
117
- bool apply_deviation = RTEST(rb_apply_bic_deviation);
118
- Arena *arena = arena_create(sizeof(int) * xcount * kmax * ALLOCATION_FACTOR);
119
-
120
- if (arena == NULL) {
121
- return Qnil;
122
- }
108
+ VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
109
+ {
110
+ uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
111
+ uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
112
+ uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
113
+ bool apply_deviation = RTEST(rb_iv_get(self, "@apply_bic_deviation"));
114
+ VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
115
+
116
+ Arena *arena =
117
+ arena_create(
118
+ sizeof(LDouble) * xcount * (kmax + 4) +
119
+ sizeof(uint32_t) * xcount * kmax * 5 +
120
+ ARENA_MIN_CAPACITY
121
+ );
122
+
123
+ if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
123
124
 
124
125
  MatrixF *cost = matrix_create_f(arena, kmax, xcount);
125
126
  MatrixI *splits = matrix_create_i(arena, kmax, xcount);
126
127
  VectorF *xsorted = vector_create_f(arena, xcount);
127
- /* TODO: pack sums into one vector of pairs */
128
128
  VectorF *xsum = vector_create_f(arena, xcount);
129
129
  VectorF *xsumsq = vector_create_f(arena, xcount);
130
130
 
131
131
  for (uint32_t i = 0; i < xcount; i++) {
132
- long double xi = NUM2DBL(rb_ary_entry(rb_xsorted, i));
132
+ LDouble xi = NUM2DBL(rb_ary_entry(rb_xsorted, i));
133
133
  vector_set_f(xsorted, i, xi);
134
134
  }
135
135
 
@@ -147,17 +147,17 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
147
147
  };
148
148
 
149
149
 
150
- long double shift = vector_get_f(xsorted, xcount / 2);
151
- long double diff_initial = vector_get_f(xsorted, 0) - shift;
150
+ LDouble shift = vector_get_f(xsorted, xcount / 2);
151
+ LDouble diff_initial = vector_get_f(xsorted, 0) - shift;
152
152
 
153
153
  vector_set_f(xsum, 0, diff_initial);
154
154
  vector_set_f(xsumsq, 0, diff_initial * diff_initial);
155
155
 
156
156
  for (uint32_t i = 1; i < xcount; i++) {
157
- long double xi = vector_get_f(xsorted, i);
158
- long double xsum_prev = vector_get_f(xsum, i - 1);
159
- long double xsumsq_prev = vector_get_f(xsumsq, i - 1);
160
- long double diff = xi - shift;
157
+ LDouble xi = vector_get_f(xsorted, i);
158
+ LDouble xsum_prev = vector_get_f(xsum, i - 1);
159
+ LDouble xsumsq_prev = vector_get_f(xsumsq, i - 1);
160
+ LDouble diff = xi - shift;
161
161
 
162
162
  vector_set_f(xsum, i, xsum_prev + diff);
163
163
  vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
@@ -172,7 +172,8 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
172
172
 
173
173
  uint32_t koptimal = find_koptimal(state);
174
174
 
175
- VectorI *sizes = backtrack_sizes(state, koptimal);
175
+ VectorI *sizes = vector_create_i(arena, koptimal);
176
+ backtrack_sizes(state, sizes, koptimal);
176
177
 
177
178
  /* printf("XSORTED \t"); vector_inspect_f(xsorted); */
178
179
  /* printf("K OPTIMAL: %lld\n", koptimal); */
@@ -180,8 +181,8 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
180
181
  /* printf("FINAL COST\n"); matrix_inspect_f(cost); */
181
182
  /* printf("FINAL SPLITS\n"); matrix_inspect_i(splits); */
182
183
 
183
- VALUE response = rb_ary_new2(sizes->nvalues);
184
- for (uint32_t i = 0; i < sizes->nvalues; i++) {
184
+ VALUE response = rb_ary_new2(sizes->size);
185
+ for (uint32_t i = 0; i < sizes->size; i++) {
185
186
  VALUE size = LONG2NUM(vector_get_i(sizes, i));
186
187
  rb_ary_store(response, i, size);
187
188
  }
@@ -193,29 +194,30 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
193
194
 
194
195
  uint32_t find_koptimal(State state)
195
196
  {
196
- uint32_t kmin = state.kmin;
197
- uint32_t kmax = state.kmax;
198
- uint32_t xcount = state.xcount;
199
- uint32_t kopt = kmin;
200
- uint32_t xindex_max = state.xcount - 1;
201
- VectorF *xsorted = state.xsorted;
202
- long double x0 = vector_get_f(xsorted, 0);
203
- long double xn = vector_get_f(xsorted, xindex_max);
204
- long double max_bic = 0.0;
205
- long double adjustment = state.apply_deviation ? 0.0 : 1.0;
206
-
197
+ uint32_t kmin = state.kmin;
198
+ uint32_t kmax = state.kmax;
199
+ uint32_t xcount = state.xcount;
200
+ uint32_t kopt = kmin;
201
+ uint32_t xindex_max = state.xcount - 1;
202
+ VectorF *xsorted = state.xsorted;
203
+ LDouble x0 = vector_get_f(xsorted, 0);
204
+ LDouble xn = vector_get_f(xsorted, xindex_max);
205
+ LDouble max_bic = 0.0;
206
+ LDouble xcount_log = log((LDouble) xcount);
207
+
208
+ VectorI *sizes = vector_create_i(state.arena, kmax);
207
209
  for (uint32_t k = kmin; k <= kmax; k++) {
208
210
  uint32_t index_right, index_left = 0;
209
- long double bin_left, bin_right, loglikelihood = 0.0;
210
- VectorI *sizes = backtrack_sizes(state, k);
211
+ LDouble bin_left, bin_right, loglikelihood = 0.0;
212
+ backtrack_sizes(state, sizes, k);
211
213
 
212
214
  for (uint32_t kb = 0; kb < k; kb++) {
213
- uint32_t npoints = vector_get_i(sizes, kb);
214
- index_right = index_left + npoints - 1;
215
- long double xleft = vector_get_f(xsorted, index_left);
216
- long double xright = vector_get_f(xsorted, index_right);
217
- bin_left = xleft;
218
- bin_right = xright;
215
+ uint32_t npoints = vector_get_i(sizes, kb);
216
+ index_right = index_left + npoints - 1;
217
+ LDouble xleft = vector_get_f(xsorted, index_left);
218
+ LDouble xright = vector_get_f(xsorted, index_right);
219
+ bin_left = xleft;
220
+ bin_right = xright;
219
221
 
220
222
  if (xleft == xright) {
221
223
  bin_left = index_left == 0
@@ -226,18 +228,18 @@ uint32_t find_koptimal(State state)
226
228
  : xn;
227
229
  }
228
230
 
229
- long double bin_width = bin_right - bin_left;
230
- SegmentStats stats = shifted_data_variance(xsorted, index_left, index_right);
231
- long double mean = stats.mean;
232
- long double variance = stats.variance;
231
+ LDouble bin_width = bin_right - bin_left;
232
+ SegmentStats stats = shifted_data_variance(xsorted, index_left, index_right);
233
+ LDouble mean = stats.mean;
234
+ LDouble variance = stats.variance;
233
235
 
234
236
  if (variance > 0) {
235
237
  for (uint32_t i = index_left; i <= index_right; i++) {
236
- long double xi = vector_get_f(xsorted, i);
238
+ LDouble xi = vector_get_f(xsorted, i);
237
239
  loglikelihood += -(xi - mean) * (xi - mean) / (2.0 * variance);
238
240
  }
239
241
  loglikelihood += npoints * (
240
- (log(npoints / (long double) xcount) * adjustment) -
242
+ (state.apply_deviation ? 0.0 : log(npoints / (LDouble) xcount)) -
241
243
  (0.5 * log(PIx2 * variance))
242
244
  );
243
245
  } else {
@@ -247,24 +249,23 @@ uint32_t find_koptimal(State state)
247
249
  index_left = index_right + 1;
248
250
  }
249
251
 
250
- long double bic = (2.0 * loglikelihood) - (((3 * k) - 1) * log((long double) xcount));
252
+ LDouble bic = (2.0 * loglikelihood) - (((3 * k) - 1) * xcount_log);
251
253
 
252
254
  if (k == kmin) {
253
255
  max_bic = bic;
254
- kopt = kmin;
256
+ kopt = kmin;
255
257
  } else if (bic > max_bic) {
256
258
  max_bic = bic;
257
- kopt = k;
259
+ kopt = k;
258
260
  }
259
261
  }
260
262
 
261
263
  return kopt;
262
264
  }
263
265
 
264
- VectorI *backtrack_sizes(State state, uint32_t k)
266
+ VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
265
267
  {
266
268
  MatrixI *splits = state.splits;
267
- VectorI *sizes = vector_create_i(state.arena, k);
268
269
  uint32_t xcount = state.xcount;
269
270
  uint32_t right = xcount - 1;
270
271
  uint32_t left = 0;
@@ -274,6 +275,7 @@ VectorI *backtrack_sizes(State state, uint32_t k)
274
275
  left = matrix_get_i(splits, i, right);
275
276
  vector_set_i(sizes, i, right - left + 1);
276
277
  }
278
+ // Special case outside of the loop removing the need for conditionals
277
279
  left = matrix_get_i(splits, 0, right);
278
280
  vector_set_i(sizes, 0, right - left + 1);
279
281
 
@@ -282,16 +284,16 @@ VectorI *backtrack_sizes(State state, uint32_t k)
282
284
 
283
285
  SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t right)
284
286
  {
285
- const uint32_t n = right - left + 1;
286
- long double sum = 0.0;
287
- long double sumsq = 0.0;
287
+ const uint32_t n = right - left + 1;
288
+ LDouble sum = 0.0;
289
+ LDouble sumsq = 0.0;
288
290
  SegmentStats stats = { .mean = 0.0, .variance = 0.0 };
289
291
 
290
292
  if (right >= left) {
291
- const long double median = vector_get_f(xsorted, (left + right) / 2);
293
+ const LDouble median = vector_get_f(xsorted, (left + right) / 2);
292
294
 
293
295
  for (uint32_t i = left; i <= right; i++) {
294
- const long double sumi = vector_get_f(xsorted, i) - median;
296
+ const LDouble sumi = vector_get_f(xsorted, i) - median;
295
297
 
296
298
  sum += sumi;
297
299
  sumsq += sumi * sumi;
@@ -306,7 +308,8 @@ SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t rig
306
308
  return stats;
307
309
  }
308
310
 
309
- void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax) {
311
+ void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax)
312
+ {
310
313
  uint32_t size = imax - q + 1;
311
314
  VectorI *split_candidates = vector_create_i(state.arena, size);
312
315
  for (uint32_t i = 0; i < size; i++) {
@@ -316,7 +319,8 @@ void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax) {
316
319
  smawk(state, rparams, split_candidates);
317
320
  }
318
321
 
319
- void smawk(State state, RowParams rparams, VectorI *split_candidates) {
322
+ void smawk(State state, RowParams rparams, VectorI *split_candidates)
323
+ {
320
324
  const uint32_t imin = rparams.imin;
321
325
  const uint32_t imax = rparams.imax;
322
326
  const uint32_t istep = rparams.istep;
@@ -326,9 +330,9 @@ void smawk(State state, RowParams rparams, VectorI *split_candidates) {
326
330
  } else {
327
331
  VectorI *odd_candidates = prune_candidates(state, rparams, split_candidates);
328
332
  /* printf("PRUNED\t"); vector_inspect_i(odd_candidates); */
329
- uint32_t istepx2 = istep * 2;
330
- uint32_t imin_odd = imin + istep;
331
- uint32_t imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2);
333
+ uint32_t istepx2 = istep * 2;
334
+ uint32_t imin_odd = imin + istep;
335
+ uint32_t imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2);
332
336
  RowParams rparams_odd = { .row = rparams.row, .imin = imin_odd, .imax = imax_odd, .istep = istepx2 };
333
337
 
334
338
  smawk(state, rparams_odd, odd_candidates);
@@ -342,32 +346,32 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
342
346
  uint32_t imin = rparams.imin;
343
347
  uint32_t imax = rparams.imax;
344
348
  uint32_t istep = rparams.istep;
345
- uint32_t n = split_candidates->nvalues;
349
+ uint32_t n = split_candidates->size;
346
350
  uint32_t istepx2 = istep * 2;
347
351
  uint32_t jl = vector_get_i(split_candidates, 0);
348
- VectorF *xsum = state.xsum;
349
- VectorF *xsumsq = state.xsumsq;
350
- MatrixI *splits = state.splits;
352
+ VectorF *xsum = state.xsum;
353
+ VectorF *xsumsq = state.xsumsq;
354
+ MatrixI *splits = state.splits;
351
355
 
352
356
  for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
353
357
  while (vector_get_i(split_candidates, r) < jl) r++;
354
358
 
355
- uint32_t rcandidate = vector_get_i(split_candidates, r);
359
+ uint32_t rcandidate = vector_get_i(split_candidates, r);
356
360
  uint32_t cost_base_row = row - 1;
357
361
  uint32_t cost_base_col = rcandidate - 1;
358
- long double cost =
362
+ LDouble cost =
359
363
  matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissimilarity(rcandidate, i, xsum, xsumsq);
360
364
 
361
365
  matrix_set_f(state.cost, row, i, cost);
362
366
  matrix_set_i(state.splits, row, i, rcandidate);
363
367
 
364
- uint32_t jh =
368
+ uint32_t jh =
365
369
  (i + istep) <= imax
366
370
  ? matrix_get_i(splits, row, i + istep)
367
371
  : vector_get_i(split_candidates, n - 1);
368
372
 
369
- uint32_t jmax = jh < i ? jh : i;
370
- long double sjimin = dissimilarity(jmax, i, xsum, xsumsq);
373
+ uint32_t jmax = jh < i ? jh : i;
374
+ LDouble sjimin = dissimilarity(jmax, i, xsum, xsumsq);
371
375
 
372
376
  for (++r; r < n && vector_get_i(split_candidates, r) <= jmax; r++) {
373
377
  uint32_t jabs = vector_get_i(split_candidates, r);
@@ -375,9 +379,9 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
375
379
  if (jabs > i) break;
376
380
  if (jabs < matrix_get_i(splits, row - 1, i)) continue;
377
381
 
378
- long double cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
379
- long double sj = cost_base + dissimilarity(jabs, i, xsum, xsumsq);
380
- long double cost_prev = matrix_get_f(state.cost, row, i);
382
+ LDouble cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
383
+ LDouble sj = cost_base + dissimilarity(jabs, i, xsum, xsumsq);
384
+ LDouble cost_prev = matrix_get_f(state.cost, row, i);
381
385
 
382
386
  if (sj <= cost_prev) {
383
387
  matrix_set_f(state.cost, row, i, sj);
@@ -394,10 +398,10 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
394
398
 
395
399
  void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
396
400
  {
397
- const uint32_t row = rparams.row;
398
- const uint32_t imin = rparams.imin;
399
- const uint32_t imax = rparams.imax;
400
- const uint32_t istep = rparams.istep;
401
+ const uint32_t row = rparams.row;
402
+ const uint32_t imin = rparams.imin;
403
+ const uint32_t imax = rparams.imax;
404
+ const uint32_t istep = rparams.istep;
401
405
  MatrixF *const cost = state.cost;
402
406
  MatrixI *const splits = state.splits;
403
407
 
@@ -408,19 +412,19 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
408
412
  const uint32_t optimal_split_idx = optimal_split_idx_prev;
409
413
  const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
410
414
  const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
411
- const long double added_cost = dissimilarity(optimal_split, i, state.xsum, state.xsumsq);
415
+ const LDouble added_cost = dissimilarity(optimal_split, i, state.xsum, state.xsumsq);
412
416
 
413
417
  matrix_set_f(cost, row, i, cost_prev + added_cost);
414
418
  matrix_set_i(splits, row, i, optimal_split);
415
419
 
416
- for (uint32_t r = optimal_split_idx + 1; r < split_candidates->nvalues; r++)
420
+ for (uint32_t r = optimal_split_idx + 1; r < split_candidates->size; r++)
417
421
  {
418
422
  uint32_t split = vector_get_i(split_candidates, r);
419
423
 
420
424
  if (split < matrix_get_i(splits, row - 1, i)) continue;
421
425
  if (split > i) break;
422
426
 
423
- long double split_cost =
427
+ LDouble split_cost =
424
428
  matrix_get_f(cost, row - 1, split - 1) + dissimilarity(split, i, state.xsum, state.xsumsq);
425
429
 
426
430
  if (split_cost > matrix_get_f(cost, row, i)) continue;
@@ -434,39 +438,39 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
434
438
 
435
439
  VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
436
440
  {
437
- uint32_t n = ((rparams.imax - rparams.imin) / rparams.istep) + 1;
438
- uint32_t m = split_candidates->nvalues;
441
+ uint32_t imin = rparams.imin;
442
+ uint32_t row = rparams.row;
443
+ uint32_t istep = rparams.istep;
444
+ uint32_t n = ((rparams.imax - imin) / istep) + 1;
445
+ uint32_t m = split_candidates->size;
439
446
 
440
447
  if (n >= m) return split_candidates;
441
448
 
442
- int32_t left = -1;
449
+ uint32_t left = 0;
443
450
  uint32_t right = 0;
444
451
  VectorI *pruned = vector_dup_i(split_candidates, state.arena);
445
452
 
446
453
  while (m > n)
447
454
  {
448
- uint32_t p = left + 1;
449
- uint32_t i = rparams.imin + p * rparams.istep;
450
- uint32_t j = vector_get_i(pruned, right);
451
- uint32_t jnext = vector_get_i(pruned, right + 1);
452
- long double sl =
453
- matrix_get_f(state.cost, rparams.row - 1, j - 1) + dissimilarity(j, i, state.xsum, state.xsumsq);
454
- long double snext =
455
- matrix_get_f(state.cost, rparams.row - 1, jnext - 1) + dissimilarity(jnext, i, state.xsum, state.xsumsq);
456
-
457
- if ((sl < snext) && (p < n - 1)) {
455
+ uint32_t i = imin + left * istep;
456
+ uint32_t j = vector_get_i(pruned, right);
457
+ uint32_t jnext = vector_get_i(pruned, right + 1);
458
+ LDouble sl =
459
+ matrix_get_f(state.cost, row - 1, j - 1) + dissimilarity(j, i, state.xsum, state.xsumsq);
460
+ LDouble snext =
461
+ matrix_get_f(state.cost, row - 1, jnext - 1) + dissimilarity(jnext, i, state.xsum, state.xsumsq);
462
+
463
+ if ((sl < snext) && (left < n - 1)) {
464
+ vector_set_i(pruned, left, j);
458
465
  left++;
459
466
  right++;
460
- vector_set_i(pruned, left, j);
461
- } else if ((sl < snext) && (p == n - 1)) {
467
+ } else if ((sl < snext) && (left == n - 1)) {
462
468
  right++;
463
469
  m--;
464
470
  vector_set_i(pruned, right, j);
465
471
  } else {
466
- if (p > 0) {
467
- /* TODO: extract `vector_setcpy_T` */
468
- vector_set_i(pruned, right, vector_get_i(pruned, left));
469
- left--;
472
+ if (left > 0) {
473
+ vector_set_i(pruned, right, vector_get_i(pruned, --left));
470
474
  } else {
471
475
  right++;
472
476
  }
@@ -475,8 +479,7 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
475
479
  }
476
480
  }
477
481
 
478
- for (uint32_t i = left + 1; i < m; i++) {
479
- /* TODO: extract `vector_setcpy_T` */
482
+ for (uint32_t i = left; i < m; i++) {
480
483
  vector_set_i(pruned, i, vector_get_i(pruned, right++));
481
484
  }
482
485
 
@@ -485,101 +488,86 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
485
488
  return pruned;
486
489
  }
487
490
 
488
- long double dissimilarity(uint32_t j, uint32_t i, VectorF *xsum, VectorF *xsumsq) {
489
- long double sji = 0.0;
491
+ inline LDouble dissimilarity(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq) {
492
+ LDouble sji = 0.0;
490
493
 
491
494
  if (j >= i) return sji;
492
495
 
493
496
  if (j > 0) {
494
- /* TODO: looks more like `segment_delta` */
495
- long double segment_sum = vector_get_diff_f(xsum, i, j - 1);
496
- uint32_t segment_size = i - j + 1;
497
- sji = vector_get_diff_f(xsumsq, i, j - 1) - (segment_sum * segment_sum / segment_size);
497
+ LDouble segment_diff = vector_get_diff_f(xsum, i, j - 1);
498
+ uint32_t segment_size = i - j + 1;
499
+ sji = vector_get_diff_f(xsumsq, i, j - 1) - (segment_diff * segment_diff / segment_size);
498
500
  } else {
499
- long double xsumi = vector_get_f(xsum, i);
500
- sji = vector_get_f(xsumsq, i) - (xsumi * xsumi / (i + 1));
501
+ LDouble xsumi = vector_get_f(xsum, i);
502
+ sji = vector_get_f(xsumsq, i) - (xsumi * xsumi / (i + 1));
501
503
  }
502
504
 
503
505
  return (sji > 0) ? sji : 0.0;
504
506
  }
505
507
 
506
- VectorF *vector_create_f(Arena *arena, uint32_t nvalues) {
508
+ inline VectorF *vector_create_f(Arena *arena, uint32_t size) {
507
509
  VectorF *v;
508
510
 
509
- v = arena_alloc(arena, sizeof(*v));
510
- v->values = arena_alloc(arena, sizeof(*(v->values)) * nvalues);
511
- v->nvalues = nvalues;
511
+ v = arena_alloc(arena, sizeof(*v));
512
+ v->values = arena_alloc(arena, sizeof(*(v->values)) * size);
513
+ v->size = size;
512
514
 
513
515
  return v;
514
516
  }
515
517
 
516
- VectorI *vector_create_i(Arena *arena, uint32_t nvalues) {
518
+ inline VectorI *vector_create_i(Arena *arena, uint32_t size) {
517
519
  VectorI *v;
518
520
 
519
- v = arena_alloc(arena, sizeof(*v));
520
- v->values = arena_alloc(arena, sizeof(*(v->values)) * nvalues);
521
- v->nvalues = nvalues;
521
+ v = arena_alloc(arena, sizeof(*v));
522
+ v->values = arena_alloc(arena, sizeof(*(v->values)) * size);
523
+ v->size = size;
522
524
 
523
525
  return v;
524
526
  }
525
527
 
526
- VectorI *vector_dup_i(VectorI *v, Arena *arena)
528
+ inline VectorI *vector_dup_i(VectorI *v, Arena *arena)
527
529
  {
528
- VectorI *vdup = vector_create_i(arena, v->nvalues);
530
+ VectorI *vdup = vector_create_i(arena, v->size);
529
531
 
530
- /* TODO: use one memcpy call */
531
- for (uint32_t i = 0; i < v->nvalues; i++) {
532
- vector_set_i(vdup, i, vector_get_i(v, i));
533
- }
532
+ memcpy(vdup->values, v->values, sizeof(*(v->values)) * v->size);
534
533
 
535
534
  return vdup;
536
535
  }
537
536
 
538
- void vector_set_f(VectorF *v, uint32_t offset, long double value) {
539
- assert(offset < v->nvalues && "[vector_set_f] element index should be less than nvalues");
540
-
537
+ inline void vector_set_f(VectorF *v, uint32_t offset, LDouble value) {
541
538
  *(v->values + offset) = value;
542
539
  }
543
540
 
544
- void vector_set_i(VectorI *v, uint32_t offset, uint32_t value) {
545
- assert(offset < v->nvalues && "[vector_set_i] element index should be less than nvalues");
546
-
541
+ inline void vector_set_i(VectorI *v, uint32_t offset, uint32_t value) {
547
542
  *(v->values + offset) = value;
548
543
  }
549
544
 
550
- uint32_t vector_get_i(VectorI *v, uint32_t offset) {
551
- assert(offset < v->nvalues && "[vector_get_i] element index should be less than nvalues");
552
-
545
+ inline uint32_t vector_get_i(VectorI *v, uint32_t offset) {
553
546
  return *(v->values + offset);
554
547
  }
555
548
 
556
- void vector_downsize_i(VectorI *v, uint32_t new_size) {
557
- v->nvalues = new_size;
549
+ inline void vector_downsize_i(VectorI *v, uint32_t new_size) {
550
+ v->size = new_size;
558
551
  }
559
552
 
560
553
  void vector_inspect_i(VectorI *v) {
561
- for (uint32_t i = 0; i < v->nvalues - 1; i++)
554
+ for (uint32_t i = 0; i < v->size - 1; i++)
562
555
  printf("%u, ", vector_get_i(v, i));
563
- printf("%u\n", vector_get_i(v, v->nvalues - 1));
556
+ printf("%u\n", vector_get_i(v, v->size - 1));
564
557
  }
565
558
 
566
- long double vector_get_f(VectorF *v, uint32_t offset) {
567
- assert(offset < v->nvalues && "[vector_get_f] element index should be less than nvalues");
568
-
559
+ inline LDouble vector_get_f(VectorF *v, uint32_t offset) {
569
560
  return *(v->values + offset);
570
561
  }
571
562
 
572
- long double vector_get_diff_f(VectorF *v, uint32_t i, uint32_t j) {
573
- assert(i < v->nvalues && "[vector_get_diff_f] i should be less than nvalues");
574
- assert(j < v->nvalues && "[vector_get_diff_f] j should be less than nvalues");
575
-
563
+ inline LDouble vector_get_diff_f(VectorF *v, uint32_t i, uint32_t j) {
576
564
  return *(v->values + i) - *(v->values + j);
577
565
  }
578
566
 
579
567
  void vector_inspect_f(VectorF *v) {
580
- for (uint32_t i = 0; i < v->nvalues - 1; i++)
568
+ for (uint32_t i = 0; i < v->size - 1; i++)
581
569
  printf("%Lf, ", vector_get_f(v, i));
582
- printf("%Lf\n", vector_get_f(v, v->nvalues - 1));
570
+ printf("%Lf\n", vector_get_f(v, v->size - 1));
583
571
  }
584
572
 
585
573
  MatrixF *matrix_create_f(Arena *arena, uint32_t nrows, uint32_t ncols) {
@@ -604,18 +592,12 @@ MatrixI *matrix_create_i(Arena *arena, uint32_t nrows, uint32_t ncols) {
604
592
  return m;
605
593
  }
606
594
 
607
- void matrix_set_f(MatrixF *m, uint32_t i, uint32_t j, long double value) {
608
- assert(i < m->nrows && "[matrix_set_f] row offset should be less than nrows");
609
- assert(j < m->cols && "[matrix_set_f] col offset should be less than ncols");
610
-
595
+ inline void matrix_set_f(MatrixF *m, uint32_t i, uint32_t j, LDouble value) {
611
596
  uint32_t offset = i * m->ncols + j;
612
597
  *(m->values + offset) = value;
613
598
  }
614
599
 
615
- long double matrix_get_f(MatrixF *m, uint32_t i, uint32_t j) {
616
- assert(i < m->nrows && "[matrix_get_f] row offset should be less than nrows");
617
- assert(j < m->cols && "[matrix_get_f] col offset should be less than ncols");
618
-
600
+ inline LDouble matrix_get_f(MatrixF *m, uint32_t i, uint32_t j) {
619
601
  uint32_t offset = i * m->ncols + j;
620
602
  return *(m->values + offset);
621
603
  }
@@ -623,7 +605,7 @@ long double matrix_get_f(MatrixF *m, uint32_t i, uint32_t j) {
623
605
  void matrix_inspect_f(MatrixF *m) {
624
606
  for (uint32_t i = 0; i < m->nrows; i++) {
625
607
  for (uint32_t j = 0; j < m->ncols - 1; j++) {
626
- long double value = matrix_get_f(m, i, j);
608
+ LDouble value = matrix_get_f(m, i, j);
627
609
 
628
610
  printf("%Lf, ", value);
629
611
  }
@@ -639,23 +621,17 @@ void matrix_inspect_i(MatrixI *m) {
639
621
  }
640
622
  }
641
623
 
642
- void matrix_set_i(MatrixI *m, uint32_t i, uint32_t j, uint32_t value) {
643
- assert(i < m->nrows && "[matrix_set_i] row offset should be less than nrows");
644
- assert(j < m->cols && "[matrix_set_i] col offset should be less than ncols");
645
-
624
+ inline void matrix_set_i(MatrixI *m, uint32_t i, uint32_t j, uint32_t value) {
646
625
  uint32_t offset = i * m->ncols + j;
647
626
  *(m->values + offset) = value;
648
627
  }
649
628
 
650
- uint32_t matrix_get_i(MatrixI *m, uint32_t i, uint32_t j) {
651
- assert(i < m->nrows && "[matrix_get_i] row offset should be less than nrows");
652
- assert(j < m->cols && "[matrix_get_i] col offset should be less than ncols");
653
-
629
+ inline uint32_t matrix_get_i(MatrixI *m, uint32_t i, uint32_t j) {
654
630
  uint32_t offset = i * m->ncols + j;
655
631
  return *(m->values + offset);
656
632
  }
657
633
 
658
- Arena *arena_create(uint32_t capacity) {
634
+ Arena *arena_create(size_t capacity) {
659
635
  if (capacity < ARENA_MIN_CAPACITY) {
660
636
  capacity = ARENA_MIN_CAPACITY;
661
637
  }
@@ -683,11 +659,11 @@ Arena *arena_create(uint32_t capacity) {
683
659
  return arena;
684
660
  }
685
661
 
686
- void *arena_alloc(Arena *arena, uint32_t size) {
662
+ void *arena_alloc(Arena *arena, size_t size) {
687
663
  size = (size + 7) & ~7;
688
664
 
689
665
  if (arena->offset + size > arena->capacity) {
690
- printf("Arena Out Of Memory\n");
666
+ rb_raise(rb_eNoMemError, "Arena Insufficient Capacity");
691
667
  return NULL;
692
668
  }
693
669
 
@@ -698,7 +674,8 @@ void *arena_alloc(Arena *arena, uint32_t size) {
698
674
  }
699
675
 
700
676
  void arena_destroy(Arena *arena) {
701
- /* printf("[Arena Destroy] Capacity: %u, offset: %u, left: %u\n", arena->capacity, arena->offset, arena->capacity - arena->offset); */
677
+ /* double leftover = ((double) arena->capacity - arena->offset) / arena->capacity * 100; */
678
+ /* printf("[Arena Destroy] Capacity: %zu, offset: %zu, left: %2.2f%%\n", arena->capacity, arena->offset, leftover); */
702
679
  free(arena->buffer);
703
680
  free(arena);
704
681
  }
@@ -1,9 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- class Clusterer # rubocop:disable Style/Documentation, Metrics/ClassLength
5
- PI_DOUBLE = Math::PI * 2
6
-
4
+ class Clusterer # rubocop:disable Style/Documentation
7
5
  def initialize(entries, kmin, kmax = kmin, kestimate = :regular)
8
6
  @xcount = entries.size
9
7
 
@@ -26,301 +24,7 @@ module Ckmeans
26
24
  sorted_group_sizes.each_with_object([]) do |size, groups|
27
25
  groups << @xsorted_original.shift(size)
28
26
  end
29
-
30
- =begin # rubocop:disable Style/BlockComments
31
- @cost = Array.new(kmax) { Array.new(xcount) { 0.0 } }
32
- @splits = Array.new(kmax) { Array.new(xcount) { 0 } }
33
- @xsum = Array.new(xcount)
34
- @xsumsq = Array.new(xcount)
35
-
36
- shift = xsorted[xcount / 2]
37
- xsum[0] = xsorted[0].to_f - shift
38
- xsumsq[0] = xsum[0]**2
39
-
40
- 1.upto(xcount - 1) do |i|
41
- xf = xsorted[i].to_f
42
- xsum[i] = xsum[i - 1] + xf - shift
43
- xsumsq[i] = xsumsq[i - 1] + ((xf - shift) * (xf - shift))
44
- cost[0][i] = dissim(0, i)
45
- splits[0][i] = 0
46
- end
47
-
48
- kmax_idx = kmax - 1
49
- 1.upto(kmax_idx) do |q|
50
- imin = q < kmax_idx ? [1, q].max : xcount - 1
51
- fill_row(q, imin, xcount - 1)
52
- end
53
-
54
- kopt = koptimal
55
-
56
- puts "RB COST\n", cost.map(&:inspect)
57
- puts "RB SPLITS\n", splits.map(&:inspect)
58
- puts "RB K OPTIMAL: #{kopt}"
59
-
60
- backtrack(kopt).each_with_object(Array.new(kopt)) do |(q, left, right), res|
61
- res[q] = xsorted[left..right]
62
- end
63
- =end
64
- end
65
- end
66
-
67
- private
68
-
69
- attr_reader :cost, :splits, :xsum, :xsumsq, :xcount, :xsorted, :kmin, :kmax
70
-
71
- def koptimal # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
72
- kopt = kmin
73
- n = xcount
74
- max_bic = 0.0
75
- adjustment = kestimate == :sensitive ? 0.0 : 1.0 # Deviation from BIC formula to favor smaller clusters
76
-
77
- kmin.upto(kmax) do |k|
78
- sizes = backtrack(k).each_with_object(Array.new(k)) { |(q, left, right), sz| sz[q] = right - left + 1 }
79
-
80
- index_left = 0
81
- index_right = nil
82
- loglikelihood = 0.0
83
- bin_left = nil
84
- bin_right = nil
85
-
86
- k.times do |kb|
87
- num_points_in_bin = sizes[kb]
88
- index_right = index_left + num_points_in_bin - 1
89
-
90
- if xsorted[index_left] < xsorted[index_right]
91
- bin_left = xsorted[index_left]
92
- bin_right = xsorted[index_right]
93
- elsif xsorted[index_left] == xsorted[index_right]
94
- bin_left = index_left == 0 ? xsorted[0] : (xsorted[index_left - 1] + xsorted[index_left]) / 2.0
95
- bin_right = index_right < n - 1 ? (xsorted[index_right] + xsorted[index_right + 1]) / 2.0 : xsorted[n - 1]
96
- else
97
- raise "ERROR: binLeft > binRight"
98
- end
99
-
100
- bin_width = bin_right.to_f - bin_left
101
-
102
- mean, variance = shifted_data_variance(index_left, index_right)
103
-
104
- if variance > 0
105
- (index_left..index_right).each do |i|
106
- loglikelihood += -(xsorted[i] - mean) * (xsorted[i] - mean) / (2.0 * variance)
107
- end
108
- loglikelihood +=
109
- num_points_in_bin *
110
- ((Math.log(num_points_in_bin / n.to_f) * adjustment) - (0.5 * Math.log(PI_DOUBLE * variance)))
111
- else
112
- loglikelihood += num_points_in_bin * Math.log(1.0 / bin_width / n)
113
- end
114
-
115
- index_left = index_right + 1
116
- end
117
-
118
- bic = (2.0 * loglikelihood) - (((3 * k) - 1) * Math.log(n.to_f))
119
-
120
- if k == kmin
121
- max_bic = bic
122
- kopt = kmin
123
- elsif bic > max_bic
124
- max_bic = bic
125
- kopt = k
126
- end
127
- end
128
-
129
- kopt
130
- end
131
-
132
- def shifted_data_variance(ileft, iright)
133
- sum = 0.0
134
- sumsq = 0.0
135
- mean = 0.0
136
- variance = 0.0
137
- n = iright - ileft + 1
138
-
139
- if iright >= ileft
140
- median = xsorted[(ileft + iright) / 2].to_f
141
-
142
- ileft.upto(iright) do |i|
143
- sumi = xsorted[i] - median
144
- sum += sumi
145
- sumsq += sumi**2
146
- end
147
-
148
- mean = (sum / n) + median
149
- variance = (sumsq - (sum * sum / n)) / (n - 1) if n > 1
150
- end
151
-
152
- [mean, variance]
153
- end
154
-
155
- def backtrack(k)
156
- return to_enum(__method__, k) unless block_given?
157
-
158
- right = xcount - 1
159
- left = nil
160
-
161
- (k - 1).downto(0) do |q|
162
- left = splits[q][right]
163
-
164
- yield q, left, right
165
-
166
- right = left - 1 if q > 0
167
- end
168
- end
169
-
170
- def dissim(j, i)
171
- return 0.0 if j >= i
172
-
173
- sji =
174
- if j > 0
175
- segment_sum = xsum[i] - xsum[j - 1]
176
- segment_size = i - j + 1
177
- xsumsq[i] - xsumsq[j - 1] - (segment_sum * segment_sum / segment_size)
178
- else
179
- xsumsq[i] - (xsum[i] * xsum[i] / (i + 1))
180
- end
181
-
182
- [0, sji].max
183
- end
184
-
185
- def fill_row(q, imin, imax)
186
- size = imax - q + 1
187
-
188
- js = Array.new(size) { |i| q + i }
189
- smawk(imin, imax, 1, q, js)
190
- end
191
-
192
- def smawk(imin, imax, istep, q, js)
193
- if (imax - imin) <= (0 * istep)
194
- find_min_from_candidates(q, imin, imax, istep, js)
195
- else
196
- js_odd = prune_candidates(imin, imax, istep, q, js)
197
- # puts "Pruned: #{js_odd.inspect}"
198
- istepx2 = istep * 2
199
- imin_odd = imin + istep
200
- imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2)
201
- smawk(imin_odd, imax_odd, istepx2, q, js_odd)
202
- fill_even_positions(imin, imax, istep, q, js)
203
- end
204
- end
205
-
206
- def find_min_from_candidates(q, imin, imax, istep, js)
207
- optimal_split_index_prev = 0
208
-
209
- (imin..imax).step(istep) do |i|
210
- optimal_split_index = optimal_split_index_prev
211
- optimal_split = js[optimal_split_index]
212
- cost[q][i] = cost[q - 1][optimal_split - 1] + dissim(optimal_split, i)
213
- splits[q][i] = optimal_split
214
-
215
- ((optimal_split_index + 1)...js.size).each do |split_index|
216
- jabs = js[split_index]
217
-
218
- next if jabs < splits[q - 1][i]
219
- break if jabs > i
220
-
221
- sj = cost[q - 1][jabs - 1] + dissim(jabs, i)
222
-
223
- next unless sj <= cost[q][i]
224
-
225
- cost[q][i] = sj
226
- splits[q][i] = js[split_index]
227
- optimal_split_index_prev = split_index
228
- end
229
- end
230
- end
231
-
232
- def prune_candidates(imin, imax, istep, q, js)
233
- n = ((imax - imin) / istep) + 1
234
- m = js.size
235
-
236
- return js if n >= m
237
-
238
- pruned = js.dup
239
- left = -1
240
- right = 0
241
-
242
- while m > n
243
- p = left + 1
244
- i = imin + (p * istep)
245
- j = pruned[right]
246
- jnext = pruned[right + 1]
247
- sl = cost[q - 1][j - 1] + dissim(j, i)
248
- snext = cost[q - 1][jnext - 1] + dissim(jnext, i)
249
-
250
- if (sl < snext) && (p < n - 1)
251
- left += 1
252
- pruned[left] = j
253
- right += 1
254
- elsif (sl < snext) && (p == n - 1)
255
- right += 1
256
- pruned[right] = j
257
- m -= 1
258
- else
259
- if p > 0
260
- pruned[right] = pruned[left]
261
- left -= 1
262
- else
263
- right += 1
264
- end
265
-
266
- m -= 1
267
- end
268
- end
269
-
270
- ((left + 1)...m).each do |r|
271
- pruned[r] = pruned[right]
272
- right += 1
273
- end
274
-
275
- pruned.slice!(m..-1) if pruned.size > m
276
- pruned
277
- end
278
-
279
- def fill_even_positions(imin, imax, istep, q, js)
280
- n = js.size
281
- istepx2 = istep * 2
282
- jl = js[0]
283
-
284
- i = imin
285
- r = 0
286
- while i <= imax
287
- r += 1 while js[r] < jl
288
-
289
- cost[q][i] = cost[q - 1][js[r] - 1] + dissim(js[r], i)
290
- splits[q][i] = js[r]
291
- jh = (i + istep) <= imax ? splits[q][i + istep] : js[n - 1]
292
- jmax = [jh, i].min
293
- sjimin = dissim(jmax, i)
294
-
295
- r += 1
296
- while r < n && js[r] <= jmax
297
- jabs = js[r]
298
-
299
- break if jabs > i
300
-
301
- if jabs < splits[q - 1][i]
302
- r += 1
303
- next
304
- end
305
-
306
- cost_base = cost[q - 1][jabs - 1]
307
- sj = cost_base + dissim(jabs, i)
308
-
309
- if sj <= cost[q][i]
310
- cost[q][i] = sj
311
- splits[q][i] = jabs
312
- elsif cost_base + sjimin > cost[q][i]
313
- break
314
- end
315
-
316
- r += 1
317
27
  end
318
-
319
- r -= 1
320
- jl = jh
321
-
322
- i += istepx2
323
- end
324
28
  end
325
29
  end
326
30
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- VERSION = "1.0.1"
4
+ VERSION = "1.0.3"
5
5
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ckmeans
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Lebedev
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2025-04-22 00:00:00.000000000 Z
10
+ date: 2025-05-01 00:00:00.000000000 Z
11
11
  dependencies: []
12
12
  description: Repeatable clustering of unidimensional data
13
13
  email:
@@ -17,11 +17,13 @@ extensions:
17
17
  - ext/ckmeans/extconf.rb
18
18
  extra_rdoc_files: []
19
19
  files:
20
+ - ".dockerignore"
20
21
  - ".rspec"
21
22
  - ".rubocop.yml"
22
23
  - ".rubocop_todo.yml"
23
24
  - ".ruby-version"
24
25
  - CHANGELOG.md
26
+ - Dockerfile
25
27
  - LICENSE
26
28
  - README.md
27
29
  - Rakefile