ckmeans 0.1.2 → 1.0.0.rc

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 72a2d84963628565eb94d962dc73aa1230f7af8c6948dc6713b0f9582d1bb401
4
- data.tar.gz: ec3cb0c09eaaf38147d1a7dfd1f772b960905f336b801aade402d6f795329d27
3
+ metadata.gz: 0c032b968f4f996b50ea2d63b1624eccd2d6dd4ff4922042143ada4200664216
4
+ data.tar.gz: c8b220a8ebe08b2aebc78cb7c5e347e4d734627aac6ad61da298dc22ee30e884
5
5
  SHA512:
6
- metadata.gz: ceb63e72327d2f3a00aee2c23c7ba8bd63f835d7b0132ce0785f9636b550a79ea4827eadfeb9a6d95c36e9b046ae5005325adeaf2cdd1689444d8f7af181bbc1
7
- data.tar.gz: 0d7d7ca2c942ecb238c7d26cc601d64187ae97218bdd59aeb6ca067d11f08472725e6b3e8ca9021be88d63aa4ae45a5d629a5f8966a3731b7941aa6c0a205619
6
+ metadata.gz: 0fa159e921f89ba73ca478476903ae3a07893ea8cdff2b86ec25605f22f2864c75eb8763c5daf50657b194ed216dec69cfe99e405035f2b83b540e3e5c5c2599
7
+ data.tar.gz: 99f4ea7b2db58fb076325a4b3cd3b5866e3ad35d48c3d89521555b22e14720ac37e422a3ad258c402494cc6c2896a9a48bab4c126d4660912f29caa4a393a28b
data/.rubocop_todo.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2025-03-31 15:04:58 UTC using RuboCop version 1.75.1.
3
+ # on 2025-04-17 07:09:28 UTC using RuboCop version 1.75.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
@@ -11,48 +11,35 @@
11
11
  Metrics/AbcSize:
12
12
  Max: 95
13
13
 
14
- # Offense count: 3
14
+ # Offense count: 2
15
15
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
16
16
  # AllowedMethods: refine
17
17
  Metrics/BlockLength:
18
- Max: 112
18
+ Max: 41
19
19
 
20
20
  # Offense count: 3
21
21
  # Configuration parameters: AllowedMethods, AllowedPatterns.
22
22
  Metrics/CyclomaticComplexity:
23
23
  Max: 10
24
24
 
25
- # Offense count: 5
25
+ # Offense count: 6
26
26
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
27
27
  Metrics/MethodLength:
28
28
  Max: 48
29
29
 
30
- # Offense count: 5
31
- # Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
32
- Metrics/ParameterLists:
33
- Max: 9
34
-
35
30
  # Offense count: 3
36
31
  # Configuration parameters: AllowedMethods, AllowedPatterns.
37
32
  Metrics/PerceivedComplexity:
38
33
  Max: 13
39
34
 
40
- # Offense count: 13
35
+ # Offense count: 12
41
36
  # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
42
37
  # AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
43
38
  Naming/MethodParameterName:
44
39
  Exclude:
45
40
  - 'lib/ckmeans/clusterer.rb'
46
41
 
47
- # Offense count: 1
48
- # Configuration parameters: EnforcedStyle, CheckMethodNames, CheckSymbols, AllowedIdentifiers, AllowedPatterns.
49
- # SupportedStyles: snake_case, normalcase, non_integer
50
- # AllowedIdentifiers: TLS1_1, TLS1_2, capture3, iso8601, rfc1123_date, rfc822, rfc2822, rfc3339, x86_64
51
- Naming/VariableNumber:
52
- Exclude:
53
- - 'lib/ckmeans/clusterer.rb'
54
-
55
- # Offense count: 6
42
+ # Offense count: 5
56
43
  # This cop supports unsafe autocorrection (--autocorrect-all).
57
44
  # Configuration parameters: EnforcedStyle, AllowedMethods, AllowedPatterns.
58
45
  # SupportedStyles: predicate, comparison
data/Rakefile CHANGED
@@ -2,6 +2,12 @@
2
2
 
3
3
  require "bundler/gem_tasks"
4
4
  require "rspec/core/rake_task"
5
+ require "rake/extensiontask"
6
+
7
+ Rake::ExtensionTask.new("extensions") do |ext|
8
+ ext.lib_dir = "lib/ckmeans"
9
+ ext.ext_dir = "ext/ckmeans"
10
+ end
5
11
 
6
12
  RSpec::Core::RakeTask.new(:spec)
7
13
 
@@ -9,4 +15,4 @@ require "rubocop/rake_task"
9
15
 
10
16
  RuboCop::RakeTask.new
11
17
 
12
- task default: %i[spec rubocop]
18
+ task default: %i[compile spec rubocop]
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mkmf"
4
+
5
+ create_makefile("ckmeans/extensions")
@@ -0,0 +1,704 @@
1
+ #include <stdio.h>
2
+ #include <assert.h>
3
+ #include <math.h>
4
+ #include "ruby.h"
5
+
6
+ typedef struct Arena {
7
+ uint32_t capacity;
8
+ uint32_t offset;
9
+ uint8_t *buffer;
10
+ } Arena;
11
+
12
+ typedef struct MatrixF {
13
+ uint32_t ncols;
14
+ uint32_t nrows;
15
+ long double *values;
16
+ } MatrixF;
17
+
18
+ typedef struct MatrixI {
19
+ uint32_t ncols;
20
+ uint32_t nrows;
21
+ uint32_t *values;
22
+ } MatrixI;
23
+
24
+ typedef struct VectorF {
25
+ uint32_t nvalues;
26
+ long double *values;
27
+ } VectorF;
28
+
29
+ typedef struct VectorI {
30
+ uint32_t nvalues;
31
+ uint32_t *values;
32
+ } VectorI;
33
+
34
+ typedef struct State {
35
+ uint32_t xcount;
36
+ uint32_t kmin;
37
+ uint32_t kmax;
38
+ bool apply_deviation;
39
+ Arena *arena;
40
+ VectorF *xsorted;
41
+ MatrixF *cost;
42
+ MatrixI *splits;
43
+ VectorF *xsum;
44
+ VectorF *xsumsq;
45
+ } State;
46
+
47
+ typedef struct RowParams {
48
+ uint32_t row;
49
+ uint32_t imin;
50
+ uint32_t imax;
51
+ uint32_t istep;
52
+ } RowParams;
53
+
54
+ typedef struct {
55
+ long double mean;
56
+ long double variance;
57
+ } SegmentStats;
58
+
59
+ VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
60
+
61
+ Arena *arena_create(uint32_t);
62
+ void *arena_alloc(Arena*, uint32_t);
63
+ void arena_rewind(Arena*);
64
+ void arena_destroy(Arena*);
65
+
66
+ MatrixF *matrix_create_f(Arena*, uint32_t, uint32_t);
67
+ MatrixI *matrix_create_i(Arena*, uint32_t, uint32_t);
68
+ void matrix_set_f(MatrixF*, uint32_t, uint32_t, long double value);
69
+ long double matrix_get_f(MatrixF*, uint32_t, uint32_t);
70
+ void matrix_inspect_f(MatrixF*);
71
+ void matrix_set_i(MatrixI*, uint32_t, uint32_t, uint32_t value);
72
+ uint32_t matrix_get_i(MatrixI*, uint32_t, uint32_t);
73
+ void matrix_inspect_i(MatrixI*);
74
+
75
+ VectorF *vector_create_f(Arena*, uint32_t);
76
+ void vector_set_f(VectorF*, uint32_t offset, long double value);
77
+ long double vector_get_f(VectorF*, uint32_t offset);
78
+ long double vector_get_diff_f(VectorF*, uint32_t, uint32_t);
79
+ void vector_inspect_f(VectorF*);
80
+ VectorI *vector_create_i(Arena*, uint32_t);
81
+ VectorI *vector_dup_i(VectorI*, Arena*);
82
+ void vector_set_i(VectorI*, uint32_t offset, uint32_t value);
83
+ uint32_t vector_get_i(VectorI*, uint32_t offset);
84
+ void vector_downsize_i(VectorI*, uint32_t);
85
+ void vector_inspect_i(VectorI*);
86
+
87
+ long double dissimilarity(uint32_t, uint32_t, VectorF*, VectorF*);
88
+ void fill_row(State, uint32_t, uint32_t, uint32_t);
89
+ void smawk(State, RowParams, VectorI*);
90
+ void find_min_from_candidates(State, RowParams, VectorI*);
91
+ VectorI *prune_candidates(State, RowParams, VectorI*);
92
+ void fill_even_positions(State, RowParams, VectorI*);
93
+ SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
94
+ VectorI *backtrack_sizes(State, uint32_t);
95
+ uint32_t find_koptimal(State);
96
+
97
+ void Init_extensions(void) {
98
+ VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
99
+ VALUE clusterer_class = rb_const_get(ckmeans_module, rb_intern("Clusterer"));
100
+
101
+ rb_define_private_method(clusterer_class, "sorted_group_sizes", rb_ckmeans_sorted_group_sizes, 0);
102
+ }
103
+
104
+ # define ARENA_MIN_CAPACITY 1024
105
+ # define ALLOCATION_FACTOR 20
106
+ # define PIx2 (M_PI * 2.0)
107
+
108
+ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
109
+ VALUE rb_xcount = rb_ivar_get(self, rb_intern("@xcount"));
110
+ VALUE rb_kmin = rb_ivar_get(self, rb_intern("@kmin"));
111
+ VALUE rb_kmax = rb_ivar_get(self, rb_intern("@kmax"));
112
+ VALUE rb_xsorted = rb_ivar_get(self, rb_intern("@xsorted"));
113
+ VALUE rb_apply_bic_deviation = rb_ivar_get(self, rb_intern("@apply_bic_deviation"));
114
+ uint32_t xcount = NUM2UINT(rb_xcount);
115
+ uint32_t kmin = NUM2UINT(rb_kmin);
116
+ uint32_t kmax = NUM2UINT(rb_kmax);
117
+ bool apply_deviation = RTEST(rb_apply_bic_deviation);
118
+ Arena *arena = arena_create(sizeof(int) * xcount * kmax * ALLOCATION_FACTOR);
119
+
120
+ if (arena == NULL) {
121
+ return Qnil;
122
+ }
123
+
124
+ MatrixF *cost = matrix_create_f(arena, kmax, xcount);
125
+ MatrixI *splits = matrix_create_i(arena, kmax, xcount);
126
+ VectorF *xsorted = vector_create_f(arena, xcount);
127
+ /* TODO: pack sums into one vector of pairs */
128
+ VectorF *xsum = vector_create_f(arena, xcount);
129
+ VectorF *xsumsq = vector_create_f(arena, xcount);
130
+
131
+ for (uint32_t i = 0; i < xcount; i++) {
132
+ long double xi = NUM2DBL(rb_ary_entry(rb_xsorted, i));
133
+ vector_set_f(xsorted, i, xi);
134
+ }
135
+
136
+ State state = {
137
+ .arena = arena,
138
+ .xcount = xcount,
139
+ .kmin = kmin,
140
+ .kmax = kmax,
141
+ .apply_deviation = apply_deviation,
142
+ .xsorted = xsorted,
143
+ .cost = cost,
144
+ .splits = splits,
145
+ .xsum = xsum,
146
+ .xsumsq = xsumsq
147
+ };
148
+
149
+
150
+ long double shift = vector_get_f(xsorted, xcount / 2);
151
+ long double diff_initial = vector_get_f(xsorted, 0) - shift;
152
+
153
+ vector_set_f(xsum, 0, diff_initial);
154
+ vector_set_f(xsumsq, 0, diff_initial * diff_initial);
155
+
156
+ for (uint32_t i = 1; i < xcount; i++) {
157
+ long double xi = vector_get_f(xsorted, i);
158
+ long double xsum_prev = vector_get_f(xsum, i - 1);
159
+ long double xsumsq_prev = vector_get_f(xsumsq, i - 1);
160
+ long double diff = xi - shift;
161
+
162
+ vector_set_f(xsum, i, xsum_prev + diff);
163
+ vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
164
+ matrix_set_f(cost, 0, i, dissimilarity(0, i, xsum, xsumsq));
165
+ matrix_set_i(splits, 0, i, 0);
166
+ }
167
+
168
+ for (uint32_t q = 1; q <= kmax - 1; q++) {
169
+ uint32_t imin = (q < kmax - 1) ? ((q > 1) ? q : 1) : xcount - 1;
170
+ fill_row(state, q, imin, xcount - 1);
171
+ }
172
+
173
+ uint32_t koptimal = find_koptimal(state);
174
+
175
+ VectorI *sizes = backtrack_sizes(state, koptimal);
176
+
177
+ /* printf("XSORTED \t"); vector_inspect_f(xsorted); */
178
+ /* printf("K OPTIMAL: %lld\n", koptimal); */
179
+ /* printf("SIZES \t"); vector_inspect_i(sizes); */
180
+ /* printf("FINAL COST\n"); matrix_inspect_f(cost); */
181
+ /* printf("FINAL SPLITS\n"); matrix_inspect_i(splits); */
182
+
183
+ VALUE response = rb_ary_new2(sizes->nvalues);
184
+ for (uint32_t i = 0; i < sizes->nvalues; i++) {
185
+ VALUE size = LONG2NUM(vector_get_i(sizes, i));
186
+ rb_ary_store(response, i, size);
187
+ }
188
+
189
+ arena_destroy(arena);
190
+
191
+ return response;
192
+ }
193
+
194
+ uint32_t find_koptimal(State state)
195
+ {
196
+ uint32_t kmin = state.kmin;
197
+ uint32_t kmax = state.kmax;
198
+ uint32_t xcount = state.xcount;
199
+ uint32_t kopt = kmin;
200
+ uint32_t xindex_max = state.xcount - 1;
201
+ VectorF *xsorted = state.xsorted;
202
+ long double x0 = vector_get_f(xsorted, 0);
203
+ long double xn = vector_get_f(xsorted, xindex_max);
204
+ long double max_bic = 0.0;
205
+ long double adjustment = state.apply_deviation ? 0.0 : 1.0;
206
+
207
+ for (uint32_t k = kmin; k <= kmax; k++) {
208
+ uint32_t index_right, index_left = 0;
209
+ long double bin_left, bin_right, loglikelihood = 0.0;
210
+ VectorI *sizes = backtrack_sizes(state, k);
211
+
212
+ for (uint32_t kb = 0; kb < k; kb++) {
213
+ uint32_t npoints = vector_get_i(sizes, kb);
214
+ index_right = index_left + npoints - 1;
215
+ long double xleft = vector_get_f(xsorted, index_left);
216
+ long double xright = vector_get_f(xsorted, index_right);
217
+ bin_left = xleft;
218
+ bin_right = xright;
219
+
220
+ if (xleft == xright) {
221
+ bin_left = index_left == 0
222
+ ? x0
223
+ : (vector_get_f(xsorted, index_left - 1) + xleft) / 2;
224
+ bin_right = index_right < xindex_max
225
+ ? (xright + vector_get_f(xsorted, index_right + 1)) / 2
226
+ : xn;
227
+ }
228
+
229
+ long double bin_width = bin_right - bin_left;
230
+ SegmentStats stats = shifted_data_variance(xsorted, index_left, index_right);
231
+ long double mean = stats.mean;
232
+ long double variance = stats.variance;
233
+
234
+ if (variance > 0) {
235
+ for (uint32_t i = index_left; i <= index_right; i++) {
236
+ long double xi = vector_get_f(xsorted, i);
237
+ loglikelihood += -(xi - mean) * (xi - mean) / (2.0 * variance);
238
+ }
239
+ loglikelihood += npoints * (
240
+ (log(npoints / (long double) xcount) * adjustment) -
241
+ (0.5 * log(PIx2 * variance))
242
+ );
243
+ } else {
244
+ loglikelihood += npoints * log(1.0 / bin_width / xcount);
245
+ }
246
+
247
+ index_left = index_right + 1;
248
+ }
249
+
250
+ long double bic = (2.0 * loglikelihood) - (((3 * k) - 1) * log((long double) xcount));
251
+
252
+ if (k == kmin) {
253
+ max_bic = bic;
254
+ kopt = kmin;
255
+ } else if (bic > max_bic) {
256
+ max_bic = bic;
257
+ kopt = k;
258
+ }
259
+ }
260
+
261
+ return kopt;
262
+ }
263
+
264
+ VectorI *backtrack_sizes(State state, uint32_t k)
265
+ {
266
+ MatrixI *splits = state.splits;
267
+ VectorI *sizes = vector_create_i(state.arena, k);
268
+ uint32_t xcount = state.xcount;
269
+ uint32_t right = xcount - 1;
270
+ uint32_t left = 0;
271
+
272
+ // Common case works with `i` remaining unsigned and unconditional assignment of the next `left` and `right`
273
+ for (uint32_t i = k - 1; i > 0; i--, right = left - 1) {
274
+ left = matrix_get_i(splits, i, right);
275
+ vector_set_i(sizes, i, right - left + 1);
276
+ }
277
+ left = matrix_get_i(splits, 0, right);
278
+ vector_set_i(sizes, 0, right - left + 1);
279
+
280
+ return sizes;
281
+ }
282
+
283
+ SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t right) /* Mean and variance of xsorted[left..right], inclusive. */
284
+ {
285
+ const uint32_t n = right - left + 1; /* number of elements in the inclusive range */
286
+ long double sum = 0.0;
287
+ long double sumsq = 0.0;
288
+ SegmentStats stats = { .mean = 0.0, .variance = 0.0 }; /* returned unchanged when right < left */
289
+
290
+ if (right >= left) {
291
+ const long double median = vector_get_f(xsorted, (left + right) / 2); /* middle element, used as the shift subtracted from every value */
292
+
293
+ for (uint32_t i = left; i <= right; i++) {
294
+ const long double sumi = vector_get_f(xsorted, i) - median; /* shifted value */
295
+
296
+ sum += sumi;
297
+ sumsq += sumi * sumi;
298
+ }
299
+
300
+ stats.mean = (sum / n) + median; /* undo the shift to get the mean of the raw values */
301
+ if (n > 1) { /* variance stays 0.0 for a single element */
302
+ stats.variance = (sumsq - (sum * sum / n)) / (n - 1); /* divides by n - 1 */
303
+ }
304
+ }
305
+
306
+ return stats;
307
+ }
308
+
309
+ void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax) {
310
+ uint32_t size = imax - q + 1;
311
+ VectorI *split_candidates = vector_create_i(state.arena, size);
312
+ for (uint32_t i = 0; i < size; i++) {
313
+ vector_set_i(split_candidates, i, q + i);
314
+ }
315
+ RowParams rparams = { .row = q, .imin = imin, .imax = imax, .istep = 1 };
316
+ smawk(state, rparams, split_candidates);
317
+ }
318
+
319
+ void smawk(State state, RowParams rparams, VectorI *split_candidates) {
320
+ const uint32_t imin = rparams.imin;
321
+ const uint32_t imax = rparams.imax;
322
+ const uint32_t istep = rparams.istep;
323
+
324
+ if ((imax - imin) <= (0 * istep)) {
325
+ find_min_from_candidates(state, rparams, split_candidates);
326
+ } else {
327
+ VectorI *odd_candidates = prune_candidates(state, rparams, split_candidates);
328
+ /* printf("PRUNED\t"); vector_inspect_i(odd_candidates); */
329
+ uint32_t istepx2 = istep * 2;
330
+ uint32_t imin_odd = imin + istep;
331
+ uint32_t imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2);
332
+ RowParams rparams_odd = { .row = rparams.row, .imin = imin_odd, .imax = imax_odd, .istep = istepx2 };
333
+
334
+ smawk(state, rparams_odd, odd_candidates);
335
+ fill_even_positions(state, rparams, split_candidates);
336
+ }
337
+ }
338
+
339
+ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
340
+ {
341
+ uint32_t row = rparams.row;
342
+ uint32_t imin = rparams.imin;
343
+ uint32_t imax = rparams.imax;
344
+ uint32_t istep = rparams.istep;
345
+ uint32_t n = split_candidates->nvalues;
346
+ uint32_t istepx2 = istep * 2;
347
+ uint32_t jl = vector_get_i(split_candidates, 0);
348
+ VectorF *xsum = state.xsum;
349
+ VectorF *xsumsq = state.xsumsq;
350
+ MatrixI *splits = state.splits;
351
+
352
+ for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
353
+ while (vector_get_i(split_candidates, r) < jl) r++;
354
+
355
+ uint32_t rcandidate = vector_get_i(split_candidates, r);
356
+ uint32_t cost_base_row = row - 1;
357
+ uint32_t cost_base_col = rcandidate - 1;
358
+ long double cost =
359
+ matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissimilarity(rcandidate, i, xsum, xsumsq);
360
+
361
+ matrix_set_f(state.cost, row, i, cost);
362
+ matrix_set_i(state.splits, row, i, rcandidate);
363
+
364
+ uint32_t jh =
365
+ (i + istep) <= imax
366
+ ? matrix_get_i(splits, row, i + istep)
367
+ : vector_get_i(split_candidates, n - 1);
368
+
369
+ uint32_t jmax = jh < i ? jh : i;
370
+ long double sjimin = dissimilarity(jmax, i, xsum, xsumsq);
371
+
372
+ for (++r; r < n && vector_get_i(split_candidates, r) <= jmax; r++) {
373
+ uint32_t jabs = vector_get_i(split_candidates, r);
374
+
375
+ if (jabs > i) break;
376
+ if (jabs < matrix_get_i(splits, row - 1, i)) continue;
377
+
378
+ long double cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
379
+ long double sj = cost_base + dissimilarity(jabs, i, xsum, xsumsq);
380
+ long double cost_prev = matrix_get_f(state.cost, row, i);
381
+
382
+ if (sj <= cost_prev) {
383
+ matrix_set_f(state.cost, row, i, sj);
384
+ matrix_set_i(state.splits, row, i, jabs);
385
+ } else if (cost_base + sjimin > cost_prev) {
386
+ break;
387
+ }
388
+ }
389
+
390
+ r--;
391
+ jl = jh;
392
+ }
393
+ }
394
+
395
+ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates) /* Fills cost/splits of row `rparams.row` for columns imin..imax (step istep) by scanning split_candidates. */
396
+ {
397
+ const uint32_t row = rparams.row;
398
+ const uint32_t imin = rparams.imin;
399
+ const uint32_t imax = rparams.imax;
400
+ const uint32_t istep = rparams.istep;
401
+ MatrixF *const cost = state.cost;
402
+ MatrixI *const splits = state.splits;
403
+
404
+ uint32_t optimal_split_idx_prev = 0; /* candidate index of the last accepted split; seeds the next column */
405
+
406
+ for (uint32_t i = imin; i <= imax; i += istep)
407
+ {
408
+ const uint32_t optimal_split_idx = optimal_split_idx_prev;
409
+ const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
410
+ const long double cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1); /* must stay long double: an integer type would truncate the seed cost */
411
+ const long double added_cost = dissimilarity(optimal_split, i, state.xsum, state.xsumsq);
412
+
413
+ matrix_set_f(cost, row, i, cost_prev + added_cost); /* seed cell (row, i) with the carried-over candidate */
414
+ matrix_set_i(splits, row, i, optimal_split);
415
+
416
+ for (uint32_t r = optimal_split_idx + 1; r < split_candidates->nvalues; r++)
417
+ {
418
+ uint32_t split = vector_get_i(split_candidates, r);
419
+
420
+ if (split < matrix_get_i(splits, row - 1, i)) continue; /* skip candidates below the previous row's split for this column */
421
+ if (split > i) break; /* stop once the candidate lies past column i */
422
+
423
+ long double split_cost =
424
+ matrix_get_f(cost, row - 1, split - 1) + dissimilarity(split, i, state.xsum, state.xsumsq);
425
+
426
+ if (split_cost > matrix_get_f(cost, row, i)) continue; /* strictly worse than the current best: keep looking */
427
+
428
+ matrix_set_f(cost, row, i, split_cost); /* ties are accepted, so the later candidate wins */
429
+ matrix_set_i(splits, row, i, split);
430
+ optimal_split_idx_prev = r;
431
+ }
432
+ }
433
+ }
434
+
435
+ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
436
+ {
437
+ uint32_t n = ((rparams.imax - rparams.imin) / rparams.istep) + 1;
438
+ uint32_t m = split_candidates->nvalues;
439
+
440
+ if (n >= m) return split_candidates;
441
+
442
+ uint32_t left = -1;
443
+ uint32_t right = 0;
444
+ VectorI *pruned = vector_dup_i(split_candidates, state.arena);
445
+
446
+ while (m > n)
447
+ {
448
+ uint32_t p = left + 1;
449
+ uint32_t i = rparams.imin + p * rparams.istep;
450
+ uint32_t j = vector_get_i(pruned, right);
451
+ uint32_t jnext = vector_get_i(pruned, right + 1);
452
+ long double sl =
453
+ matrix_get_f(state.cost, rparams.row - 1, j - 1) + dissimilarity(j, i, state.xsum, state.xsumsq);
454
+ long double snext =
455
+ matrix_get_f(state.cost, rparams.row - 1, jnext - 1) + dissimilarity(jnext, i, state.xsum, state.xsumsq);
456
+
457
+ if ((sl < snext) && (p < n - 1)) {
458
+ left++;
459
+ right++;
460
+ vector_set_i(pruned, left, j);
461
+ } else if ((sl < snext) && (p == n - 1)) {
462
+ right++;
463
+ m--;
464
+ vector_set_i(pruned, right, j);
465
+ } else {
466
+ if (p > 0) {
467
+ /* TODO: extract `vector_setcpy_T` */
468
+ vector_set_i(pruned, right, vector_get_i(pruned, left));
469
+ left--;
470
+ } else {
471
+ right++;
472
+ }
473
+
474
+ m--;
475
+ }
476
+ }
477
+
478
+ for (uint32_t i = left + 1; i < m; i++) {
479
+ /* TODO: extract `vector_setcpy_T` */
480
+ vector_set_i(pruned, i, vector_get_i(pruned, right++));
481
+ }
482
+
483
+ vector_downsize_i(pruned, m);
484
+
485
+ return pruned;
486
+ }
487
+
488
+ long double dissimilarity(uint32_t j, uint32_t i, VectorF *xsum, VectorF *xsumsq) { /* Cost of the segment j..i: sum of squares minus (sum^2 / count), never negative. */
489
+ long double sji = 0.0;
490
+
491
+ if (j >= i) return sji; /* segments of at most one element cost 0.0 */
492
+
493
+ if (j > 0) {
494
+ /* TODO: looks more like `segment_delta` */
495
+ long double segment_sum = vector_get_diff_f(xsum, i, j - 1); /* xsum[i] - xsum[j - 1] */
496
+ uint32_t segment_size = i - j + 1;
497
+ sji = vector_get_diff_f(xsumsq, i, j - 1) - (segment_sum * segment_sum / segment_size);
498
+ } else { /* j == 0: no entry before the segment to subtract */
499
+ long double xsumi = vector_get_f(xsum, i);
500
+ sji = vector_get_f(xsumsq, i) - (xsumi * xsumi / (i + 1));
501
+ }
502
+
503
+ return (sji > 0) ? sji : 0.0; /* clamp negative results to 0.0 */
504
+ }
505
+
506
+ VectorF *vector_create_f(Arena *arena, uint32_t nvalues) {
507
+ VectorF *v;
508
+
509
+ v = arena_alloc(arena, sizeof(*v));
510
+ v->values = arena_alloc(arena, sizeof(*(v->values)) * nvalues);
511
+ v->nvalues = nvalues;
512
+
513
+ return v;
514
+ }
515
+
516
+ VectorI *vector_create_i(Arena *arena, uint32_t nvalues) {
517
+ VectorI *v;
518
+
519
+ v = arena_alloc(arena, sizeof(*v));
520
+ v->values = arena_alloc(arena, sizeof(*(v->values)) * nvalues);
521
+ v->nvalues = nvalues;
522
+
523
+ return v;
524
+ }
525
+
526
+ VectorI *vector_dup_i(VectorI *v, Arena *arena)
527
+ {
528
+ VectorI *vdup = vector_create_i(arena, v->nvalues);
529
+
530
+ /* TODO: use one memcpy call */
531
+ for (uint32_t i = 0; i < v->nvalues; i++) {
532
+ vector_set_i(vdup, i, vector_get_i(v, i));
533
+ }
534
+
535
+ return vdup;
536
+ }
537
+
538
+ void vector_set_f(VectorF *v, uint32_t offset, long double value) {
539
+ assert(offset < v->nvalues && "[vector_set_f] element index should be less than nvalues");
540
+
541
+ *(v->values + offset) = value;
542
+ }
543
+
544
+ void vector_set_i(VectorI *v, uint32_t offset, uint32_t value) {
545
+ assert(offset < v->nvalues && "[vector_set_i] element index should be less than nvalues");
546
+
547
+ *(v->values + offset) = value;
548
+ }
549
+
550
+ uint32_t vector_get_i(VectorI *v, uint32_t offset) {
551
+ assert(offset < v->nvalues && "[vector_get_i] element index should be less than nvalues");
552
+
553
+ return *(v->values + offset);
554
+ }
555
+
556
+ void vector_downsize_i(VectorI *v, uint32_t new_size) {
557
+ v->nvalues = new_size;
558
+ }
559
+
560
+ void vector_inspect_i(VectorI *v) {
561
+ for (uint32_t i = 0; i < v->nvalues - 1; i++)
562
+ printf("%u, ", vector_get_i(v, i));
563
+ printf("%u\n", vector_get_i(v, v->nvalues - 1));
564
+ }
565
+
566
+ long double vector_get_f(VectorF *v, uint32_t offset) {
567
+ assert(offset < v->nvalues && "[vector_get_f] element index should be less than nvalues");
568
+
569
+ return *(v->values + offset);
570
+ }
571
+
572
+ long double vector_get_diff_f(VectorF *v, uint32_t i, uint32_t j) {
573
+ assert(i < v->nvalues && "[vector_get_diff_f] i should be less than nvalues");
574
+ assert(j < v->nvalues && "[vector_get_diff_f] j should be less than nvalues");
575
+
576
+ return *(v->values + i) - *(v->values + j);
577
+ }
578
+
579
+ void vector_inspect_f(VectorF *v) {
580
+ for (uint32_t i = 0; i < v->nvalues - 1; i++)
581
+ printf("%Lf, ", vector_get_f(v, i));
582
+ printf("%Lf\n", vector_get_f(v, v->nvalues - 1));
583
+ }
584
+
585
+ MatrixF *matrix_create_f(Arena *arena, uint32_t nrows, uint32_t ncols) {
586
+ MatrixF *m;
587
+
588
+ m = arena_alloc(arena, sizeof(*m));
589
+ m->values = arena_alloc(arena, sizeof(*(m->values)) * ncols * nrows);
590
+ m->ncols = ncols;
591
+ m->nrows = nrows;
592
+
593
+ return m;
594
+ }
595
+
596
+ MatrixI *matrix_create_i(Arena *arena, uint32_t nrows, uint32_t ncols) {
597
+ MatrixI *m;
598
+
599
+ m = arena_alloc(arena, sizeof(*m));
600
+ m->values = arena_alloc(arena, sizeof(*(m->values)) * ncols * nrows);
601
+ m->ncols = ncols;
602
+ m->nrows = nrows;
603
+
604
+ return m;
605
+ }
606
+
607
+ void matrix_set_f(MatrixF *m, uint32_t i, uint32_t j, long double value) { /* Stores value at row i, column j. */
608
+ assert(i < m->nrows && "[matrix_set_f] row offset should be less than nrows");
609
+ assert(j < m->ncols && "[matrix_set_f] col offset should be less than ncols"); /* the struct field is `ncols` */
610
+
611
+ uint32_t offset = i * m->ncols + j; /* rows are stored one after another, ncols values each */
612
+ *(m->values + offset) = value;
613
+ }
614
+
615
+ long double matrix_get_f(MatrixF *m, uint32_t i, uint32_t j) { /* Returns the value at row i, column j. */
616
+ assert(i < m->nrows && "[matrix_get_f] row offset should be less than nrows");
617
+ assert(j < m->ncols && "[matrix_get_f] col offset should be less than ncols"); /* the struct field is `ncols` */
618
+
619
+ uint32_t offset = i * m->ncols + j; /* rows are stored one after another, ncols values each */
620
+ return *(m->values + offset);
621
+ }
622
+
623
+ void matrix_inspect_f(MatrixF *m) {
624
+ for (uint32_t i = 0; i < m->nrows; i++) {
625
+ for (uint32_t j = 0; j < m->ncols - 1; j++) {
626
+ long double value = matrix_get_f(m, i, j);
627
+
628
+ printf("%Lf, ", value);
629
+ }
630
+ printf("%Lf\n", matrix_get_f(m, i, m->ncols - 1));
631
+ }
632
+ }
633
+
634
+ void matrix_inspect_i(MatrixI *m) {
635
+ for (uint32_t i = 0; i < m->nrows; i++) {
636
+ for (uint32_t j = 0; j < m->ncols - 1; j++)
637
+ printf("%u, ", matrix_get_i(m, i, j));
638
+ printf("%u\n", matrix_get_i(m, i, m->ncols - 1));
639
+ }
640
+ }
641
+
642
+ void matrix_set_i(MatrixI *m, uint32_t i, uint32_t j, uint32_t value) { /* Stores value at row i, column j. */
643
+ assert(i < m->nrows && "[matrix_set_i] row offset should be less than nrows");
644
+ assert(j < m->ncols && "[matrix_set_i] col offset should be less than ncols"); /* the struct field is `ncols` */
645
+
646
+ uint32_t offset = i * m->ncols + j; /* rows are stored one after another, ncols values each */
647
+ *(m->values + offset) = value;
648
+ }
649
+
650
+ uint32_t matrix_get_i(MatrixI *m, uint32_t i, uint32_t j) { /* Returns the value at row i, column j. */
651
+ assert(i < m->nrows && "[matrix_get_i] row offset should be less than nrows");
652
+ assert(j < m->ncols && "[matrix_get_i] col offset should be less than ncols"); /* the struct field is `ncols` */
653
+
654
+ uint32_t offset = i * m->ncols + j; /* rows are stored one after another, ncols values each */
655
+ return *(m->values + offset);
656
+ }
657
+
658
+ Arena *arena_create(uint32_t capacity) {
659
+ if (capacity < ARENA_MIN_CAPACITY) {
660
+ capacity = ARENA_MIN_CAPACITY;
661
+ }
662
+
663
+ Arena *arena;
664
+
665
+ arena = malloc(sizeof(*arena));
666
+ if (!arena) {
667
+ printf("Failed to allocate arena\n");
668
+ return NULL;
669
+ }
670
+
671
+ arena->buffer = calloc(1, capacity);
672
+ if (!arena->buffer) {
673
+ printf("Failed to allocate arena\n");
674
+ free(arena);
675
+ return NULL;
676
+ }
677
+
678
+ arena->capacity = capacity;
679
+ arena->offset = 0;
680
+
681
+ printf("[Arena Created] Capacity: %u, offset: %u\n", arena->capacity, arena->offset);
682
+
683
+ return arena;
684
+ }
685
+
686
+ void *arena_alloc(Arena *arena, uint32_t size) { /* Bump-allocates `size` bytes (rounded up to a multiple of 8) from the arena; returns NULL when they do not fit. */
687
+ const uint32_t padded = (size + 7) & ~(uint32_t)7; /* round up to 8; wraps to a value below `size` when size + 7 overflows */
688
+
689
+ if (padded < size || padded > arena->capacity - arena->offset) { /* wrap-safe: compare against the remaining space instead of adding to offset */
690
+ printf("Arena Out Of Memory\n");
691
+ return NULL;
692
+ }
693
+
694
+ void *ptr = arena->buffer + arena->offset;
695
+ arena->offset += padded; /* cannot exceed capacity after the check above */
696
+
697
+ return ptr;
698
+ }
699
+
700
+ void arena_destroy(Arena *arena) {
701
+ printf("[Arena Destroy] Capacity: %u, offset: %u, left: %u\n", arena->capacity, arena->offset, arena->capacity - arena->offset);
702
+ free(arena->buffer);
703
+ free(arena);
704
+ }
@@ -2,8 +2,6 @@
2
2
 
3
3
  module Ckmeans
4
4
  class Clusterer # rubocop:disable Style/Documentation, Metrics/ClassLength
5
- attr_reader :xcount, :xsorted, :kmin, :kmax, :smat, :jmat, :kestimate
6
-
7
5
  PI_DOUBLE = Math::PI * 2
8
6
 
9
7
  def initialize(entries, kmin, kmax = kmin, kestimate = :regular)
@@ -12,85 +10,94 @@ module Ckmeans
12
10
  raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
13
11
  raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
14
12
 
15
- @kmin = kmin
16
- @unique_xcount = entries.uniq.size
17
- @kmax = [@unique_xcount, kmax].min
18
- @xsorted = entries.sort
19
- @kestimate = kestimate
13
+ @kmin = kmin
14
+ @unique_xcount = entries.uniq.size
15
+ @kmax = [@unique_xcount, kmax].min
16
+ @xsorted_original = entries.sort
17
+ @xsorted = @xsorted_original.map(&:to_f)
18
+ @apply_bic_deviation = kestimate == :sensitive
20
19
  end
21
20
 
22
21
  def clusters
23
22
  @clusters ||=
24
23
  if @unique_xcount <= 1
25
- [xsorted]
24
+ [@xsorted_original]
26
25
  else
27
- @smat = Array.new(kmax) { Array.new(xcount) { 0.0 } }
28
- @jmat = Array.new(kmax) { Array.new(xcount) { 0 } }
29
-
30
- kappa = kmax
31
- n = xcount
32
- xsum = Array.new(n)
33
- xsumsq = Array.new(n)
34
- shift = xsorted[n / 2]
35
- xsum[0] = xsorted[0] - shift
26
+ sorted_group_sizes.each_with_object([]) do |size, groups|
27
+ groups << @xsorted_original.shift(size)
28
+ end
29
+
30
+ =begin # rubocop:disable Style/BlockComments
31
+ @cost = Array.new(kmax) { Array.new(xcount) { 0.0 } }
32
+ @splits = Array.new(kmax) { Array.new(xcount) { 0 } }
33
+ @xsum = Array.new(xcount)
34
+ @xsumsq = Array.new(xcount)
35
+
36
+ shift = xsorted[xcount / 2]
37
+ xsum[0] = xsorted[0].to_f - shift
36
38
  xsumsq[0] = xsum[0]**2
37
- 1.upto(n - 1) do |i|
38
- xsum[i] = xsum[i - 1] + xsorted[i] - shift
39
- xsumsq[i] = xsumsq[i - 1] + ((xsorted[i] - shift) * (xsorted[i] - shift))
40
- smat[0][i] = dissim(0, i, xsum, xsumsq)
41
- jmat[0][i] = 0
39
+
40
+ 1.upto(xcount - 1) do |i|
41
+ xf = xsorted[i].to_f
42
+ xsum[i] = xsum[i - 1] + xf - shift
43
+ xsumsq[i] = xsumsq[i - 1] + ((xf - shift) * (xf - shift))
44
+ cost[0][i] = dissim(0, i)
45
+ splits[0][i] = 0
42
46
  end
43
47
 
44
- kappa_dec = kappa - 1
45
- 1.upto(kappa_dec) do |q|
46
- imin = q < kappa_dec ? [1, q].max : n - 1
47
- fill_row(q, imin, n - 1, xsum, xsumsq)
48
+ kmax_idx = kmax - 1
49
+ 1.upto(kmax_idx) do |q|
50
+ imin = q < kmax_idx ? [1, q].max : xcount - 1
51
+ fill_row(q, imin, xcount - 1)
48
52
  end
49
53
 
50
54
  kopt = koptimal
51
55
 
52
- results = []
53
- backtrack(kopt) do |q, left, right|
54
- results[q] = xsorted[left..right]
56
+ puts "RB COST\n", cost.map(&:inspect)
57
+ puts "RB SPLITS\n", splits.map(&:inspect)
58
+ puts "RB K OPTIMAL: #{kopt}"
59
+
60
+ backtrack(kopt).each_with_object(Array.new(kopt)) do |(q, left, right), res|
61
+ res[q] = xsorted[left..right]
55
62
  end
56
- results
63
+ =end
57
64
  end
58
65
  end
59
66
 
60
67
  private
61
68
 
62
- def koptimal
63
- kopt = kmin
64
- n = xcount
65
- max_bic = 0.0
69
+ attr_reader :cost, :splits, :xsum, :xsumsq, :xcount, :xsorted, :kmin, :kmax
66
70
 
67
- # Deviation from BIC formula to favor smaller clusters
68
- adjustment = kestimate == :sensitive ? 0.0 : 1.0
71
+ def koptimal # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
72
+ kopt = kmin
73
+ n = xcount
74
+ max_bic = 0.0
75
+ adjustment = kestimate == :sensitive ? 0.0 : 1.0 # Deviation from BIC formula to favor smaller clusters
69
76
 
70
77
  kmin.upto(kmax) do |k|
71
- sizes = Array.new(k)
72
- backtrack(k) { |q, left, right| sizes[q] = right - left + 1 }
73
- index_left = 0
74
- index_right = nil
78
+ sizes = backtrack(k).each_with_object(Array.new(k)) { |(q, left, right), sz| sz[q] = right - left + 1 }
79
+
80
+ index_left = 0
81
+ index_right = nil
75
82
  loglikelihood = 0.0
76
- bin_left = nil
77
- bin_right = nil
83
+ bin_left = nil
84
+ bin_right = nil
78
85
 
79
86
  k.times do |kb|
80
87
  num_points_in_bin = sizes[kb]
81
88
  index_right = index_left + num_points_in_bin - 1
82
89
 
83
90
  if xsorted[index_left] < xsorted[index_right]
84
- bin_left = xsorted[index_left]
91
+ bin_left = xsorted[index_left]
85
92
  bin_right = xsorted[index_right]
86
93
  elsif xsorted[index_left] == xsorted[index_right]
87
- bin_left = index_left == 0 ? xsorted[0] : (xsorted[index_left - 1] + xsorted[index_left]) / 2.0
94
+ bin_left = index_left == 0 ? xsorted[0] : (xsorted[index_left - 1] + xsorted[index_left]) / 2.0
88
95
  bin_right = index_right < n - 1 ? (xsorted[index_right] + xsorted[index_right + 1]) / 2.0 : xsorted[n - 1]
89
96
  else
90
97
  raise "ERROR: binLeft > binRight"
91
98
  end
92
99
 
93
- bin_width = bin_right - bin_left
100
+ bin_width = bin_right.to_f - bin_left
94
101
 
95
102
  mean, variance = shifted_data_variance(index_left, index_right)
96
103
 
@@ -112,10 +119,10 @@ module Ckmeans
112
119
 
113
120
  if k == kmin
114
121
  max_bic = bic
115
- kopt = kmin
122
+ kopt = kmin
116
123
  elsif bic > max_bic
117
124
  max_bic = bic
118
- kopt = k
125
+ kopt = k
119
126
  end
120
127
  end
121
128
 
@@ -123,22 +130,22 @@ module Ckmeans
123
130
  end
124
131
 
125
132
  def shifted_data_variance(ileft, iright)
126
- sum = 0.0
127
- sumsq = 0.0
128
- mean = 0.0
133
+ sum = 0.0
134
+ sumsq = 0.0
135
+ mean = 0.0
129
136
  variance = 0.0
130
- n = iright - ileft + 1
137
+ n = iright - ileft + 1
131
138
 
132
139
  if iright >= ileft
133
- median = xsorted[(ileft + iright) / 2]
140
+ median = xsorted[(ileft + iright) / 2].to_f
134
141
 
135
142
  ileft.upto(iright) do |i|
136
- sumi = xsorted[i] - median
137
- sum += sumi
143
+ sumi = xsorted[i] - median
144
+ sum += sumi
138
145
  sumsq += sumi**2
139
146
  end
140
147
 
141
- mean = (sum / n) + median
148
+ mean = (sum / n) + median
142
149
  variance = (sumsq - (sum * sum / n)) / (n - 1) if n > 1
143
150
  end
144
151
 
@@ -146,12 +153,13 @@ module Ckmeans
146
153
  end
147
154
 
148
155
  def backtrack(k)
149
- n = jmat[0].size
150
- right = n - 1
151
- left = nil
156
+ return to_enum(__method__, k) unless block_given?
157
+
158
+ right = xcount - 1
159
+ left = nil
152
160
 
153
161
  (k - 1).downto(0) do |q|
154
- left = jmat[q][right]
162
+ left = splits[q][right]
155
163
 
156
164
  yield q, left, right
157
165
 
@@ -159,7 +167,7 @@ module Ckmeans
159
167
  end
160
168
  end
161
169
 
162
- def dissim(j, i, xsum, xsumsq)
170
+ def dissim(j, i)
163
171
  return 0.0 if j >= i
164
172
 
165
173
  sji =
@@ -174,80 +182,82 @@ module Ckmeans
174
182
  [0, sji].max
175
183
  end
176
184
 
177
- def fill_row(q, imin, imax, xsum, xsumsq)
185
+ def fill_row(q, imin, imax)
178
186
  size = imax - q + 1
179
187
 
180
188
  js = Array.new(size) { |i| q + i }
181
- smawk(imin, imax, 1, q, js, xsum, xsumsq)
189
+ smawk(imin, imax, 1, q, js)
182
190
  end
183
191
 
184
- def smawk(imin, imax, istep, q, js, xsum, xsumsq)
192
+ def smawk(imin, imax, istep, q, js)
185
193
  if (imax - imin) <= (0 * istep)
186
- find_min_from_candidates(q, imin, imax, istep, js, xsum, xsumsq)
194
+ find_min_from_candidates(q, imin, imax, istep, js)
187
195
  else
188
- js_odd = js_reduced(imin, imax, istep, q, js, xsum, xsumsq)
196
+ js_odd = prune_candidates(imin, imax, istep, q, js)
197
+ # puts "Pruned: #{js_odd.inspect}"
189
198
  istepx2 = istep * 2
190
199
  imin_odd = imin + istep
191
200
  imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2)
192
- smawk(imin_odd, imax_odd, istepx2, q, js_odd, xsum, xsumsq)
193
- fill_even_positions(imin, imax, istep, q, js, smat, jmat, xsum, xsumsq)
201
+ smawk(imin_odd, imax_odd, istepx2, q, js_odd)
202
+ fill_even_positions(imin, imax, istep, q, js)
194
203
  end
195
204
  end
196
205
 
197
- def find_min_from_candidates(q, imin, imax, istep, js, xsum, xsumsq)
198
- rmin_prev = 0
206
+ def find_min_from_candidates(q, imin, imax, istep, js)
207
+ optimal_split_index_prev = 0
199
208
 
200
209
  (imin..imax).step(istep) do |i|
201
- rmin = rmin_prev
202
- smat[q][i] = smat[q - 1][js[rmin] - 1] + dissim(js[rmin], i, xsum, xsumsq)
203
- jmat[q][i] = js[rmin]
210
+ optimal_split_index = optimal_split_index_prev
211
+ optimal_split = js[optimal_split_index]
212
+ cost[q][i] = cost[q - 1][optimal_split - 1] + dissim(optimal_split, i)
213
+ splits[q][i] = optimal_split
204
214
 
205
- ((rmin + 1)...js.size).each do |r|
206
- jabs = js[r]
215
+ ((optimal_split_index + 1)...js.size).each do |split_index|
216
+ jabs = js[split_index]
207
217
 
208
- next if jabs < jmat[q - 1][i]
218
+ next if jabs < splits[q - 1][i]
209
219
  break if jabs > i
210
220
 
211
- sj = smat[q - 1][jabs - 1] + dissim(jabs, i, xsum, xsumsq)
221
+ sj = cost[q - 1][jabs - 1] + dissim(jabs, i)
212
222
 
213
- next unless sj <= smat[q][i]
223
+ next unless sj <= cost[q][i]
214
224
 
215
- smat[q][i] = sj
216
- jmat[q][i] = js[r]
217
- rmin_prev = r
225
+ cost[q][i] = sj
226
+ splits[q][i] = js[split_index]
227
+ optimal_split_index_prev = split_index
218
228
  end
219
229
  end
220
230
  end
221
231
 
222
- def js_reduced(imin, imax, istep, q, js, xsum, xsumsq)
232
+ def prune_candidates(imin, imax, istep, q, js)
223
233
  n = ((imax - imin) / istep) + 1
224
234
  m = js.size
225
235
 
226
236
  return js if n >= m
227
237
 
228
- js_red = js.dup
238
+ pruned = js.dup
229
239
  left = -1
230
240
  right = 0
231
241
 
232
242
  while m > n
233
- p = left + 1
234
- i = imin + (p * istep)
235
- j = js_red[right]
236
- sl = smat[q - 1][j - 1] + dissim(j, i, xsum, xsumsq)
237
- jplus1 = js_red[right + 1]
238
- splus1 = smat[q - 1][jplus1 - 1] + dissim(jplus1, i, xsum, xsumsq)
239
-
240
- if (sl < splus1) && (p < n - 1)
243
+ p = left + 1
244
+ i = imin + (p * istep)
245
+ j = pruned[right]
246
+ jnext = pruned[right + 1]
247
+ sl = cost[q - 1][j - 1] + dissim(j, i)
248
+ snext = cost[q - 1][jnext - 1] + dissim(jnext, i)
249
+
250
+ if (sl < snext) && (p < n - 1)
241
251
  left += 1
242
- js_red[left] = j
252
+ pruned[left] = j
243
253
  right += 1
244
- elsif (sl < splus1) && (p == n - 1)
254
+ elsif (sl < snext) && (p == n - 1)
245
255
  right += 1
246
- js_red[right] = j
256
+ pruned[right] = j
247
257
  m -= 1
248
258
  else
249
259
  if p > 0
250
- js_red[right] = js_red[left]
260
+ pruned[right] = pruned[left]
251
261
  left -= 1
252
262
  else
253
263
  right += 1
@@ -258,15 +268,15 @@ module Ckmeans
258
268
  end
259
269
 
260
270
  ((left + 1)...m).each do |r|
261
- js_red[r] = js_red[right]
271
+ pruned[r] = pruned[right]
262
272
  right += 1
263
273
  end
264
274
 
265
- js_red.slice!(m..-1) if js_red.size > m
266
- js_red
275
+ pruned.slice!(m..-1) if pruned.size > m
276
+ pruned
267
277
  end
268
278
 
269
- def fill_even_positions(imin, imax, istep, q, js, smat, jmat, xsum, xsumsq)
279
+ def fill_even_positions(imin, imax, istep, q, js)
270
280
  n = js.size
271
281
  istepx2 = istep * 2
272
282
  jl = js[0]
@@ -276,11 +286,11 @@ module Ckmeans
276
286
  while i <= imax
277
287
  r += 1 while js[r] < jl
278
288
 
279
- smat[q][i] = smat[q - 1][js[r] - 1] + dissim(js[r], i, xsum, xsumsq)
280
- jmat[q][i] = js[r]
281
- jh = ((i + istep) <= imax ? jmat[q][i + istep] : js[n - 1]).to_i
282
- jmax = [jh, i].min.to_i
283
- sjimin = dissim(jmax, i, xsum, xsumsq)
289
+ cost[q][i] = cost[q - 1][js[r] - 1] + dissim(js[r], i)
290
+ splits[q][i] = js[r]
291
+ jh = (i + istep) <= imax ? splits[q][i + istep] : js[n - 1]
292
+ jmax = [jh, i].min
293
+ sjimin = dissim(jmax, i)
284
294
 
285
295
  r += 1
286
296
  while r < n && js[r] <= jmax
@@ -288,18 +298,18 @@ module Ckmeans
288
298
 
289
299
  break if jabs > i
290
300
 
291
- if jabs < jmat[q - 1][i]
301
+ if jabs < splits[q - 1][i]
292
302
  r += 1
293
303
  next
294
304
  end
295
305
 
296
- s = dissim(jabs, i, xsum, xsumsq)
297
- sj = smat[q - 1][jabs - 1] + s
306
+ cost_base = cost[q - 1][jabs - 1]
307
+ sj = cost_base + dissim(jabs, i)
298
308
 
299
- if sj <= smat[q][i]
300
- smat[q][i] = sj
301
- jmat[q][i] = js[r]
302
- elsif smat[q - 1][jabs - 1] + sjimin > smat[q][i]
309
+ if sj <= cost[q][i]
310
+ cost[q][i] = sj
311
+ splits[q][i] = jabs
312
+ elsif cost_base + sjimin > cost[q][i]
303
313
  break
304
314
  end
305
315
 
@@ -314,3 +324,5 @@ module Ckmeans
314
324
  end
315
325
  end
316
326
  end
327
+
328
+ require "ckmeans/extensions"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ckmeans
4
- VERSION = "0.1.2"
4
+ VERSION = "1.0.0.rc"
5
5
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ckmeans
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 1.0.0.rc
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Lebedev
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2025-03-31 00:00:00.000000000 Z
10
+ date: 2025-04-22 00:00:00.000000000 Z
11
11
  dependencies: []
12
12
  description: Repeatable clustering of unidimensional data
13
13
  email:
@@ -24,6 +24,8 @@ files:
24
24
  - LICENSE
25
25
  - README.md
26
26
  - Rakefile
27
+ - ext/ckmeans/extconf.rb
28
+ - ext/ckmeans/extensions.c
27
29
  - lib/ckmeans.rb
28
30
  - lib/ckmeans/clusterer.rb
29
31
  - lib/ckmeans/version.rb