ckmeans 0.1.2 → 1.0.0.rc
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +6 -19
- data/Rakefile +7 -1
- data/ext/ckmeans/extconf.rb +5 -0
- data/ext/ckmeans/extensions.c +704 -0
- data/lib/ckmeans/clusterer.rb +122 -110
- data/lib/ckmeans/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0c032b968f4f996b50ea2d63b1624eccd2d6dd4ff4922042143ada4200664216
|
|
4
|
+
data.tar.gz: c8b220a8ebe08b2aebc78cb7c5e347e4d734627aac6ad61da298dc22ee30e884
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0fa159e921f89ba73ca478476903ae3a07893ea8cdff2b86ec25605f22f2864c75eb8763c5daf50657b194ed216dec69cfe99e405035f2b83b540e3e5c5c2599
|
|
7
|
+
data.tar.gz: 99f4ea7b2db58fb076325a4b3cd3b5866e3ad35d48c3d89521555b22e14720ac37e422a3ad258c402494cc6c2896a9a48bab4c126d4660912f29caa4a393a28b
|
data/.rubocop_todo.yml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2025-
|
|
3
|
+
# on 2025-04-17 07:09:28 UTC using RuboCop version 1.75.1.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
@@ -11,48 +11,35 @@
|
|
|
11
11
|
Metrics/AbcSize:
|
|
12
12
|
Max: 95
|
|
13
13
|
|
|
14
|
-
# Offense count:
|
|
14
|
+
# Offense count: 2
|
|
15
15
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
16
16
|
# AllowedMethods: refine
|
|
17
17
|
Metrics/BlockLength:
|
|
18
|
-
Max:
|
|
18
|
+
Max: 41
|
|
19
19
|
|
|
20
20
|
# Offense count: 3
|
|
21
21
|
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
|
22
22
|
Metrics/CyclomaticComplexity:
|
|
23
23
|
Max: 10
|
|
24
24
|
|
|
25
|
-
# Offense count:
|
|
25
|
+
# Offense count: 6
|
|
26
26
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
27
27
|
Metrics/MethodLength:
|
|
28
28
|
Max: 48
|
|
29
29
|
|
|
30
|
-
# Offense count: 5
|
|
31
|
-
# Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
|
|
32
|
-
Metrics/ParameterLists:
|
|
33
|
-
Max: 9
|
|
34
|
-
|
|
35
30
|
# Offense count: 3
|
|
36
31
|
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
|
37
32
|
Metrics/PerceivedComplexity:
|
|
38
33
|
Max: 13
|
|
39
34
|
|
|
40
|
-
# Offense count:
|
|
35
|
+
# Offense count: 12
|
|
41
36
|
# Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
|
|
42
37
|
# AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
|
|
43
38
|
Naming/MethodParameterName:
|
|
44
39
|
Exclude:
|
|
45
40
|
- 'lib/ckmeans/clusterer.rb'
|
|
46
41
|
|
|
47
|
-
# Offense count:
|
|
48
|
-
# Configuration parameters: EnforcedStyle, CheckMethodNames, CheckSymbols, AllowedIdentifiers, AllowedPatterns.
|
|
49
|
-
# SupportedStyles: snake_case, normalcase, non_integer
|
|
50
|
-
# AllowedIdentifiers: TLS1_1, TLS1_2, capture3, iso8601, rfc1123_date, rfc822, rfc2822, rfc3339, x86_64
|
|
51
|
-
Naming/VariableNumber:
|
|
52
|
-
Exclude:
|
|
53
|
-
- 'lib/ckmeans/clusterer.rb'
|
|
54
|
-
|
|
55
|
-
# Offense count: 6
|
|
42
|
+
# Offense count: 5
|
|
56
43
|
# This cop supports unsafe autocorrection (--autocorrect-all).
|
|
57
44
|
# Configuration parameters: EnforcedStyle, AllowedMethods, AllowedPatterns.
|
|
58
45
|
# SupportedStyles: predicate, comparison
|
data/Rakefile
CHANGED
|
@@ -2,6 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
require "bundler/gem_tasks"
|
|
4
4
|
require "rspec/core/rake_task"
|
|
5
|
+
require "rake/extensiontask"
|
|
6
|
+
|
|
7
|
+
Rake::ExtensionTask.new("extensions") do |ext|
|
|
8
|
+
ext.lib_dir = "lib/ckmeans"
|
|
9
|
+
ext.ext_dir = "ext/ckmeans"
|
|
10
|
+
end
|
|
5
11
|
|
|
6
12
|
RSpec::Core::RakeTask.new(:spec)
|
|
7
13
|
|
|
@@ -9,4 +15,4 @@ require "rubocop/rake_task"
|
|
|
9
15
|
|
|
10
16
|
RuboCop::RakeTask.new
|
|
11
17
|
|
|
12
|
-
task default: %i[spec rubocop]
|
|
18
|
+
task default: %i[compile spec rubocop]
|
|
@@ -0,0 +1,704 @@
|
|
|
1
|
+
#include <stdio.h>
|
|
2
|
+
#include <assert.h>
|
|
3
|
+
#include <math.h>
|
|
4
|
+
#include "ruby.h"
|
|
5
|
+
|
|
6
|
+
/* Linear (bump) allocator state: a single buffer handed out front to back. */
typedef struct Arena {
    uint32_t capacity; /* size of `buffer` in bytes */
    uint32_t offset;   /* number of bytes already handed out */
    uint8_t *buffer;   /* backing storage owned by the arena */
} Arena;
|
|
11
|
+
|
|
12
|
+
/* Dense matrix of long doubles; accessors address it as values[row * ncols + col]. */
typedef struct MatrixF {
    uint32_t ncols;      /* number of columns (row stride) */
    uint32_t nrows;      /* number of rows */
    long double *values; /* nrows * ncols cells */
} MatrixF;
|
|
17
|
+
|
|
18
|
+
/* Dense matrix of unsigned 32-bit integers; same layout as MatrixF. */
typedef struct MatrixI {
    uint32_t ncols;   /* number of columns (row stride) */
    uint32_t nrows;   /* number of rows */
    uint32_t *values; /* nrows * ncols cells */
} MatrixI;
|
|
23
|
+
|
|
24
|
+
/* Fixed-length array of long doubles together with its element count. */
typedef struct VectorF {
    uint32_t nvalues;    /* number of elements in `values` */
    long double *values; /* element storage */
} VectorF;
|
|
28
|
+
|
|
29
|
+
/* Array of unsigned 32-bit integers together with its (shrinkable) element count. */
typedef struct VectorI {
    uint32_t nvalues; /* number of elements currently considered valid */
    uint32_t *values; /* element storage */
} VectorI;
|
|
33
|
+
|
|
34
|
+
typedef struct State {
|
|
35
|
+
uint32_t xcount;
|
|
36
|
+
uint32_t kmin;
|
|
37
|
+
uint32_t kmax;
|
|
38
|
+
bool apply_deviation;
|
|
39
|
+
Arena *arena;
|
|
40
|
+
VectorF *xsorted;
|
|
41
|
+
MatrixF *cost;
|
|
42
|
+
MatrixI *splits;
|
|
43
|
+
VectorF *xsum;
|
|
44
|
+
VectorF *xsumsq;
|
|
45
|
+
} State;
|
|
46
|
+
|
|
47
|
+
/* Describes which columns of one cost-matrix row a routine should fill. */
typedef struct RowParams {
    uint32_t row;   /* row index in the cost/splits matrices */
    uint32_t imin;  /* first column to fill */
    uint32_t imax;  /* last column to fill (inclusive) */
    uint32_t istep; /* distance between consecutive columns */
} RowParams;
|
|
53
|
+
|
|
54
|
+
/* Mean and sample variance of one contiguous segment of the sorted input. */
typedef struct {
    long double mean;     /* arithmetic mean of the segment */
    long double variance; /* variance with an (n - 1) divisor; 0 for n <= 1 */
} SegmentStats;
|
|
58
|
+
|
|
59
|
+
VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
|
|
60
|
+
|
|
61
|
+
Arena *arena_create(uint32_t);
|
|
62
|
+
void *arena_alloc(Arena*, uint32_t);
|
|
63
|
+
void arena_rewind(Arena*);
|
|
64
|
+
void arena_destroy(Arena*);
|
|
65
|
+
|
|
66
|
+
MatrixF *matrix_create_f(Arena*, uint32_t, uint32_t);
|
|
67
|
+
MatrixI *matrix_create_i(Arena*, uint32_t, uint32_t);
|
|
68
|
+
void matrix_set_f(MatrixF*, uint32_t, uint32_t, long double value);
|
|
69
|
+
long double matrix_get_f(MatrixF*, uint32_t, uint32_t);
|
|
70
|
+
void matrix_inspect_f(MatrixF*);
|
|
71
|
+
void matrix_set_i(MatrixI*, uint32_t, uint32_t, uint32_t value);
|
|
72
|
+
uint32_t matrix_get_i(MatrixI*, uint32_t, uint32_t);
|
|
73
|
+
void matrix_inspect_i(MatrixI*);
|
|
74
|
+
|
|
75
|
+
VectorF *vector_create_f(Arena*, uint32_t);
|
|
76
|
+
void vector_set_f(VectorF*, uint32_t offset, long double value);
|
|
77
|
+
long double vector_get_f(VectorF*, uint32_t offset);
|
|
78
|
+
long double vector_get_diff_f(VectorF*, uint32_t, uint32_t);
|
|
79
|
+
void vector_inspect_f(VectorF*);
|
|
80
|
+
VectorI *vector_create_i(Arena*, uint32_t);
|
|
81
|
+
VectorI *vector_dup_i(VectorI*, Arena*);
|
|
82
|
+
void vector_set_i(VectorI*, uint32_t offset, uint32_t value);
|
|
83
|
+
uint32_t vector_get_i(VectorI*, uint32_t offset);
|
|
84
|
+
void vector_downsize_i(VectorI*, uint32_t);
|
|
85
|
+
void vector_inspect_i(VectorI*);
|
|
86
|
+
|
|
87
|
+
long double dissimilarity(uint32_t, uint32_t, VectorF*, VectorF*);
|
|
88
|
+
void fill_row(State, uint32_t, uint32_t, uint32_t);
|
|
89
|
+
void smawk(State, RowParams, VectorI*);
|
|
90
|
+
void find_min_from_candidates(State, RowParams, VectorI*);
|
|
91
|
+
VectorI *prune_candidates(State, RowParams, VectorI*);
|
|
92
|
+
void fill_even_positions(State, RowParams, VectorI*);
|
|
93
|
+
SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
|
|
94
|
+
VectorI *backtrack_sizes(State, uint32_t);
|
|
95
|
+
uint32_t find_koptimal(State);
|
|
96
|
+
|
|
97
|
+
void Init_extensions(void) {
|
|
98
|
+
VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
|
|
99
|
+
VALUE clusterer_class = rb_const_get(ckmeans_module, rb_intern("Clusterer"));
|
|
100
|
+
|
|
101
|
+
rb_define_private_method(clusterer_class, "sorted_group_sizes", rb_ckmeans_sorted_group_sizes, 0);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
/* Smallest arena ever created, in bytes (arena_create clamps up to this). */
#define ARENA_MIN_CAPACITY 1024
/* Multiplier applied to sizeof(int) * xcount * kmax when sizing the arena. */
#define ALLOCATION_FACTOR 20
/* M_PI is a POSIX/XSI extension, not ISO C: strict -std=c11 builds do not get it from <math.h>. */
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
/* 2 * pi, used in the Gaussian log-likelihood term. */
#define PIx2 (M_PI * 2.0)
|
|
107
|
+
|
|
108
|
+
/*
 * Ckmeans::Clusterer#sorted_group_sizes (private).
 *
 * Reads @xcount, @kmin, @kmax, @xsorted and @apply_bic_deviation from `self`,
 * fills the cost/splits matrices row by row, picks the cluster count via
 * find_koptimal and returns a Ruby Array with the size of each cluster, in
 * the order of the sorted input.
 *
 * Returns an empty Array when there is nothing to cluster and nil when the
 * working memory cannot be obtained (matching the original nil-on-failure
 * contract). Raises TypeError if @xsorted is not an Array of numerics.
 *
 * NOTE(review): assumes 1 <= kmin <= kmax <= xcount; only the zero cases are
 * guarded here — confirm against the Ruby initializer.
 */
VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
    VALUE rb_xcount = rb_ivar_get(self, rb_intern("@xcount"));
    VALUE rb_kmin = rb_ivar_get(self, rb_intern("@kmin"));
    VALUE rb_kmax = rb_ivar_get(self, rb_intern("@kmax"));
    VALUE rb_xsorted = rb_ivar_get(self, rb_intern("@xsorted"));
    VALUE rb_apply_bic_deviation = rb_ivar_get(self, rb_intern("@apply_bic_deviation"));
    uint32_t xcount = NUM2UINT(rb_xcount);
    uint32_t kmin = NUM2UINT(rb_kmin);
    uint32_t kmax = NUM2UINT(rb_kmax);
    bool apply_deviation = RTEST(rb_apply_bic_deviation);

    /* `xcount - 1` and `kmax - 1` below are unsigned and would wrap on zero. */
    if (xcount == 0 || kmax == 0) {
        return rb_ary_new();
    }

    /*
     * Convert every entry once up front: NUM2DBL raises on non-numeric input,
     * and raising after the arena exists would leak it.
     */
    Check_Type(rb_xsorted, T_ARRAY);
    for (uint32_t i = 0; i < xcount; i++) {
        (void) NUM2DBL(rb_ary_entry(rb_xsorted, i));
    }

    /* Size the arena in 64 bits; arena_create only accepts a 32-bit capacity. */
    const uint64_t cells = (uint64_t) xcount * kmax;
    const uint64_t bytes_per_cell = (uint64_t) sizeof(int) * ALLOCATION_FACTOR;
    if (cells > UINT32_MAX / bytes_per_cell) {
        return Qnil;
    }

    Arena *arena = arena_create((uint32_t) (cells * bytes_per_cell));

    if (arena == NULL) {
        return Qnil;
    }

    MatrixF *cost = matrix_create_f(arena, kmax, xcount);
    MatrixI *splits = matrix_create_i(arena, kmax, xcount);
    VectorF *xsorted = vector_create_f(arena, xcount);
    /* TODO: pack sums into one vector of pairs */
    VectorF *xsum = vector_create_f(arena, xcount);
    VectorF *xsumsq = vector_create_f(arena, xcount);

    if (cost == NULL || splits == NULL || xsorted == NULL || xsum == NULL || xsumsq == NULL) {
        arena_destroy(arena);
        return Qnil;
    }

    for (uint32_t i = 0; i < xcount; i++) {
        long double xi = NUM2DBL(rb_ary_entry(rb_xsorted, i));
        vector_set_f(xsorted, i, xi);
    }

    State state = {
        .arena = arena,
        .xcount = xcount,
        .kmin = kmin,
        .kmax = kmax,
        .apply_deviation = apply_deviation,
        .xsorted = xsorted,
        .cost = cost,
        .splits = splits,
        .xsum = xsum,
        .xsumsq = xsumsq
    };

    /* Prefix sums are taken relative to the middle element to keep magnitudes small. */
    long double shift = vector_get_f(xsorted, xcount / 2);
    long double diff_initial = vector_get_f(xsorted, 0) - shift;

    vector_set_f(xsum, 0, diff_initial);
    vector_set_f(xsumsq, 0, diff_initial * diff_initial);

    /* Row 0: a single cluster covering elements 0..i. */
    for (uint32_t i = 1; i < xcount; i++) {
        long double xi = vector_get_f(xsorted, i);
        long double xsum_prev = vector_get_f(xsum, i - 1);
        long double xsumsq_prev = vector_get_f(xsumsq, i - 1);
        long double diff = xi - shift;

        vector_set_f(xsum, i, xsum_prev + diff);
        vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
        matrix_set_f(cost, 0, i, dissimilarity(0, i, xsum, xsumsq));
        matrix_set_i(splits, 0, i, 0);
    }

    /* Remaining rows; the last row only needs its final column. */
    for (uint32_t q = 1; q <= kmax - 1; q++) {
        uint32_t imin = (q < kmax - 1) ? ((q > 1) ? q : 1) : xcount - 1;
        fill_row(state, q, imin, xcount - 1);
    }

    uint32_t koptimal = find_koptimal(state);

    VectorI *sizes = backtrack_sizes(state, koptimal);

    VALUE response = rb_ary_new2(sizes->nvalues);
    for (uint32_t i = 0; i < sizes->nvalues; i++) {
        /* Sizes are unsigned 32-bit; UINT2NUM avoids a signed reinterpretation where long is 32 bits. */
        VALUE size = UINT2NUM(vector_get_i(sizes, i));
        rb_ary_store(response, i, size);
    }

    arena_destroy(arena);

    return response;
}
|
|
193
|
+
|
|
194
|
+
/*
 * Scores every cluster count k in [kmin, kmax] and returns the one with the
 * highest BIC value; on ties the smaller k is kept.
 *
 * For each k the cluster sizes are recovered with backtrack_sizes, and each
 * cluster contributes a Gaussian log-likelihood term when its variance is
 * positive, or a uniform term over its bin width otherwise.
 */
uint32_t find_koptimal(State state)
{
    const uint32_t kmin = state.kmin;
    const uint32_t kmax = state.kmax;
    const uint32_t xcount = state.xcount;
    const uint32_t last_index = state.xcount - 1;
    VectorF *xs = state.xsorted;
    const long double xfirst = vector_get_f(xs, 0);
    const long double xlast = vector_get_f(xs, last_index);
    /* The mixing-proportion term is dropped when apply_deviation is set. */
    const long double adjustment = state.apply_deviation ? 0.0 : 1.0;
    uint32_t best_k = kmin;
    long double best_bic = 0.0;

    for (uint32_t k = kmin; k <= kmax; k++) {
        long double loglikelihood = 0.0;
        uint32_t lo = 0;
        VectorI *sizes = backtrack_sizes(state, k);

        for (uint32_t cluster = 0; cluster < k; cluster++) {
            const uint32_t npoints = vector_get_i(sizes, cluster);
            const uint32_t hi = lo + npoints - 1;
            const long double xleft = vector_get_f(xs, lo);
            const long double xright = vector_get_f(xs, hi);
            long double bin_left = xleft;
            long double bin_right = xright;

            /* Degenerate cluster: widen the bin to the midpoints towards its neighbours. */
            if (xleft == xright) {
                if (lo == 0) {
                    bin_left = xfirst;
                } else {
                    bin_left = (vector_get_f(xs, lo - 1) + xleft) / 2;
                }

                if (hi < last_index) {
                    bin_right = (xright + vector_get_f(xs, hi + 1)) / 2;
                } else {
                    bin_right = xlast;
                }
            }

            const long double bin_width = bin_right - bin_left;
            const SegmentStats stats = shifted_data_variance(xs, lo, hi);
            const long double mean = stats.mean;
            const long double variance = stats.variance;

            if (variance > 0) {
                for (uint32_t i = lo; i <= hi; i++) {
                    const long double xi = vector_get_f(xs, i);
                    loglikelihood += -(xi - mean) * (xi - mean) / (2.0 * variance);
                }
                loglikelihood += npoints * (
                    (log(npoints / (long double) xcount) * adjustment) -
                    (0.5 * log(PIx2 * variance))
                );
            } else {
                loglikelihood += npoints * log(1.0 / bin_width / xcount);
            }

            lo = hi + 1;
        }

        const long double bic = (2.0 * loglikelihood) - (((3 * k) - 1) * log((long double) xcount));

        /* The first k always seeds the maximum; later ones must be strictly better. */
        if (k == kmin || bic > best_bic) {
            best_bic = bic;
            best_k = k;
        }
    }

    return best_k;
}
|
|
263
|
+
|
|
264
|
+
/*
 * Walks the splits matrix from the last cluster back to the first and
 * returns a freshly allocated vector holding the size of each of the `k`
 * clusters (index 0 is the leftmost cluster).
 */
VectorI *backtrack_sizes(State state, uint32_t k)
{
    MatrixI *splits = state.splits;
    VectorI *sizes = vector_create_i(state.arena, k);
    uint32_t hi = state.xcount - 1;
    uint32_t cluster = k - 1;

    /* Clusters k-1 .. 1: each one ends at `hi` and starts at the recorded split. */
    while (cluster > 0) {
        const uint32_t lo = matrix_get_i(splits, cluster, hi);

        vector_set_i(sizes, cluster, hi - lo + 1);
        hi = lo - 1;
        cluster--;
    }

    /* Cluster 0 is handled outside the loop so the index never goes below zero. */
    const uint32_t lo = matrix_get_i(splits, 0, hi);
    vector_set_i(sizes, 0, hi - lo + 1);

    return sizes;
}
|
|
282
|
+
|
|
283
|
+
/*
 * Mean and sample variance of xsorted[left..right] (inclusive).
 *
 * Values are shifted by the segment's middle element before accumulating.
 * Returns zeros for an empty range (right < left); the variance stays 0 for
 * a single element.
 */
SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t right)
{
    SegmentStats stats = { .mean = 0.0, .variance = 0.0 };

    if (right < left) {
        return stats;
    }

    const uint32_t count = right - left + 1;
    const long double pivot = vector_get_f(xsorted, (left + right) / 2);
    long double acc = 0.0;
    long double acc_sq = 0.0;

    for (uint32_t i = left; i <= right; i++) {
        const long double delta = vector_get_f(xsorted, i) - pivot;

        acc += delta;
        acc_sq += delta * delta;
    }

    stats.mean = (acc / count) + pivot;
    if (count > 1) {
        stats.variance = (acc_sq - (acc * acc / count)) / (count - 1);
    }

    return stats;
}
|
|
308
|
+
|
|
309
|
+
/*
 * Fills row `q` of the cost/splits matrices for columns imin..imax.
 * The initial split candidates are the consecutive indices q..imax.
 */
void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax) {
    const uint32_t ncandidates = imax - q + 1;
    VectorI *candidates = vector_create_i(state.arena, ncandidates);
    uint32_t value = q;

    for (uint32_t slot = 0; slot < ncandidates; slot++, value++) {
        vector_set_i(candidates, slot, value);
    }

    smawk(state, (RowParams){ .row = q, .imin = imin, .imax = imax, .istep = 1 }, candidates);
}
|
|
318
|
+
|
|
319
|
+
/*
 * Recursive driver for one row: when a single column is left it is solved
 * directly; otherwise the candidates are pruned, every second column is
 * solved recursively, and the remaining columns are filled in between.
 */
void smawk(State state, RowParams rparams, VectorI *split_candidates) {
    /* imin/imax are unsigned, so "difference <= 0" means exactly one column. */
    if (rparams.imax == rparams.imin) {
        find_min_from_candidates(state, rparams, split_candidates);
        return;
    }

    VectorI *odd_candidates = prune_candidates(state, rparams, split_candidates);
    const uint32_t stride = rparams.istep * 2;

    RowParams odd = rparams;
    odd.imin = rparams.imin + rparams.istep;
    /* Last column reachable from odd.imin in steps of `stride` without passing imax. */
    odd.imax = odd.imin + ((rparams.imax - odd.imin) / stride * stride);
    odd.istep = stride;

    smawk(state, odd, odd_candidates);
    fill_even_positions(state, rparams, split_candidates);
}
|
|
338
|
+
|
|
339
|
+
/*
 * Fills columns imin, imin + 2*istep, ... of `row`, i.e. the ones the
 * recursive call in smawk skipped. For each such column the search is
 * limited to candidates between the split of the previously handled column
 * and the split already stored for the next column (i + istep).
 */
void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
{
    const uint32_t row = rparams.row;
    const uint32_t imax = rparams.imax;
    const uint32_t istep = rparams.istep;
    const uint32_t stride = istep * 2;
    const uint32_t ncandidates = split_candidates->nvalues;
    VectorF *xsum = state.xsum;
    VectorF *xsumsq = state.xsumsq;
    MatrixF *cost = state.cost;
    MatrixI *splits = state.splits;
    uint32_t lower = vector_get_i(split_candidates, 0);
    uint32_t r = 0;

    for (uint32_t i = rparams.imin; i <= imax; i += stride) {
        /* Skip candidates below the lower bound carried over from the previous column. */
        while (vector_get_i(split_candidates, r) < lower) {
            r++;
        }

        /* Seed the cell with the first admissible candidate. */
        const uint32_t first = vector_get_i(split_candidates, r);
        const long double seed =
            matrix_get_f(cost, row - 1, first - 1) + dissimilarity(first, i, xsum, xsumsq);

        matrix_set_f(cost, row, i, seed);
        matrix_set_i(splits, row, i, first);

        /* Upper bound: the split of the next column, or the last candidate at the right edge. */
        uint32_t upper;
        if ((i + istep) <= imax) {
            upper = matrix_get_i(splits, row, i + istep);
        } else {
            upper = vector_get_i(split_candidates, ncandidates - 1);
        }

        const uint32_t jmax = upper < i ? upper : i;
        const long double tail_cost = dissimilarity(jmax, i, xsum, xsumsq);

        for (++r; r < ncandidates && vector_get_i(split_candidates, r) <= jmax; r++) {
            const uint32_t j = vector_get_i(split_candidates, r);

            if (j > i) {
                break;
            }
            if (j < matrix_get_i(splits, row - 1, i)) {
                continue;
            }

            const long double base = matrix_get_f(cost, row - 1, j - 1);
            const long double candidate = base + dissimilarity(j, i, xsum, xsumsq);
            const long double current = matrix_get_f(cost, row, i);

            if (candidate <= current) {
                matrix_set_f(cost, row, i, candidate);
                matrix_set_i(splits, row, i, j);
            } else if (base + tail_cost > current) {
                /* Even the cheapest possible tail cannot beat the current best. */
                break;
            }
        }

        /* Step back so the next column re-examines the candidate the scan stopped on. */
        r--;
        lower = upper;
    }
}
|
|
394
|
+
|
|
395
|
+
/*
 * Direct solver used at the bottom of the smawk recursion: for every column
 * i in imin..imax (step istep) it scans the split candidates and stores the
 * cheapest cost and its split in row `row`.
 *
 * The scan for each column starts from the candidate index that last
 * improved a previous column.
 */
void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
{
    const uint32_t row = rparams.row;
    const uint32_t imin = rparams.imin;
    const uint32_t imax = rparams.imax;
    const uint32_t istep = rparams.istep;
    MatrixF *const cost = state.cost;
    MatrixI *const splits = state.splits;

    uint32_t optimal_split_idx_prev = 0;

    for (uint32_t i = imin; i <= imax; i += istep)
    {
        const uint32_t optimal_split_idx = optimal_split_idx_prev;
        const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
        /* Costs are long double; storing this in an integer would truncate the seed value. */
        const long double cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
        const long double added_cost = dissimilarity(optimal_split, i, state.xsum, state.xsumsq);

        matrix_set_f(cost, row, i, cost_prev + added_cost);
        matrix_set_i(splits, row, i, optimal_split);

        for (uint32_t r = optimal_split_idx + 1; r < split_candidates->nvalues; r++)
        {
            uint32_t split = vector_get_i(split_candidates, r);

            /* Splits left of the previous row's split for this column are skipped. */
            if (split < matrix_get_i(splits, row - 1, i)) continue;
            if (split > i) break;

            long double split_cost =
                matrix_get_f(cost, row - 1, split - 1) + dissimilarity(split, i, state.xsum, state.xsumsq);

            if (split_cost > matrix_get_f(cost, row, i)) continue;

            matrix_set_f(cost, row, i, split_cost);
            matrix_set_i(splits, row, i, split);
            optimal_split_idx_prev = r;
        }
    }
}
|
|
434
|
+
|
|
435
|
+
/*
 * Shrinks the candidate list to at most as many entries as there are
 * columns in imin..imax (step istep). Returns the input vector untouched
 * when it is already small enough, otherwise a pruned copy allocated from
 * the arena. Presumably the REDUCE step of SMAWK — the function is only
 * called from smawk().
 */
VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
{
    const uint32_t ncolumns = ((rparams.imax - rparams.imin) / rparams.istep) + 1;
    uint32_t remaining = split_candidates->nvalues;

    if (ncolumns >= remaining) {
        return split_candidates;
    }

    const uint32_t prev_row = rparams.row - 1;
    /* `left` starts one before index 0; unsigned wrap-around is intentional (left + 1 == 0). */
    uint32_t left = -1;
    uint32_t right = 0;
    VectorI *pruned = vector_dup_i(split_candidates, state.arena);

    while (remaining > ncolumns)
    {
        const uint32_t p = left + 1;
        const uint32_t i = rparams.imin + p * rparams.istep;
        const uint32_t j = vector_get_i(pruned, right);
        const uint32_t jnext = vector_get_i(pruned, right + 1);
        const long double sl =
            matrix_get_f(state.cost, prev_row, j - 1) + dissimilarity(j, i, state.xsum, state.xsumsq);
        const long double snext =
            matrix_get_f(state.cost, prev_row, jnext - 1) + dissimilarity(jnext, i, state.xsum, state.xsumsq);

        if ((sl < snext) && (p < ncolumns - 1)) {
            /* Current candidate wins and there is room: keep it and advance. */
            left++;
            right++;
            vector_set_i(pruned, left, j);
        } else if ((sl < snext) && (p == ncolumns - 1)) {
            /* Current candidate wins on the last column: drop the next candidate. */
            right++;
            remaining--;
            vector_set_i(pruned, right, j);
        } else {
            /* Current candidate loses: drop it and step back when possible. */
            if (p > 0) {
                /* TODO: extract `vector_setcpy_T` */
                vector_set_i(pruned, right, vector_get_i(pruned, left));
                left--;
            } else {
                right++;
            }

            remaining--;
        }
    }

    /* Compact the survivors that still sit at `right` and beyond. */
    for (uint32_t dst = left + 1; dst < remaining; dst++) {
        /* TODO: extract `vector_setcpy_T` */
        vector_set_i(pruned, dst, vector_get_i(pruned, right));
        right++;
    }

    vector_downsize_i(pruned, remaining);

    return pruned;
}
|
|
487
|
+
|
|
488
|
+
/*
 * Within-segment sum of squared deviations for elements j..i, computed from
 * the prefix sums `xsum` and `xsumsq`. Returns 0 when j >= i, and never
 * returns a negative value.
 */
long double dissimilarity(uint32_t j, uint32_t i, VectorF *xsum, VectorF *xsumsq) {
    if (j >= i) {
        return 0.0;
    }

    long double segment_sum;
    long double segment_sumsq;
    uint32_t segment_size;

    if (j == 0) {
        /* The segment starts at the origin, so the prefix sums are the segment sums. */
        segment_sum = vector_get_f(xsum, i);
        segment_sumsq = vector_get_f(xsumsq, i);
        segment_size = i + 1;
    } else {
        /* TODO: looks more like `segment_delta` */
        segment_sum = vector_get_diff_f(xsum, i, j - 1);
        segment_sumsq = vector_get_diff_f(xsumsq, i, j - 1);
        segment_size = i - j + 1;
    }

    const long double sji = segment_sumsq - (segment_sum * segment_sum / segment_size);

    return (sji > 0) ? sji : 0.0;
}
|
|
505
|
+
|
|
506
|
+
/*
 * Allocates a VectorF with room for `nvalues` long doubles from `arena`.
 * Returns NULL when the byte size does not fit arena_alloc's 32-bit size
 * parameter or when the arena cannot satisfy either allocation.
 */
VectorF *vector_create_f(Arena *arena, uint32_t nvalues) {
    /* Reject sizes that would be silently truncated when passed to arena_alloc. */
    if (nvalues > UINT32_MAX / sizeof(long double)) {
        return NULL;
    }

    VectorF *v = arena_alloc(arena, sizeof(*v));
    if (v == NULL) {
        return NULL;
    }

    v->values = arena_alloc(arena, (uint32_t) (sizeof(*(v->values)) * nvalues));
    if (v->values == NULL) {
        return NULL;
    }
    v->nvalues = nvalues;

    return v;
}
|
|
515
|
+
|
|
516
|
+
/*
 * Allocates a VectorI with room for `nvalues` uint32_t elements from `arena`.
 * Returns NULL when the byte size does not fit arena_alloc's 32-bit size
 * parameter or when the arena cannot satisfy either allocation.
 */
VectorI *vector_create_i(Arena *arena, uint32_t nvalues) {
    /* Reject sizes that would be silently truncated when passed to arena_alloc. */
    if (nvalues > UINT32_MAX / sizeof(uint32_t)) {
        return NULL;
    }

    VectorI *v = arena_alloc(arena, sizeof(*v));
    if (v == NULL) {
        return NULL;
    }

    v->values = arena_alloc(arena, (uint32_t) (sizeof(*(v->values)) * nvalues));
    if (v->values == NULL) {
        return NULL;
    }
    v->nvalues = nvalues;

    return v;
}
|
|
525
|
+
|
|
526
|
+
/* Returns a copy of `v` (same length, same elements) allocated from `arena`. */
VectorI *vector_dup_i(VectorI *v, Arena *arena)
{
    const uint32_t count = v->nvalues;
    VectorI *copy = vector_create_i(arena, count);
    uint32_t i = 0;

    /* TODO: use one memcpy call */
    while (i < count) {
        vector_set_i(copy, i, vector_get_i(v, i));
        i++;
    }

    return copy;
}
|
|
537
|
+
|
|
538
|
+
/* Stores `value` at index `offset`; the index is checked by assert only. */
void vector_set_f(VectorF *v, uint32_t offset, long double value) {
    assert(offset < v->nvalues && "[vector_set_f] element index should be less than nvalues");

    v->values[offset] = value;
}
|
|
543
|
+
|
|
544
|
+
/* Stores `value` at index `offset`; the index is checked by assert only. */
void vector_set_i(VectorI *v, uint32_t offset, uint32_t value) {
    assert(offset < v->nvalues && "[vector_set_i] element index should be less than nvalues");

    v->values[offset] = value;
}
|
|
549
|
+
|
|
550
|
+
/* Returns the element at index `offset`; the index is checked by assert only. */
uint32_t vector_get_i(VectorI *v, uint32_t offset) {
    assert(offset < v->nvalues && "[vector_get_i] element index should be less than nvalues");

    return v->values[offset];
}
|
|
555
|
+
|
|
556
|
+
/*
 * Shrinks the logical length of `v` to `new_size`. The storage is not
 * touched, so the new size must not exceed the current one; like the other
 * accessors this is checked by assert only.
 */
void vector_downsize_i(VectorI *v, uint32_t new_size) {
    assert(new_size <= v->nvalues && "[vector_downsize_i] new size should not exceed nvalues");

    v->nvalues = new_size;
}
|
|
559
|
+
|
|
560
|
+
/*
 * Debug helper: prints the elements on one line, separated by ", ".
 * An empty vector prints just a newline (the previous `nvalues - 1` bound
 * wrapped around for empty vectors).
 */
void vector_inspect_i(VectorI *v) {
    for (uint32_t i = 0; i < v->nvalues; i++) {
        if (i > 0) {
            printf(", ");
        }
        printf("%u", vector_get_i(v, i));
    }
    printf("\n");
}
|
|
565
|
+
|
|
566
|
+
/* Returns the element at index `offset`; the index is checked by assert only. */
long double vector_get_f(VectorF *v, uint32_t offset) {
    assert(offset < v->nvalues && "[vector_get_f] element index should be less than nvalues");

    return v->values[offset];
}
|
|
571
|
+
|
|
572
|
+
/* Returns values[i] - values[j]; both indices are checked by assert only. */
long double vector_get_diff_f(VectorF *v, uint32_t i, uint32_t j) {
    assert(i < v->nvalues && "[vector_get_diff_f] i should be less than nvalues");
    assert(j < v->nvalues && "[vector_get_diff_f] j should be less than nvalues");

    const long double minuend = v->values[i];
    const long double subtrahend = v->values[j];

    return minuend - subtrahend;
}
|
|
578
|
+
|
|
579
|
+
/*
 * Debug helper: prints the elements on one line, separated by ", ".
 * An empty vector prints just a newline (the previous `nvalues - 1` bound
 * wrapped around for empty vectors).
 */
void vector_inspect_f(VectorF *v) {
    for (uint32_t i = 0; i < v->nvalues; i++) {
        if (i > 0) {
            printf(", ");
        }
        printf("%Lf", vector_get_f(v, i));
    }
    printf("\n");
}
|
|
584
|
+
|
|
585
|
+
/*
 * Allocates an nrows x ncols MatrixF from `arena`.
 * Returns NULL when the byte size does not fit arena_alloc's 32-bit size
 * parameter or when the arena cannot satisfy either allocation.
 */
MatrixF *matrix_create_f(Arena *arena, uint32_t nrows, uint32_t ncols) {
    /* Count cells in 64 bits so the product cannot wrap before it is checked. */
    const uint64_t cells = (uint64_t) nrows * ncols;
    if (cells > UINT32_MAX / sizeof(long double)) {
        return NULL;
    }

    MatrixF *m = arena_alloc(arena, sizeof(*m));
    if (m == NULL) {
        return NULL;
    }

    m->values = arena_alloc(arena, (uint32_t) (cells * sizeof(*(m->values))));
    if (m->values == NULL) {
        return NULL;
    }
    m->ncols = ncols;
    m->nrows = nrows;

    return m;
}
|
|
595
|
+
|
|
596
|
+
/*
 * Allocates an nrows x ncols MatrixI from `arena`.
 * Returns NULL when the byte size does not fit arena_alloc's 32-bit size
 * parameter or when the arena cannot satisfy either allocation.
 */
MatrixI *matrix_create_i(Arena *arena, uint32_t nrows, uint32_t ncols) {
    /* Count cells in 64 bits so the product cannot wrap before it is checked. */
    const uint64_t cells = (uint64_t) nrows * ncols;
    if (cells > UINT32_MAX / sizeof(uint32_t)) {
        return NULL;
    }

    MatrixI *m = arena_alloc(arena, sizeof(*m));
    if (m == NULL) {
        return NULL;
    }

    m->values = arena_alloc(arena, (uint32_t) (cells * sizeof(*(m->values))));
    if (m->values == NULL) {
        return NULL;
    }
    m->ncols = ncols;
    m->nrows = nrows;

    return m;
}
|
|
606
|
+
|
|
607
|
+
/* Stores `value` at row `i`, column `j`; both indices are checked by assert only. */
void matrix_set_f(MatrixF *m, uint32_t i, uint32_t j, long double value) {
    assert(i < m->nrows && "[matrix_set_f] row offset should be less than nrows");
    /* The struct field is `ncols`; the previous `cols` only compiled with NDEBUG defined. */
    assert(j < m->ncols && "[matrix_set_f] col offset should be less than ncols");

    uint32_t offset = i * m->ncols + j;
    m->values[offset] = value;
}
|
|
614
|
+
|
|
615
|
+
/* Returns the value at row `i`, column `j`; both indices are checked by assert only. */
long double matrix_get_f(MatrixF *m, uint32_t i, uint32_t j) {
    assert(i < m->nrows && "[matrix_get_f] row offset should be less than nrows");
    /* The struct field is `ncols`; the previous `cols` only compiled with NDEBUG defined. */
    assert(j < m->ncols && "[matrix_get_f] col offset should be less than ncols");

    uint32_t offset = i * m->ncols + j;
    return m->values[offset];
}
|
|
622
|
+
|
|
623
|
+
/*
 * Debug helper: prints one line per row, cells separated by ", ".
 * A matrix with zero columns prints one empty line per row (the previous
 * `ncols - 1` bound wrapped around in that case).
 */
void matrix_inspect_f(MatrixF *m) {
    for (uint32_t i = 0; i < m->nrows; i++) {
        for (uint32_t j = 0; j < m->ncols; j++) {
            if (j > 0) {
                printf(", ");
            }
            printf("%Lf", matrix_get_f(m, i, j));
        }
        printf("\n");
    }
}
|
|
633
|
+
|
|
634
|
+
/*
 * Debug helper: prints one line per row, cells separated by ", ".
 * A matrix with zero columns prints one empty line per row (the previous
 * `ncols - 1` bound wrapped around in that case).
 */
void matrix_inspect_i(MatrixI *m) {
    for (uint32_t i = 0; i < m->nrows; i++) {
        for (uint32_t j = 0; j < m->ncols; j++) {
            if (j > 0) {
                printf(", ");
            }
            printf("%u", matrix_get_i(m, i, j));
        }
        printf("\n");
    }
}
|
|
641
|
+
|
|
642
|
+
/* Stores `value` at row `i`, column `j`; both indices are checked by assert only. */
void matrix_set_i(MatrixI *m, uint32_t i, uint32_t j, uint32_t value) {
    assert(i < m->nrows && "[matrix_set_i] row offset should be less than nrows");
    /* The struct field is `ncols`; the previous `cols` only compiled with NDEBUG defined. */
    assert(j < m->ncols && "[matrix_set_i] col offset should be less than ncols");

    uint32_t offset = i * m->ncols + j;
    m->values[offset] = value;
}
|
|
649
|
+
|
|
650
|
+
/* Returns the value at row `i`, column `j`; both indices are checked by assert only. */
uint32_t matrix_get_i(MatrixI *m, uint32_t i, uint32_t j) {
    assert(i < m->nrows && "[matrix_get_i] row offset should be less than nrows");
    /* The struct field is `ncols`; the previous `cols` only compiled with NDEBUG defined. */
    assert(j < m->ncols && "[matrix_get_i] col offset should be less than ncols");

    uint32_t offset = i * m->ncols + j;
    return m->values[offset];
}
|
|
657
|
+
|
|
658
|
+
Arena *arena_create(uint32_t capacity) {
|
|
659
|
+
if (capacity < ARENA_MIN_CAPACITY) {
|
|
660
|
+
capacity = ARENA_MIN_CAPACITY;
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
Arena *arena;
|
|
664
|
+
|
|
665
|
+
arena = malloc(sizeof(*arena));
|
|
666
|
+
if (!arena) {
|
|
667
|
+
printf("Failed to allocate arena\n");
|
|
668
|
+
return NULL;
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
arena->buffer = calloc(1, capacity);
|
|
672
|
+
if (!arena->buffer) {
|
|
673
|
+
printf("Failed to allocate arena\n");
|
|
674
|
+
free(arena);
|
|
675
|
+
return NULL;
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
arena->capacity = capacity;
|
|
679
|
+
arena->offset = 0;
|
|
680
|
+
|
|
681
|
+
printf("[Arena Created] Capacity: %u, offset: %u\n", arena->capacity, arena->offset);
|
|
682
|
+
|
|
683
|
+
return arena;
|
|
684
|
+
}
|
|
685
|
+
|
|
686
|
+
/*
 * Hands out `size` bytes from the arena, rounded up to a multiple of 16 so
 * that, with the calloc'ed buffer as base, every block is suitably aligned
 * for the long double arrays stored in it (8-byte rounding is not enough on
 * ABIs where long double needs 16-byte alignment).
 *
 * Returns NULL when `arena` is NULL, when rounding would overflow, or when
 * the remaining capacity is too small; the arena is left unchanged then.
 */
void *arena_alloc(Arena *arena, uint32_t size) {
    enum { ARENA_ALIGNMENT = 16 };

    if (arena == NULL || size > UINT32_MAX - (ARENA_ALIGNMENT - 1)) {
        return NULL;
    }

    const uint32_t aligned = (size + (ARENA_ALIGNMENT - 1)) & ~(uint32_t) (ARENA_ALIGNMENT - 1);

    /* Compare against the remaining space: `offset + size` could wrap around. */
    if (aligned > arena->capacity - arena->offset) {
        fprintf(stderr, "Arena Out Of Memory\n");
        return NULL;
    }

    void *ptr = arena->buffer + arena->offset;
    arena->offset += aligned;

    return ptr;
}
|
|
699
|
+
|
|
700
|
+
/*
 * Release an arena: its buffer first, then the arena header itself.
 *
 * Passing NULL is a no-op (mirroring free()), so cleanup paths can call this
 * unconditionally even when arena_create() failed. Every pointer previously
 * returned by arena_alloc() for this arena is invalid afterwards.
 */
void arena_destroy(Arena *arena) {
    if (!arena) {
        return;
    }

    printf("[Arena Destroy] Capacity: %u, offset: %u, left: %u\n", arena->capacity, arena->offset, arena->capacity - arena->offset);
    free(arena->buffer);
    free(arena);
}
|
data/lib/ckmeans/clusterer.rb
CHANGED
|
@@ -2,8 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
module Ckmeans
|
|
4
4
|
class Clusterer # rubocop:disable Style/Documentation, Metrics/ClassLength
|
|
5
|
-
attr_reader :xcount, :xsorted, :kmin, :kmax, :smat, :jmat, :kestimate
|
|
6
|
-
|
|
7
5
|
PI_DOUBLE = Math::PI * 2
|
|
8
6
|
|
|
9
7
|
def initialize(entries, kmin, kmax = kmin, kestimate = :regular)
|
|
@@ -12,85 +10,94 @@ module Ckmeans
|
|
|
12
10
|
raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
|
|
13
11
|
raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
|
|
14
12
|
|
|
15
|
-
@kmin
|
|
16
|
-
@unique_xcount
|
|
17
|
-
@kmax
|
|
18
|
-
@
|
|
19
|
-
@
|
|
13
|
+
@kmin = kmin
|
|
14
|
+
@unique_xcount = entries.uniq.size
|
|
15
|
+
@kmax = [@unique_xcount, kmax].min
|
|
16
|
+
@xsorted_original = entries.sort
|
|
17
|
+
@xsorted = @xsorted_original.map(&:to_f)
|
|
18
|
+
@apply_bic_deviation = kestimate == :sensitive
|
|
20
19
|
end
|
|
21
20
|
|
|
22
21
|
def clusters
|
|
23
22
|
@clusters ||=
|
|
24
23
|
if @unique_xcount <= 1
|
|
25
|
-
[
|
|
24
|
+
[@xsorted_original]
|
|
26
25
|
else
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
26
|
+
sorted_group_sizes.each_with_object([]) do |size, groups|
|
|
27
|
+
groups << @xsorted_original.shift(size)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
=begin # rubocop:disable Style/BlockComments
|
|
31
|
+
@cost = Array.new(kmax) { Array.new(xcount) { 0.0 } }
|
|
32
|
+
@splits = Array.new(kmax) { Array.new(xcount) { 0 } }
|
|
33
|
+
@xsum = Array.new(xcount)
|
|
34
|
+
@xsumsq = Array.new(xcount)
|
|
35
|
+
|
|
36
|
+
shift = xsorted[xcount / 2]
|
|
37
|
+
xsum[0] = xsorted[0].to_f - shift
|
|
36
38
|
xsumsq[0] = xsum[0]**2
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
39
|
+
|
|
40
|
+
1.upto(xcount - 1) do |i|
|
|
41
|
+
xf = xsorted[i].to_f
|
|
42
|
+
xsum[i] = xsum[i - 1] + xf - shift
|
|
43
|
+
xsumsq[i] = xsumsq[i - 1] + ((xf - shift) * (xf - shift))
|
|
44
|
+
cost[0][i] = dissim(0, i)
|
|
45
|
+
splits[0][i] = 0
|
|
42
46
|
end
|
|
43
47
|
|
|
44
|
-
|
|
45
|
-
1.upto(
|
|
46
|
-
imin = q <
|
|
47
|
-
fill_row(q, imin,
|
|
48
|
+
kmax_idx = kmax - 1
|
|
49
|
+
1.upto(kmax_idx) do |q|
|
|
50
|
+
imin = q < kmax_idx ? [1, q].max : xcount - 1
|
|
51
|
+
fill_row(q, imin, xcount - 1)
|
|
48
52
|
end
|
|
49
53
|
|
|
50
54
|
kopt = koptimal
|
|
51
55
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
56
|
+
puts "RB COST\n", cost.map(&:inspect)
|
|
57
|
+
puts "RB SPLITS\n", splits.map(&:inspect)
|
|
58
|
+
puts "RB K OPTIMAL: #{kopt}"
|
|
59
|
+
|
|
60
|
+
backtrack(kopt).each_with_object(Array.new(kopt)) do |(q, left, right), res|
|
|
61
|
+
res[q] = xsorted[left..right]
|
|
55
62
|
end
|
|
56
|
-
|
|
63
|
+
=end
|
|
57
64
|
end
|
|
58
65
|
end
|
|
59
66
|
|
|
60
67
|
private
|
|
61
68
|
|
|
62
|
-
|
|
63
|
-
kopt = kmin
|
|
64
|
-
n = xcount
|
|
65
|
-
max_bic = 0.0
|
|
69
|
+
attr_reader :cost, :splits, :xsum, :xsumsq, :xcount, :xsorted, :kmin, :kmax
|
|
66
70
|
|
|
67
|
-
|
|
68
|
-
|
|
71
|
+
def koptimal # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
72
|
+
kopt = kmin
|
|
73
|
+
n = xcount
|
|
74
|
+
max_bic = 0.0
|
|
75
|
+
adjustment = kestimate == :sensitive ? 0.0 : 1.0 # Deviation from BIC formula to favor smaller clusters
|
|
69
76
|
|
|
70
77
|
kmin.upto(kmax) do |k|
|
|
71
|
-
sizes = Array.new(k)
|
|
72
|
-
|
|
73
|
-
index_left
|
|
74
|
-
index_right
|
|
78
|
+
sizes = backtrack(k).each_with_object(Array.new(k)) { |(q, left, right), sz| sz[q] = right - left + 1 }
|
|
79
|
+
|
|
80
|
+
index_left = 0
|
|
81
|
+
index_right = nil
|
|
75
82
|
loglikelihood = 0.0
|
|
76
|
-
bin_left
|
|
77
|
-
bin_right
|
|
83
|
+
bin_left = nil
|
|
84
|
+
bin_right = nil
|
|
78
85
|
|
|
79
86
|
k.times do |kb|
|
|
80
87
|
num_points_in_bin = sizes[kb]
|
|
81
88
|
index_right = index_left + num_points_in_bin - 1
|
|
82
89
|
|
|
83
90
|
if xsorted[index_left] < xsorted[index_right]
|
|
84
|
-
bin_left
|
|
91
|
+
bin_left = xsorted[index_left]
|
|
85
92
|
bin_right = xsorted[index_right]
|
|
86
93
|
elsif xsorted[index_left] == xsorted[index_right]
|
|
87
|
-
bin_left
|
|
94
|
+
bin_left = index_left == 0 ? xsorted[0] : (xsorted[index_left - 1] + xsorted[index_left]) / 2.0
|
|
88
95
|
bin_right = index_right < n - 1 ? (xsorted[index_right] + xsorted[index_right + 1]) / 2.0 : xsorted[n - 1]
|
|
89
96
|
else
|
|
90
97
|
raise "ERROR: binLeft > binRight"
|
|
91
98
|
end
|
|
92
99
|
|
|
93
|
-
bin_width = bin_right - bin_left
|
|
100
|
+
bin_width = bin_right.to_f - bin_left
|
|
94
101
|
|
|
95
102
|
mean, variance = shifted_data_variance(index_left, index_right)
|
|
96
103
|
|
|
@@ -112,10 +119,10 @@ module Ckmeans
|
|
|
112
119
|
|
|
113
120
|
if k == kmin
|
|
114
121
|
max_bic = bic
|
|
115
|
-
kopt
|
|
122
|
+
kopt = kmin
|
|
116
123
|
elsif bic > max_bic
|
|
117
124
|
max_bic = bic
|
|
118
|
-
kopt
|
|
125
|
+
kopt = k
|
|
119
126
|
end
|
|
120
127
|
end
|
|
121
128
|
|
|
@@ -123,22 +130,22 @@ module Ckmeans
|
|
|
123
130
|
end
|
|
124
131
|
|
|
125
132
|
def shifted_data_variance(ileft, iright)
|
|
126
|
-
sum
|
|
127
|
-
sumsq
|
|
128
|
-
mean
|
|
133
|
+
sum = 0.0
|
|
134
|
+
sumsq = 0.0
|
|
135
|
+
mean = 0.0
|
|
129
136
|
variance = 0.0
|
|
130
|
-
n
|
|
137
|
+
n = iright - ileft + 1
|
|
131
138
|
|
|
132
139
|
if iright >= ileft
|
|
133
|
-
median = xsorted[(ileft + iright) / 2]
|
|
140
|
+
median = xsorted[(ileft + iright) / 2].to_f
|
|
134
141
|
|
|
135
142
|
ileft.upto(iright) do |i|
|
|
136
|
-
sumi
|
|
137
|
-
sum
|
|
143
|
+
sumi = xsorted[i] - median
|
|
144
|
+
sum += sumi
|
|
138
145
|
sumsq += sumi**2
|
|
139
146
|
end
|
|
140
147
|
|
|
141
|
-
mean
|
|
148
|
+
mean = (sum / n) + median
|
|
142
149
|
variance = (sumsq - (sum * sum / n)) / (n - 1) if n > 1
|
|
143
150
|
end
|
|
144
151
|
|
|
@@ -146,12 +153,13 @@ module Ckmeans
|
|
|
146
153
|
end
|
|
147
154
|
|
|
148
155
|
def backtrack(k)
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
156
|
+
return to_enum(__method__, k) unless block_given?
|
|
157
|
+
|
|
158
|
+
right = xcount - 1
|
|
159
|
+
left = nil
|
|
152
160
|
|
|
153
161
|
(k - 1).downto(0) do |q|
|
|
154
|
-
left =
|
|
162
|
+
left = splits[q][right]
|
|
155
163
|
|
|
156
164
|
yield q, left, right
|
|
157
165
|
|
|
@@ -159,7 +167,7 @@ module Ckmeans
|
|
|
159
167
|
end
|
|
160
168
|
end
|
|
161
169
|
|
|
162
|
-
def dissim(j, i
|
|
170
|
+
def dissim(j, i)
|
|
163
171
|
return 0.0 if j >= i
|
|
164
172
|
|
|
165
173
|
sji =
|
|
@@ -174,80 +182,82 @@ module Ckmeans
|
|
|
174
182
|
[0, sji].max
|
|
175
183
|
end
|
|
176
184
|
|
|
177
|
-
def fill_row(q, imin, imax
|
|
185
|
+
def fill_row(q, imin, imax)
|
|
178
186
|
size = imax - q + 1
|
|
179
187
|
|
|
180
188
|
js = Array.new(size) { |i| q + i }
|
|
181
|
-
smawk(imin, imax, 1, q, js
|
|
189
|
+
smawk(imin, imax, 1, q, js)
|
|
182
190
|
end
|
|
183
191
|
|
|
184
|
-
def smawk(imin, imax, istep, q, js
|
|
192
|
+
def smawk(imin, imax, istep, q, js)
|
|
185
193
|
if (imax - imin) <= (0 * istep)
|
|
186
|
-
find_min_from_candidates(q, imin, imax, istep, js
|
|
194
|
+
find_min_from_candidates(q, imin, imax, istep, js)
|
|
187
195
|
else
|
|
188
|
-
js_odd =
|
|
196
|
+
js_odd = prune_candidates(imin, imax, istep, q, js)
|
|
197
|
+
# puts "Pruned: #{js_odd.inspect}"
|
|
189
198
|
istepx2 = istep * 2
|
|
190
199
|
imin_odd = imin + istep
|
|
191
200
|
imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2)
|
|
192
|
-
smawk(imin_odd, imax_odd, istepx2, q, js_odd
|
|
193
|
-
fill_even_positions(imin, imax, istep, q, js
|
|
201
|
+
smawk(imin_odd, imax_odd, istepx2, q, js_odd)
|
|
202
|
+
fill_even_positions(imin, imax, istep, q, js)
|
|
194
203
|
end
|
|
195
204
|
end
|
|
196
205
|
|
|
197
|
-
def find_min_from_candidates(q, imin, imax, istep, js
|
|
198
|
-
|
|
206
|
+
def find_min_from_candidates(q, imin, imax, istep, js)
|
|
207
|
+
optimal_split_index_prev = 0
|
|
199
208
|
|
|
200
209
|
(imin..imax).step(istep) do |i|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
210
|
+
optimal_split_index = optimal_split_index_prev
|
|
211
|
+
optimal_split = js[optimal_split_index]
|
|
212
|
+
cost[q][i] = cost[q - 1][optimal_split - 1] + dissim(optimal_split, i)
|
|
213
|
+
splits[q][i] = optimal_split
|
|
204
214
|
|
|
205
|
-
((
|
|
206
|
-
jabs = js[
|
|
215
|
+
((optimal_split_index + 1)...js.size).each do |split_index|
|
|
216
|
+
jabs = js[split_index]
|
|
207
217
|
|
|
208
|
-
next if jabs <
|
|
218
|
+
next if jabs < splits[q - 1][i]
|
|
209
219
|
break if jabs > i
|
|
210
220
|
|
|
211
|
-
sj =
|
|
221
|
+
sj = cost[q - 1][jabs - 1] + dissim(jabs, i)
|
|
212
222
|
|
|
213
|
-
next unless sj <=
|
|
223
|
+
next unless sj <= cost[q][i]
|
|
214
224
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
225
|
+
cost[q][i] = sj
|
|
226
|
+
splits[q][i] = js[split_index]
|
|
227
|
+
optimal_split_index_prev = split_index
|
|
218
228
|
end
|
|
219
229
|
end
|
|
220
230
|
end
|
|
221
231
|
|
|
222
|
-
def
|
|
232
|
+
def prune_candidates(imin, imax, istep, q, js)
|
|
223
233
|
n = ((imax - imin) / istep) + 1
|
|
224
234
|
m = js.size
|
|
225
235
|
|
|
226
236
|
return js if n >= m
|
|
227
237
|
|
|
228
|
-
|
|
238
|
+
pruned = js.dup
|
|
229
239
|
left = -1
|
|
230
240
|
right = 0
|
|
231
241
|
|
|
232
242
|
while m > n
|
|
233
|
-
p
|
|
234
|
-
i
|
|
235
|
-
j
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
if (sl <
|
|
243
|
+
p = left + 1
|
|
244
|
+
i = imin + (p * istep)
|
|
245
|
+
j = pruned[right]
|
|
246
|
+
jnext = pruned[right + 1]
|
|
247
|
+
sl = cost[q - 1][j - 1] + dissim(j, i)
|
|
248
|
+
snext = cost[q - 1][jnext - 1] + dissim(jnext, i)
|
|
249
|
+
|
|
250
|
+
if (sl < snext) && (p < n - 1)
|
|
241
251
|
left += 1
|
|
242
|
-
|
|
252
|
+
pruned[left] = j
|
|
243
253
|
right += 1
|
|
244
|
-
elsif (sl <
|
|
254
|
+
elsif (sl < snext) && (p == n - 1)
|
|
245
255
|
right += 1
|
|
246
|
-
|
|
256
|
+
pruned[right] = j
|
|
247
257
|
m -= 1
|
|
248
258
|
else
|
|
249
259
|
if p > 0
|
|
250
|
-
|
|
260
|
+
pruned[right] = pruned[left]
|
|
251
261
|
left -= 1
|
|
252
262
|
else
|
|
253
263
|
right += 1
|
|
@@ -258,15 +268,15 @@ module Ckmeans
|
|
|
258
268
|
end
|
|
259
269
|
|
|
260
270
|
((left + 1)...m).each do |r|
|
|
261
|
-
|
|
271
|
+
pruned[r] = pruned[right]
|
|
262
272
|
right += 1
|
|
263
273
|
end
|
|
264
274
|
|
|
265
|
-
|
|
266
|
-
|
|
275
|
+
pruned.slice!(m..-1) if pruned.size > m
|
|
276
|
+
pruned
|
|
267
277
|
end
|
|
268
278
|
|
|
269
|
-
def fill_even_positions(imin, imax, istep, q, js
|
|
279
|
+
def fill_even_positions(imin, imax, istep, q, js)
|
|
270
280
|
n = js.size
|
|
271
281
|
istepx2 = istep * 2
|
|
272
282
|
jl = js[0]
|
|
@@ -276,11 +286,11 @@ module Ckmeans
|
|
|
276
286
|
while i <= imax
|
|
277
287
|
r += 1 while js[r] < jl
|
|
278
288
|
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
jh
|
|
282
|
-
jmax
|
|
283
|
-
sjimin
|
|
289
|
+
cost[q][i] = cost[q - 1][js[r] - 1] + dissim(js[r], i)
|
|
290
|
+
splits[q][i] = js[r]
|
|
291
|
+
jh = (i + istep) <= imax ? splits[q][i + istep] : js[n - 1]
|
|
292
|
+
jmax = [jh, i].min
|
|
293
|
+
sjimin = dissim(jmax, i)
|
|
284
294
|
|
|
285
295
|
r += 1
|
|
286
296
|
while r < n && js[r] <= jmax
|
|
@@ -288,18 +298,18 @@ module Ckmeans
|
|
|
288
298
|
|
|
289
299
|
break if jabs > i
|
|
290
300
|
|
|
291
|
-
if jabs <
|
|
301
|
+
if jabs < splits[q - 1][i]
|
|
292
302
|
r += 1
|
|
293
303
|
next
|
|
294
304
|
end
|
|
295
305
|
|
|
296
|
-
|
|
297
|
-
sj
|
|
306
|
+
cost_base = cost[q - 1][jabs - 1]
|
|
307
|
+
sj = cost_base + dissim(jabs, i)
|
|
298
308
|
|
|
299
|
-
if sj <=
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
elsif
|
|
309
|
+
if sj <= cost[q][i]
|
|
310
|
+
cost[q][i] = sj
|
|
311
|
+
splits[q][i] = jabs
|
|
312
|
+
elsif cost_base + sjimin > cost[q][i]
|
|
303
313
|
break
|
|
304
314
|
end
|
|
305
315
|
|
|
@@ -314,3 +324,5 @@ module Ckmeans
|
|
|
314
324
|
end
|
|
315
325
|
end
|
|
316
326
|
end
|
|
327
|
+
|
|
328
|
+
require "ckmeans/extensions"
|
data/lib/ckmeans/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ckmeans
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 1.0.0.rc
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vlad Lebedev
|
|
8
8
|
bindir: exe
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2025-
|
|
10
|
+
date: 2025-04-22 00:00:00.000000000 Z
|
|
11
11
|
dependencies: []
|
|
12
12
|
description: Repeatable clustering of unidimensional data
|
|
13
13
|
email:
|
|
@@ -24,6 +24,8 @@ files:
|
|
|
24
24
|
- LICENSE
|
|
25
25
|
- README.md
|
|
26
26
|
- Rakefile
|
|
27
|
+
- ext/ckmeans/extconf.rb
|
|
28
|
+
- ext/ckmeans/extensions.c
|
|
27
29
|
- lib/ckmeans.rb
|
|
28
30
|
- lib/ckmeans/clusterer.rb
|
|
29
31
|
- lib/ckmeans/version.rb
|