ckmeans 2.1.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/ckmeans/algorithm.inc +281 -0
- data/ext/ckmeans/dissimilarity.h +65 -0
- data/ext/ckmeans/extensions.c +22 -341
- data/lib/ckmeans/clusterer.rb +6 -6
- data/lib/ckmeans/version.rb +1 -1
- data/lib/ckmedian/clusterer.rb +6 -6
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 231337d4b73a838b8a9326936c4ae4c003db103108b6622c22a09a6a44bf4e31
|
|
4
|
+
data.tar.gz: c0d97cd2fd4b1fd6693305900e9e329e218993fb735b9d24cd1c7281eda14e8a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 04be90532ac9498a184025d849b14a0d34fb61d00352c818a6e7495ae753666689ecef0d542fc6e95f118e5c83fcc29560baee2b339762158d47811a1d4164cd
|
|
7
|
+
data.tar.gz: '019cc8ff7f3fa2648faa03997fc88489718e944d13760b1e28eed3c4e48d48da56fd87ced9e5d4499e9c6cfb962cba6c02889dd0d3a53fd88ccb0a0396e00551'
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
/* SMAWK algorithm implementation template
|
|
2
|
+
*
|
|
3
|
+
* This file is designed to be included multiple times with different DISSIM_SUFFIX and DISSIM macro definitions
|
|
4
|
+
* to generate L1 and L2 specific versions of the clustering algorithm.
|
|
5
|
+
*
|
|
6
|
+
* Before including this file, define:
|
|
7
|
+
* DISSIM_SUFFIX - suffix for function names (e.g., l1, l2)
|
|
8
|
+
* DISSIM(j, i, xsum, xsumsq) - macro that calls the dissimilarity function
|
|
9
|
+
*
|
|
10
|
+
* Example:
|
|
11
|
+
* #define DISSIM_SUFFIX l2
|
|
12
|
+
* #define DISSIM(j, i, xsum, xsumsq) dissimilarity_l2(j, i, xsum, xsumsq)
|
|
13
|
+
* #include "algorithm.inc"
|
|
14
|
+
* #undef DISSIM
|
|
15
|
+
* #undef DISSIM_SUFFIX
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
#ifndef DISSIM_SUFFIX
|
|
19
|
+
#error "DISSIM_SUFFIX must be defined before including algorithm.inc"
|
|
20
|
+
#endif
|
|
21
|
+
|
|
22
|
+
#ifndef DISSIM
|
|
23
|
+
#error "DISSIM must be defined before including algorithm.inc"
|
|
24
|
+
#endif
|
|
25
|
+
|
|
26
|
+
/* Helper macros for token pasting */
|
|
27
|
+
#define CONCAT_IMPL(a, b) a##_##b
|
|
28
|
+
#define CONCAT(a, b) CONCAT_IMPL(a, b)
|
|
29
|
+
#define FUNC_NAME(name) CONCAT(name, DISSIM_SUFFIX)
|
|
30
|
+
|
|
31
|
+
/*
 * Fills row `rparams.row` of the cost and splits matrices for the columns
 * imin, imin + 2*istep, imin + 4*istep, ... up to imax.
 *
 * The columns in between (imin + istep, imin + 3*istep, ...) are read, not
 * written: the split already stored at column `col + istep` is used as the
 * upper bound of the candidate scan for column `col`. smawk() fills those
 * columns through its recursive call before invoking this function.
 *
 * state            - shared matrices (cost, splits) and prefix sums (xsum, xsumsq)
 * rparams          - row index and the column range/step to process
 * split_candidates - candidate split positions, scanned left to right
 */
static inline void FUNC_NAME(fill_even_positions)(State state, RowParams rparams, VectorI *split_candidates)
{
    const uint32_t row = rparams.row;
    const uint32_t col_last = rparams.imax;
    const uint32_t step = rparams.istep;
    const uint32_t stride = step * 2;
    const uint32_t candidate_count = split_candidates->size;
    VectorF *const xsum = state.xsum;
    VectorF *const xsumsq = state.xsumsq;
    MatrixF *const costs = state.cost;
    MatrixI *const splits = state.splits;

    /* Lowest split value still worth considering; tightened after every column. */
    uint32_t lower_bound = vector_get_i(split_candidates, 0);
    uint32_t cursor = 0;

    for (uint32_t col = rparams.imin; col <= col_last; col += stride) {
        /* Advance to the first candidate that is not below the lower bound. */
        while (vector_get_i(split_candidates, cursor) < lower_bound) {
            cursor++;
        }

        /* Seed the cell with that first admissible candidate. */
        const uint32_t first_split = vector_get_i(split_candidates, cursor);
        const LDouble first_cost = matrix_get_f(costs, row - 1, first_split - 1)
                                 + DISSIM(first_split, col, xsum, xsumsq);

        matrix_set_f(costs, row, col, first_cost);
        matrix_set_i(splits, row, col, first_split);

        /* Upper bound: the split of the next in-between column if there is one,
         * otherwise the last candidate. */
        uint32_t upper_bound;
        if ((col + step) <= col_last) {
            upper_bound = matrix_get_i(splits, row, col + step);
        } else {
            upper_bound = vector_get_i(split_candidates, candidate_count - 1);
        }

        const uint32_t split_cap = (upper_bound < col) ? upper_bound : col;
        /* Dissimilarity of the segment starting at the largest allowed split;
         * used below as the early-exit bound. */
        const LDouble min_tail_cost = DISSIM(split_cap, col, xsum, xsumsq);

        for (++cursor;
             cursor < candidate_count && vector_get_i(split_candidates, cursor) <= split_cap;
             cursor++) {
            const uint32_t split = vector_get_i(split_candidates, cursor);

            if (split > col) {
                break;
            }
            if (split < matrix_get_i(splits, row - 1, col)) {
                continue;
            }

            const LDouble base = matrix_get_f(costs, row - 1, split - 1);
            const LDouble candidate_cost = base + DISSIM(split, col, xsum, xsumsq);
            const LDouble best_so_far = matrix_get_f(costs, row, col);

            if (candidate_cost <= best_so_far) {
                matrix_set_f(costs, row, col, candidate_cost);
                matrix_set_i(splits, row, col, split);
                continue;
            }
            if (base + min_tail_cost > best_so_far) {
                break;
            }
        }

        /* Step back one candidate so the next column re-examines it. */
        cursor--;
        lower_bound = upper_bound;
    }
}
|
|
85
|
+
|
|
86
|
+
/*
 * Base case of the recursion in smawk(): for every column
 * i = imin, imin + istep, ... <= imax, scans the candidate splits directly
 * and stores the cheapest cost and its split in row `rparams.row`.
 *
 * The scan for each column starts at the candidate index that last improved
 * a cell, so candidates before it are not revisited.
 *
 * state            - shared matrices (cost, splits) and prefix sums (xsum, xsumsq)
 * rparams          - row index and the column range/step to process
 * split_candidates - candidate split positions, scanned left to right
 */
static inline void FUNC_NAME(find_min_from_candidates)(State state, RowParams rparams, VectorI *split_candidates)
{
    const uint32_t row = rparams.row;
    const uint32_t imin = rparams.imin;
    const uint32_t imax = rparams.imax;
    const uint32_t istep = rparams.istep;
    MatrixF *const cost = state.cost;
    MatrixI *const splits = state.splits;

    uint32_t optimal_split_idx_prev = 0;

    for (uint32_t i = imin; i <= imax; i += istep) {
        const uint32_t optimal_split_idx = optimal_split_idx_prev;
        const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
        /* Keep the previous-row cost in floating point. It used to be stored
         * in a uint32_t, which dropped the fractional part and made the
         * seeded cost too low compared with the candidates scanned below. */
        const LDouble cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
        const LDouble added_cost = DISSIM(optimal_split, i, state.xsum, state.xsumsq);

        /* Seed the cell with the starting candidate before scanning the rest. */
        matrix_set_f(cost, row, i, cost_prev + added_cost);
        matrix_set_i(splits, row, i, optimal_split);

        for (uint32_t r = optimal_split_idx + 1; r < split_candidates->size; r++) {
            const uint32_t split = vector_get_i(split_candidates, r);

            /* Skip splits below the one chosen for this column in the previous row. */
            if (split < matrix_get_i(splits, row - 1, i)) continue;
            /* Candidates past the column itself end the scan. */
            if (split > i) break;

            const LDouble split_cost =
                matrix_get_f(cost, row - 1, split - 1) + DISSIM(split, i, state.xsum, state.xsumsq);

            if (split_cost > matrix_get_f(cost, row, i)) continue;

            /* Ties are taken as well, so the later candidate wins on equal cost. */
            matrix_set_f(cost, row, i, split_cost);
            matrix_set_i(splits, row, i, split);
            optimal_split_idx_prev = r;
        }
    }
}
|
|
124
|
+
|
|
125
|
+
/*
 * Candidate-pruning step used by smawk(): reduces the candidate list so that
 * it holds no more entries than there are columns in the range described by
 * rparams.
 *
 * Returns split_candidates itself, untouched, when it is already short
 * enough. Otherwise returns a copy allocated from state.arena, compacted in
 * place and downsized to the number of surviving candidates.
 */
static inline VectorI *FUNC_NAME(prune_candidates)(State state, RowParams rparams, VectorI *split_candidates)
{
    const uint32_t row = rparams.row;
    const uint32_t col_first = rparams.imin;
    const uint32_t step = rparams.istep;
    const uint32_t column_count = ((rparams.imax - col_first) / step) + 1;
    uint32_t remaining = split_candidates->size;

    if (column_count >= remaining) {
        return split_candidates;
    }

    VectorI *pruned = vector_dup_i(split_candidates, state.arena);
    uint32_t kept = 0;   /* number of candidates accepted so far (write position) */
    uint32_t cursor = 0; /* candidate currently being examined (read position) */

    while (remaining > column_count) {
        const uint32_t col = col_first + kept * step;
        const uint32_t current = vector_get_i(pruned, cursor);
        const uint32_t next = vector_get_i(pruned, cursor + 1);
        const LDouble current_cost =
            matrix_get_f(state.cost, row - 1, current - 1) + DISSIM(current, col, state.xsum, state.xsumsq);
        const LDouble next_cost =
            matrix_get_f(state.cost, row - 1, next - 1) + DISSIM(next, col, state.xsum, state.xsumsq);
        const int current_wins = (current_cost < next_cost);

        if (current_wins && (kept < column_count - 1)) {
            /* Current candidate beats its successor and there is room: accept it. */
            vector_set_i(pruned, kept, current);
            kept++;
            cursor++;
            continue;
        }

        if (current_wins && (kept == column_count - 1)) {
            /* Last slot: drop the successor by overwriting it with the current one. */
            cursor++;
            remaining--;
            vector_set_i(pruned, cursor, current);
            continue;
        }

        /* Current candidate does not win: discard it and, if possible, bring
         * back the most recently accepted one for re-examination. */
        if (kept > 0) {
            kept--;
            vector_set_i(pruned, cursor, vector_get_i(pruned, kept));
        } else {
            cursor++;
        }
        remaining--;
    }

    /* Move the unexamined tail so it follows the accepted prefix. */
    uint32_t src = cursor;
    for (uint32_t dst = kept; dst < remaining; dst++, src++) {
        vector_set_i(pruned, dst, vector_get_i(pruned, src));
    }

    vector_downsize_i(pruned, remaining);

    return pruned;
}
|
|
174
|
+
|
|
175
|
+
/*
 * Recursive driver for one row of the cost/splits matrices.
 *
 * A range holding a single column (imin == imax) is solved directly by
 * find_min_from_candidates. Otherwise the candidates are pruned, the
 * columns imin + istep, imin + 3*istep, ... are solved recursively with a
 * doubled step, and fill_even_positions then fills the remaining columns.
 */
void FUNC_NAME(smawk)(State state, RowParams rparams, VectorI *split_candidates)
{
    const uint32_t col_first = rparams.imin;
    const uint32_t col_last = rparams.imax;
    const uint32_t step = rparams.istep;
    const uint32_t span = col_last - col_first;

    /* Unsigned span: "<= 0" in the original form is an equality test. */
    if (span == 0) {
        FUNC_NAME(find_min_from_candidates)(state, rparams, split_candidates);
        return;
    }

    VectorI *pruned = FUNC_NAME(prune_candidates)(state, rparams, split_candidates);

    const uint32_t stride = step * 2;
    const uint32_t sub_first = col_first + step;
    /* Last column reachable from sub_first in whole strides without passing col_last. */
    const uint32_t sub_last = sub_first + ((col_last - sub_first) / stride * stride);
    RowParams sub_params = {
        .row = rparams.row,
        .imin = sub_first,
        .imax = sub_last,
        .istep = stride
    };

    FUNC_NAME(smawk)(state, sub_params, pruned);
    /* The remaining columns are filled from the full, unpruned candidate list. */
    FUNC_NAME(fill_even_positions)(state, rparams, split_candidates);
}
|
|
194
|
+
|
|
195
|
+
/*
 * Fills row q of the cost/splits matrices for columns imin..imax.
 *
 * Builds the candidate list q, q + 1, ..., imax in state.arena and hands it
 * to smawk with a column step of 1.
 */
void FUNC_NAME(fill_row)(State state, uint32_t q, uint32_t imin, uint32_t imax)
{
    const uint32_t candidate_count = imax - q + 1;
    VectorI *candidates = vector_create_i(state.arena, candidate_count);

    uint32_t split = q;
    for (uint32_t slot = 0; slot < candidate_count; slot++, split++) {
        vector_set_i(candidates, slot, split);
    }

    RowParams row_params = {
        .row = q,
        .imin = imin,
        .imax = imax,
        .istep = 1
    };

    FUNC_NAME(smawk)(state, row_params, candidates);
}
|
|
205
|
+
|
|
206
|
+
/*
 * Ruby entry point shared by the clusterers: computes the cluster sizes for
 * the receiver's sorted data.
 *
 * Reads @xcount, @kmin, @kmax and @xsorted from `self`, builds the cost and
 * splits matrices in a temporary arena, asks `find_koptimal` for the cluster
 * count and returns a Ruby Array with the sizes produced by backtrack_sizes.
 * The arena is destroyed before returning.
 *
 * Raises NoMemoryError when the arena cannot be created.
 */
VALUE FUNC_NAME(rb_sorted_group_sizes)(VALUE self, FnFindKOptimal *find_koptimal)
{
    const uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
    const uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
    const uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
    VALUE rb_xsorted = rb_iv_get(self, "@xsorted");

    const size_t capacity =
        sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
    Arena *arena = arena_create(capacity);

    if (arena == NULL) {
        rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
    }

    MatrixF *cost = matrix_create_f(arena, kmax, xcount);
    MatrixI *splits = matrix_create_i(arena, kmax, xcount);
    VectorF *xsorted = vector_create_f(arena, xcount);
    VectorF *xsum = vector_create_f(arena, xcount);
    VectorF *xsumsq = vector_create_f(arena, xcount);

    /* Copy the Ruby numbers into the arena-backed vector. */
    for (uint32_t idx = 0; idx < xcount; idx++) {
        vector_set_f(xsorted, idx, NUM2DBL(rb_ary_entry(rb_xsorted, idx)));
    }

    State state = {
        .arena = arena,
        .xcount = xcount,
        .kmin = kmin,
        .kmax = kmax,
        .xsorted = xsorted,
        .cost = cost,
        .splits = splits,
        .xsum = xsum,
        .xsumsq = xsumsq
    };

    /* Prefix sums are taken over values shifted by the middle element. */
    const LDouble shift = vector_get_f(xsorted, xcount / 2);
    const LDouble first_diff = vector_get_f(xsorted, 0) - shift;

    vector_set_f(xsum, 0, first_diff);
    vector_set_f(xsumsq, 0, first_diff * first_diff);

    /* Running sums plus row 0 of the matrices (single cluster starting at 0). */
    for (uint32_t idx = 1; idx < xcount; idx++) {
        const LDouble diff = vector_get_f(xsorted, idx) - shift;
        const LDouble sum_before = vector_get_f(xsum, idx - 1);
        const LDouble sumsq_before = vector_get_f(xsumsq, idx - 1);

        vector_set_f(xsum, idx, sum_before + diff);
        vector_set_f(xsumsq, idx, sumsq_before + diff * diff);
        matrix_set_f(cost, 0, idx, DISSIM(0, idx, xsum, xsumsq));
        matrix_set_i(splits, 0, idx, 0);
    }

    /* Remaining rows; the last row only needs its final column. */
    for (uint32_t q = 1; q <= kmax - 1; q++) {
        uint32_t first_col;
        if (q < kmax - 1) {
            first_col = (q > 1) ? q : 1;
        } else {
            first_col = xcount - 1;
        }
        FUNC_NAME(fill_row)(state, q, first_col, xcount - 1);
    }

    const uint32_t koptimal = find_koptimal(state);

    VectorI *sizes = vector_create_i(arena, koptimal);
    backtrack_sizes(state, sizes, koptimal);

    /* Convert to a Ruby Array before the arena (and `sizes` with it) goes away. */
    VALUE response = rb_ary_new2(sizes->size);
    for (uint32_t idx = 0; idx < sizes->size; idx++) {
        rb_ary_store(response, idx, LONG2NUM(vector_get_i(sizes, idx)));
    }

    arena_destroy(arena);

    return response;
}
|
|
278
|
+
|
|
279
|
+
#undef CONCAT_IMPL
|
|
280
|
+
#undef CONCAT
|
|
281
|
+
#undef FUNC_NAME
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
#ifndef DISSIMILARITY_H
|
|
2
|
+
#define DISSIMILARITY_H
|
|
3
|
+
|
|
4
|
+
/* L2 aka Euclidean aka Mean dissimilarity criteria */
|
|
5
|
+
static inline LDouble dissimilarity_l2(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq)
|
|
6
|
+
{
|
|
7
|
+
LDouble sji = 0.0;
|
|
8
|
+
|
|
9
|
+
if (j >= i) return sji;
|
|
10
|
+
|
|
11
|
+
if (j > 0) {
|
|
12
|
+
LDouble segment_diff = vector_get_diff_f(xsum, i, j - 1);
|
|
13
|
+
uint32_t segment_size = i - j + 1;
|
|
14
|
+
sji = vector_get_diff_f(xsumsq, i, j - 1) - (segment_diff * segment_diff / segment_size);
|
|
15
|
+
} else {
|
|
16
|
+
LDouble xsumi = vector_get_f(xsum, i);
|
|
17
|
+
sji = vector_get_f(xsumsq, i) - (xsumi * xsumi / (i + 1));
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
return (sji > 0) ? sji : 0.0;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/* L1 aka Manhattan aka Median dissimilarity criteria */
|
|
24
|
+
static inline LDouble dissimilarity_l1(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict _xsumsq)
|
|
25
|
+
{
|
|
26
|
+
LDouble sji = 0.0;
|
|
27
|
+
|
|
28
|
+
if (j >= i) return sji;
|
|
29
|
+
|
|
30
|
+
if (j > 0) {
|
|
31
|
+
uint32_t median_idx = (i + j) >> 1;
|
|
32
|
+
|
|
33
|
+
if (((i - j + 1) % 2) == 1) {
|
|
34
|
+
sji =
|
|
35
|
+
- vector_get_f(xsum, median_idx - 1)
|
|
36
|
+
+ vector_get_f(xsum, j - 1)
|
|
37
|
+
+ vector_get_f(xsum, i)
|
|
38
|
+
- vector_get_f(xsum, median_idx);
|
|
39
|
+
} else {
|
|
40
|
+
sji =
|
|
41
|
+
- vector_get_f(xsum, median_idx)
|
|
42
|
+
+ vector_get_f(xsum, j - 1)
|
|
43
|
+
+ vector_get_f(xsum, i)
|
|
44
|
+
- vector_get_f(xsum, median_idx);
|
|
45
|
+
}
|
|
46
|
+
} else { // j == 0
|
|
47
|
+
uint32_t median_idx = i >> 1;
|
|
48
|
+
|
|
49
|
+
if (((i + 1) % 2) == 1) {
|
|
50
|
+
sji =
|
|
51
|
+
- vector_get_f(xsum, median_idx - 1)
|
|
52
|
+
+ vector_get_f(xsum, i)
|
|
53
|
+
- vector_get_f(xsum, median_idx);
|
|
54
|
+
} else {
|
|
55
|
+
sji =
|
|
56
|
+
- vector_get_f(xsum, median_idx)
|
|
57
|
+
+ vector_get_f(xsum, i)
|
|
58
|
+
- vector_get_f(xsum, median_idx);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
return (sji < 0) ? 0.0 : sji;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
#endif /* DISSIMILARITY_H */
|
data/ext/ckmeans/extensions.c
CHANGED
|
@@ -33,8 +33,6 @@ typedef struct VectorI {
|
|
|
33
33
|
uint32_t *values;
|
|
34
34
|
} VectorI;
|
|
35
35
|
|
|
36
|
-
typedef LDouble (FnDissim)(uint32_t, uint32_t, VectorF*, VectorF*);
|
|
37
|
-
|
|
38
36
|
typedef struct State {
|
|
39
37
|
uint32_t xcount;
|
|
40
38
|
uint32_t kmin;
|
|
@@ -45,7 +43,6 @@ typedef struct State {
|
|
|
45
43
|
MatrixI *splits;
|
|
46
44
|
VectorF *xsum;
|
|
47
45
|
VectorF *xsumsq;
|
|
48
|
-
FnDissim *dissim;
|
|
49
46
|
} State;
|
|
50
47
|
|
|
51
48
|
typedef struct RowParams {
|
|
@@ -64,7 +61,6 @@ typedef struct {
|
|
|
64
61
|
|
|
65
62
|
VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
|
|
66
63
|
VALUE rb_ckmedian_sorted_group_sizes(VALUE self);
|
|
67
|
-
VALUE rb_sorted_group_sizes(VALUE self, FnDissim*, FnFindKOptimal*);
|
|
68
64
|
|
|
69
65
|
Arena *arena_create(size_t);
|
|
70
66
|
void *arena_alloc(Arena*, size_t);
|
|
@@ -91,13 +87,6 @@ uint32_t vector_get_i(VectorI*, uint32_t offset);
|
|
|
91
87
|
void vector_downsize_i(VectorI*, uint32_t);
|
|
92
88
|
void vector_inspect_i(VectorI*);
|
|
93
89
|
|
|
94
|
-
LDouble dissimilarity_l2(uint32_t, uint32_t, VectorF*, VectorF*);
|
|
95
|
-
LDouble dissimilarity_l1(uint32_t, uint32_t, VectorF*, VectorF*);
|
|
96
|
-
void fill_row(State, uint32_t, uint32_t, uint32_t);
|
|
97
|
-
void smawk(State, RowParams, VectorI*);
|
|
98
|
-
void find_min_from_candidates(State, RowParams, VectorI*);
|
|
99
|
-
VectorI *prune_candidates(State, RowParams, VectorI*);
|
|
100
|
-
void fill_even_positions(State, RowParams, VectorI*);
|
|
101
90
|
SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
|
|
102
91
|
VectorI *backtrack_sizes(State, VectorI*, uint32_t);
|
|
103
92
|
uint32_t find_koptimal_fast(State);
|
|
@@ -118,99 +107,34 @@ void Init_extensions(void) {
|
|
|
118
107
|
# define ALLOCATION_FACTOR 3
|
|
119
108
|
# define PIx2 (M_PI * 2.0)
|
|
120
109
|
|
|
110
|
+
#include "dissimilarity.h"
|
|
111
|
+
|
|
112
|
+
/* L2-specific versions of all hot-path functions */
|
|
113
|
+
#define DISSIM_SUFFIX l2
|
|
114
|
+
#define DISSIM(j, i, xsum, xsumsq) dissimilarity_l2(j, i, xsum, xsumsq)
|
|
115
|
+
#include "algorithm.inc"
|
|
116
|
+
#undef DISSIM
|
|
117
|
+
#undef DISSIM_SUFFIX
|
|
118
|
+
|
|
119
|
+
/* L1-specific versions of all hot-path functions */
|
|
120
|
+
#define DISSIM_SUFFIX l1
|
|
121
|
+
#define DISSIM(j, i, xsum, xsumsq) dissimilarity_l1(j, i, xsum, xsumsq)
|
|
122
|
+
#include "algorithm.inc"
|
|
123
|
+
#undef DISSIM
|
|
124
|
+
#undef DISSIM_SUFFIX
|
|
125
|
+
|
|
121
126
|
VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
|
|
122
127
|
{
|
|
123
|
-
bool
|
|
124
|
-
FnFindKOptimal *find_k =
|
|
125
|
-
return
|
|
128
|
+
bool use_stable = RTEST(rb_iv_get(self, "@use_stable_estimation"));
|
|
129
|
+
FnFindKOptimal *find_k = use_stable ? find_koptimal_gmm : find_koptimal_fast;
|
|
130
|
+
return rb_sorted_group_sizes_l2(self, find_k);
|
|
126
131
|
}
|
|
127
132
|
|
|
128
133
|
VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
|
|
129
134
|
{
|
|
130
|
-
bool
|
|
131
|
-
FnFindKOptimal *find_k =
|
|
132
|
-
return
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria, FnFindKOptimal *find_koptimal)
|
|
136
|
-
{
|
|
137
|
-
uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
|
|
138
|
-
uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
|
|
139
|
-
uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
|
|
140
|
-
VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
|
|
141
|
-
size_t capacity = sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
|
|
142
|
-
Arena *arena = arena_create(capacity);
|
|
143
|
-
|
|
144
|
-
if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
|
|
145
|
-
|
|
146
|
-
MatrixF *cost = matrix_create_f(arena, kmax, xcount);
|
|
147
|
-
MatrixI *splits = matrix_create_i(arena, kmax, xcount);
|
|
148
|
-
VectorF *xsorted = vector_create_f(arena, xcount);
|
|
149
|
-
VectorF *xsum = vector_create_f(arena, xcount);
|
|
150
|
-
VectorF *xsumsq = vector_create_f(arena, xcount);
|
|
151
|
-
|
|
152
|
-
for (uint32_t i = 0; i < xcount; i++) {
|
|
153
|
-
LDouble xi = NUM2DBL(rb_ary_entry(rb_xsorted, i));
|
|
154
|
-
vector_set_f(xsorted, i, xi);
|
|
155
|
-
}
|
|
156
|
-
|
|
157
|
-
State state = {
|
|
158
|
-
.arena = arena,
|
|
159
|
-
.xcount = xcount,
|
|
160
|
-
.kmin = kmin,
|
|
161
|
-
.kmax = kmax,
|
|
162
|
-
.xsorted = xsorted,
|
|
163
|
-
.cost = cost,
|
|
164
|
-
.splits = splits,
|
|
165
|
-
.xsum = xsum,
|
|
166
|
-
.xsumsq = xsumsq,
|
|
167
|
-
.dissim = criteria
|
|
168
|
-
};
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
LDouble shift = vector_get_f(xsorted, xcount / 2);
|
|
172
|
-
LDouble diff_initial = vector_get_f(xsorted, 0) - shift;
|
|
173
|
-
|
|
174
|
-
vector_set_f(xsum, 0, diff_initial);
|
|
175
|
-
vector_set_f(xsumsq, 0, diff_initial * diff_initial);
|
|
176
|
-
|
|
177
|
-
for (uint32_t i = 1; i < xcount; i++) {
|
|
178
|
-
LDouble xi = vector_get_f(xsorted, i);
|
|
179
|
-
LDouble xsum_prev = vector_get_f(xsum, i - 1);
|
|
180
|
-
LDouble xsumsq_prev = vector_get_f(xsumsq, i - 1);
|
|
181
|
-
LDouble diff = xi - shift;
|
|
182
|
-
|
|
183
|
-
vector_set_f(xsum, i, xsum_prev + diff);
|
|
184
|
-
vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
|
|
185
|
-
matrix_set_f(cost, 0, i, criteria(0, i, xsum, xsumsq));
|
|
186
|
-
matrix_set_i(splits, 0, i, 0);
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
for (uint32_t q = 1; q <= kmax - 1; q++) {
|
|
190
|
-
uint32_t imin = (q < kmax - 1) ? ((q > 1) ? q : 1) : xcount - 1;
|
|
191
|
-
fill_row(state, q, imin, xcount - 1);
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
uint32_t koptimal = find_koptimal(state);
|
|
195
|
-
|
|
196
|
-
VectorI *sizes = vector_create_i(arena, koptimal);
|
|
197
|
-
backtrack_sizes(state, sizes, koptimal);
|
|
198
|
-
|
|
199
|
-
/* printf("XSORTED \t"); vector_inspect_f(xsorted); */
|
|
200
|
-
/* printf("K OPTIMAL: %lld\n", koptimal); */
|
|
201
|
-
/* printf("SIZES \t"); vector_inspect_i(sizes); */
|
|
202
|
-
/* printf("FINAL COST\n"); matrix_inspect_f(cost); */
|
|
203
|
-
/* printf("FINAL SPLITS\n"); matrix_inspect_i(splits); */
|
|
204
|
-
|
|
205
|
-
VALUE response = rb_ary_new2(sizes->size);
|
|
206
|
-
for (uint32_t i = 0; i < sizes->size; i++) {
|
|
207
|
-
VALUE size = LONG2NUM(vector_get_i(sizes, i));
|
|
208
|
-
rb_ary_store(response, i, size);
|
|
209
|
-
}
|
|
210
|
-
|
|
211
|
-
arena_destroy(arena);
|
|
212
|
-
|
|
213
|
-
return response;
|
|
135
|
+
bool use_stable = RTEST(rb_iv_get(self, "@use_stable_estimation"));
|
|
136
|
+
FnFindKOptimal *find_k = use_stable ? find_koptimal_lmm : find_koptimal_fast;
|
|
137
|
+
return rb_sorted_group_sizes_l1(self, find_k);
|
|
214
138
|
}
|
|
215
139
|
|
|
216
140
|
uint32_t find_koptimal_fast(State state)
|
|
@@ -536,249 +460,6 @@ SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t rig
|
|
|
536
460
|
return stats;
|
|
537
461
|
}
|
|
538
462
|
|
|
539
|
-
void fill_row(State state, uint32_t q, uint32_t imin, uint32_t imax)
|
|
540
|
-
{
|
|
541
|
-
uint32_t size = imax - q + 1;
|
|
542
|
-
VectorI *split_candidates = vector_create_i(state.arena, size);
|
|
543
|
-
for (uint32_t i = 0; i < size; i++) {
|
|
544
|
-
vector_set_i(split_candidates, i, q + i);
|
|
545
|
-
}
|
|
546
|
-
RowParams rparams = { .row = q, .imin = imin, .imax = imax, .istep = 1 };
|
|
547
|
-
smawk(state, rparams, split_candidates);
|
|
548
|
-
}
|
|
549
|
-
|
|
550
|
-
void smawk(State state, RowParams rparams, VectorI *split_candidates)
|
|
551
|
-
{
|
|
552
|
-
const uint32_t imin = rparams.imin;
|
|
553
|
-
const uint32_t imax = rparams.imax;
|
|
554
|
-
const uint32_t istep = rparams.istep;
|
|
555
|
-
|
|
556
|
-
if ((imax - imin) <= (0 * istep)) {
|
|
557
|
-
find_min_from_candidates(state, rparams, split_candidates);
|
|
558
|
-
} else {
|
|
559
|
-
VectorI *odd_candidates = prune_candidates(state, rparams, split_candidates);
|
|
560
|
-
/* printf("PRUNED\t"); vector_inspect_i(odd_candidates); */
|
|
561
|
-
uint32_t istepx2 = istep * 2;
|
|
562
|
-
uint32_t imin_odd = imin + istep;
|
|
563
|
-
uint32_t imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2);
|
|
564
|
-
RowParams rparams_odd = { .row = rparams.row, .imin = imin_odd, .imax = imax_odd, .istep = istepx2 };
|
|
565
|
-
|
|
566
|
-
smawk(state, rparams_odd, odd_candidates);
|
|
567
|
-
fill_even_positions(state, rparams, split_candidates);
|
|
568
|
-
}
|
|
569
|
-
}
|
|
570
|
-
|
|
571
|
-
inline void fill_even_positions(State state, RowParams rparams, VectorI *split_candidates)
|
|
572
|
-
{
|
|
573
|
-
uint32_t row = rparams.row;
|
|
574
|
-
uint32_t imin = rparams.imin;
|
|
575
|
-
uint32_t imax = rparams.imax;
|
|
576
|
-
uint32_t istep = rparams.istep;
|
|
577
|
-
uint32_t n = split_candidates->size;
|
|
578
|
-
uint32_t istepx2 = istep * 2;
|
|
579
|
-
uint32_t jl = vector_get_i(split_candidates, 0);
|
|
580
|
-
VectorF *const xsum = state.xsum;
|
|
581
|
-
VectorF *const xsumsq = state.xsumsq;
|
|
582
|
-
MatrixI *const splits = state.splits;
|
|
583
|
-
FnDissim *const dissim = state.dissim;
|
|
584
|
-
|
|
585
|
-
for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
|
|
586
|
-
while (vector_get_i(split_candidates, r) < jl) r++;
|
|
587
|
-
|
|
588
|
-
uint32_t rcandidate = vector_get_i(split_candidates, r);
|
|
589
|
-
uint32_t cost_base_row = row - 1;
|
|
590
|
-
uint32_t cost_base_col = rcandidate - 1;
|
|
591
|
-
LDouble cost =
|
|
592
|
-
matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissim(rcandidate, i, xsum, xsumsq);
|
|
593
|
-
|
|
594
|
-
matrix_set_f(state.cost, row, i, cost);
|
|
595
|
-
matrix_set_i(state.splits, row, i, rcandidate);
|
|
596
|
-
|
|
597
|
-
uint32_t jh =
|
|
598
|
-
(i + istep) <= imax
|
|
599
|
-
? matrix_get_i(splits, row, i + istep)
|
|
600
|
-
: vector_get_i(split_candidates, n - 1);
|
|
601
|
-
|
|
602
|
-
uint32_t jmax = jh < i ? jh : i;
|
|
603
|
-
LDouble sjimin = dissim(jmax, i, xsum, xsumsq);
|
|
604
|
-
|
|
605
|
-
for (++r; r < n && vector_get_i(split_candidates, r) <= jmax; r++) {
|
|
606
|
-
uint32_t jabs = vector_get_i(split_candidates, r);
|
|
607
|
-
|
|
608
|
-
if (jabs > i) break;
|
|
609
|
-
if (jabs < matrix_get_i(splits, row - 1, i)) continue;
|
|
610
|
-
|
|
611
|
-
LDouble cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
|
|
612
|
-
LDouble sj = cost_base + dissim(jabs, i, xsum, xsumsq);
|
|
613
|
-
LDouble cost_prev = matrix_get_f(state.cost, row, i);
|
|
614
|
-
|
|
615
|
-
if (sj <= cost_prev) {
|
|
616
|
-
matrix_set_f(state.cost, row, i, sj);
|
|
617
|
-
matrix_set_i(state.splits, row, i, jabs);
|
|
618
|
-
} else if (cost_base + sjimin > cost_prev) {
|
|
619
|
-
break;
|
|
620
|
-
}
|
|
621
|
-
}
|
|
622
|
-
|
|
623
|
-
r--;
|
|
624
|
-
jl = jh;
|
|
625
|
-
}
|
|
626
|
-
}
|
|
627
|
-
|
|
628
|
-
inline void find_min_from_candidates(State state, RowParams rparams, VectorI *split_candidates)
|
|
629
|
-
{
|
|
630
|
-
const uint32_t row = rparams.row;
|
|
631
|
-
const uint32_t imin = rparams.imin;
|
|
632
|
-
const uint32_t imax = rparams.imax;
|
|
633
|
-
const uint32_t istep = rparams.istep;
|
|
634
|
-
MatrixF *const cost = state.cost;
|
|
635
|
-
MatrixI *const splits = state.splits;
|
|
636
|
-
FnDissim *const dissim = state.dissim;
|
|
637
|
-
|
|
638
|
-
uint32_t optimal_split_idx_prev = 0;
|
|
639
|
-
|
|
640
|
-
for (uint32_t i = imin; i <= imax; i += istep)
|
|
641
|
-
{
|
|
642
|
-
const uint32_t optimal_split_idx = optimal_split_idx_prev;
|
|
643
|
-
const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
|
|
644
|
-
const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
|
|
645
|
-
const LDouble added_cost = dissim(optimal_split, i, state.xsum, state.xsumsq);
|
|
646
|
-
|
|
647
|
-
matrix_set_f(cost, row, i, cost_prev + added_cost);
|
|
648
|
-
matrix_set_i(splits, row, i, optimal_split);
|
|
649
|
-
|
|
650
|
-
for (uint32_t r = optimal_split_idx + 1; r < split_candidates->size; r++)
|
|
651
|
-
{
|
|
652
|
-
uint32_t split = vector_get_i(split_candidates, r);
|
|
653
|
-
|
|
654
|
-
if (split < matrix_get_i(splits, row - 1, i)) continue;
|
|
655
|
-
if (split > i) break;
|
|
656
|
-
|
|
657
|
-
LDouble split_cost =
|
|
658
|
-
matrix_get_f(cost, row - 1, split - 1) + dissim(split, i, state.xsum, state.xsumsq);
|
|
659
|
-
|
|
660
|
-
if (split_cost > matrix_get_f(cost, row, i)) continue;
|
|
661
|
-
|
|
662
|
-
matrix_set_f(cost, row, i, split_cost);
|
|
663
|
-
matrix_set_i(splits, row, i, split);
|
|
664
|
-
optimal_split_idx_prev = r;
|
|
665
|
-
}
|
|
666
|
-
}
|
|
667
|
-
}
|
|
668
|
-
|
|
669
|
-
inline VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
|
|
670
|
-
{
|
|
671
|
-
uint32_t imin = rparams.imin;
|
|
672
|
-
uint32_t row = rparams.row;
|
|
673
|
-
uint32_t istep = rparams.istep;
|
|
674
|
-
uint32_t n = ((rparams.imax - imin) / istep) + 1;
|
|
675
|
-
uint32_t m = split_candidates->size;
|
|
676
|
-
|
|
677
|
-
if (n >= m) return split_candidates;
|
|
678
|
-
|
|
679
|
-
uint32_t left = 0;
|
|
680
|
-
uint32_t right = 0;
|
|
681
|
-
VectorI *pruned = vector_dup_i(split_candidates, state.arena);
|
|
682
|
-
FnDissim *const dissim = state.dissim;
|
|
683
|
-
|
|
684
|
-
while (m > n)
|
|
685
|
-
{
|
|
686
|
-
uint32_t i = imin + left * istep;
|
|
687
|
-
uint32_t j = vector_get_i(pruned, right);
|
|
688
|
-
uint32_t jnext = vector_get_i(pruned, right + 1);
|
|
689
|
-
LDouble sl =
|
|
690
|
-
matrix_get_f(state.cost, row - 1, j - 1) + dissim(j, i, state.xsum, state.xsumsq);
|
|
691
|
-
LDouble snext =
|
|
692
|
-
matrix_get_f(state.cost, row - 1, jnext - 1) + dissim(jnext, i, state.xsum, state.xsumsq);
|
|
693
|
-
|
|
694
|
-
if ((sl < snext) && (left < n - 1)) {
|
|
695
|
-
vector_set_i(pruned, left, j);
|
|
696
|
-
left++;
|
|
697
|
-
right++;
|
|
698
|
-
} else if ((sl < snext) && (left == n - 1)) {
|
|
699
|
-
right++;
|
|
700
|
-
m--;
|
|
701
|
-
vector_set_i(pruned, right, j);
|
|
702
|
-
} else {
|
|
703
|
-
if (left > 0) {
|
|
704
|
-
vector_set_i(pruned, right, vector_get_i(pruned, --left));
|
|
705
|
-
} else {
|
|
706
|
-
right++;
|
|
707
|
-
}
|
|
708
|
-
|
|
709
|
-
m--;
|
|
710
|
-
}
|
|
711
|
-
}
|
|
712
|
-
|
|
713
|
-
for (uint32_t i = left; i < m; i++) {
|
|
714
|
-
vector_set_i(pruned, i, vector_get_i(pruned, right++));
|
|
715
|
-
}
|
|
716
|
-
|
|
717
|
-
vector_downsize_i(pruned, m);
|
|
718
|
-
|
|
719
|
-
return pruned;
|
|
720
|
-
}
|
|
721
|
-
|
|
722
|
-
/* L2 aka Euclidean aka Mean dissimilarity criteria */
|
|
723
|
-
inline LDouble dissimilarity_l2(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq) {
|
|
724
|
-
LDouble sji = 0.0;
|
|
725
|
-
|
|
726
|
-
if (j >= i) return sji;
|
|
727
|
-
|
|
728
|
-
if (j > 0) {
|
|
729
|
-
LDouble segment_diff = vector_get_diff_f(xsum, i, j - 1);
|
|
730
|
-
uint32_t segment_size = i - j + 1;
|
|
731
|
-
sji = vector_get_diff_f(xsumsq, i, j - 1) - (segment_diff * segment_diff / segment_size);
|
|
732
|
-
} else {
|
|
733
|
-
LDouble xsumi = vector_get_f(xsum, i);
|
|
734
|
-
sji = vector_get_f(xsumsq, i) - (xsumi * xsumi / (i + 1));
|
|
735
|
-
}
|
|
736
|
-
|
|
737
|
-
return (sji > 0) ? sji : 0.0;
|
|
738
|
-
}
|
|
739
|
-
|
|
740
|
-
/* L1 aka Manhattan aka Median dissimilarity criteria */
|
|
741
|
-
inline LDouble dissimilarity_l1(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict _xsumsq)
|
|
742
|
-
{
|
|
743
|
-
LDouble sji = 0.0;
|
|
744
|
-
|
|
745
|
-
if (j >= i) return sji;
|
|
746
|
-
|
|
747
|
-
if (j > 0) {
|
|
748
|
-
uint32_t median_idx = (i + j) >> 1;
|
|
749
|
-
|
|
750
|
-
if (((i - j + 1) % 2) == 1) {
|
|
751
|
-
sji =
|
|
752
|
-
- vector_get_f(xsum, median_idx - 1)
|
|
753
|
-
+ vector_get_f(xsum, j - 1)
|
|
754
|
-
+ vector_get_f(xsum, i)
|
|
755
|
-
- vector_get_f(xsum, median_idx);
|
|
756
|
-
} else {
|
|
757
|
-
sji =
|
|
758
|
-
- vector_get_f(xsum, median_idx)
|
|
759
|
-
+ vector_get_f(xsum, j - 1)
|
|
760
|
-
+ vector_get_f(xsum, i)
|
|
761
|
-
- vector_get_f(xsum, median_idx);
|
|
762
|
-
}
|
|
763
|
-
} else { // j == 0
|
|
764
|
-
uint32_t median_idx = i >> 1;
|
|
765
|
-
|
|
766
|
-
if (((i + 1) % 2) == 1) {
|
|
767
|
-
sji =
|
|
768
|
-
- vector_get_f(xsum, median_idx - 1)
|
|
769
|
-
+ vector_get_f(xsum, i)
|
|
770
|
-
- vector_get_f(xsum, median_idx);
|
|
771
|
-
} else {
|
|
772
|
-
sji =
|
|
773
|
-
- vector_get_f(xsum, median_idx)
|
|
774
|
-
+ vector_get_f(xsum, i)
|
|
775
|
-
- vector_get_f(xsum, median_idx);
|
|
776
|
-
}
|
|
777
|
-
}
|
|
778
|
-
|
|
779
|
-
return (sji < 0) ? 0.0 : sji;
|
|
780
|
-
}
|
|
781
|
-
|
|
782
463
|
inline VectorF *vector_create_f(Arena *arena, uint32_t size) {
|
|
783
464
|
VectorF *v;
|
|
784
465
|
|
data/lib/ckmeans/clusterer.rb
CHANGED
|
@@ -26,12 +26,12 @@ module Ckmeans
|
|
|
26
26
|
raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
|
|
27
27
|
raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
|
|
28
28
|
|
|
29
|
-
@kmin
|
|
30
|
-
@unique_xcount
|
|
31
|
-
@kmax
|
|
32
|
-
@xsorted_original
|
|
33
|
-
@xsorted
|
|
34
|
-
@
|
|
29
|
+
@kmin = kmin
|
|
30
|
+
@unique_xcount = entries.uniq.size
|
|
31
|
+
@kmax = [@unique_xcount, kmax].min
|
|
32
|
+
@xsorted_original = entries.sort
|
|
33
|
+
@xsorted = @xsorted_original.map(&:to_f)
|
|
34
|
+
@use_stable_estimation = %i[gmm stable].include?(kestimate)
|
|
35
35
|
end
|
|
36
36
|
|
|
37
37
|
def clusters
|
data/lib/ckmeans/version.rb
CHANGED
data/lib/ckmedian/clusterer.rb
CHANGED
|
@@ -28,12 +28,12 @@ module Ckmedian
|
|
|
28
28
|
raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
|
|
29
29
|
raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
|
|
30
30
|
|
|
31
|
-
@kmin
|
|
32
|
-
@unique_xcount
|
|
33
|
-
@kmax
|
|
34
|
-
@xsorted_original
|
|
35
|
-
@xsorted
|
|
36
|
-
@
|
|
31
|
+
@kmin = kmin
|
|
32
|
+
@unique_xcount = entries.uniq.size
|
|
33
|
+
@kmax = [@unique_xcount, kmax].min
|
|
34
|
+
@xsorted_original = entries.sort
|
|
35
|
+
@xsorted = @xsorted_original.map(&:to_f)
|
|
36
|
+
@use_stable_estimation = %i[lmm stable].include?(kestimate)
|
|
37
37
|
end
|
|
38
38
|
|
|
39
39
|
def clusters
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ckmeans
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.1.
|
|
4
|
+
version: 2.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vlad Lebedev
|
|
@@ -28,6 +28,8 @@ files:
|
|
|
28
28
|
- LICENSE
|
|
29
29
|
- README.md
|
|
30
30
|
- Rakefile
|
|
31
|
+
- ext/ckmeans/algorithm.inc
|
|
32
|
+
- ext/ckmeans/dissimilarity.h
|
|
31
33
|
- ext/ckmeans/extconf.rb
|
|
32
34
|
- ext/ckmeans/extensions.c
|
|
33
35
|
- lib/ckmeans.rb
|