ckmeans 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +13 -0
- data/CHANGELOG.md +22 -0
- data/Dockerfile +11 -0
- data/ext/ckmeans/extensions.c +153 -151
- data/lib/ckmeans/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 17dd59ae47e814d5cf0b45665856a52e33e1af22c90722955750004405633a4e
|
|
4
|
+
data.tar.gz: 4278bb18d8a987ac71fd7ea179055ab6d2c15292d772b7d9df1dd8c4adde011b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7e3d19cfbfbebb0b26bf1ffdd7c99998a898ccf123359994e339147735b819f3f16fc73c2ac202a3fbe3c4f1c13c747e7181d01d56770be5404ca6354533b23d
|
|
7
|
+
data.tar.gz: 2be82db12f8d9da2cafb03713440f3083d2ffd7fd7f6917ad8e98d1c864b1d97f99e9a0771afe6aaaff502fee86d81e9221b8d689a388817b060fc7ce1917a87
|
data/.dockerignore
ADDED
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,27 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [1.0.4] - 2025-05-01
|
|
4
|
+
|
|
5
|
+
- Simpler capacity size expression ([#14](https://github.com/vlebedeff/rb-ckmeans/pull/14))
|
|
6
|
+
|
|
7
|
+
## [1.0.3] - 2025-05-01
|
|
8
|
+
|
|
9
|
+
- More frugal memory allocation ([#11](https://github.com/vlebedeff/rb-ckmeans/pull/11))
|
|
10
|
+
- Use `rb_iv_get` for brevity
|
|
11
|
+
- Various optimizations ([#10](https://github.com/vlebedeff/rb-ckmeans/pull/10))
|
|
12
|
+
- Extract `LDouble` type definition
|
|
13
|
+
- Remove `ruby-prof` gem
|
|
14
|
+
- Rename `nvalues` to `size`
|
|
15
|
+
|
|
16
|
+
## [1.0.2] - 2025-04-24
|
|
17
|
+
|
|
18
|
+
- Polish & Housekeeping ([#9](https://github.com/vlebedeff/rb-ckmeans/pull/9))
|
|
19
|
+
- Fix int variable sign ([#8](https://github.com/vlebedeff/rb-ckmeans/pull/8))
|
|
20
|
+
|
|
21
|
+
## [1.0.0] - 2025-04-22
|
|
22
|
+
|
|
23
|
+
- https://github.com/vlebedeff/rb-ckmeans/pull/6
|
|
24
|
+
|
|
3
25
|
## [0.1.2] - 2025-03-31
|
|
4
26
|
|
|
5
27
|
- https://github.com/vlebedeff/rb-ckmeans/pull/3
|
data/Dockerfile
ADDED
data/ext/ckmeans/extensions.c
CHANGED
|
@@ -3,16 +3,18 @@
|
|
|
3
3
|
#include <string.h>
|
|
4
4
|
#include "ruby.h"
|
|
5
5
|
|
|
6
|
+
typedef long double LDouble;
|
|
7
|
+
|
|
6
8
|
typedef struct Arena {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
+
size_t capacity;
|
|
10
|
+
size_t offset;
|
|
9
11
|
uint8_t *buffer;
|
|
10
12
|
} Arena;
|
|
11
13
|
|
|
12
14
|
typedef struct MatrixF {
|
|
13
15
|
uint32_t ncols;
|
|
14
16
|
uint32_t nrows;
|
|
15
|
-
|
|
17
|
+
LDouble *values;
|
|
16
18
|
} MatrixF;
|
|
17
19
|
|
|
18
20
|
typedef struct MatrixI {
|
|
@@ -22,12 +24,12 @@ typedef struct MatrixI {
|
|
|
22
24
|
} MatrixI;
|
|
23
25
|
|
|
24
26
|
typedef struct VectorF {
|
|
25
|
-
uint32_t nvalues;
|
|
26
|
-
|
|
27
|
+
uint32_t size;
|
|
28
|
+
LDouble *values;
|
|
27
29
|
} VectorF;
|
|
28
30
|
|
|
29
31
|
typedef struct VectorI {
|
|
30
|
-
uint32_t nvalues;
|
|
32
|
+
uint32_t size;
|
|
31
33
|
uint32_t *values;
|
|
32
34
|
} VectorI;
|
|
33
35
|
|
|
@@ -52,69 +54,67 @@ typedef struct RowParams {
|
|
|
52
54
|
} RowParams;
|
|
53
55
|
|
|
54
56
|
typedef struct {
|
|
55
|
-
|
|
56
|
-
|
|
57
|
+
LDouble mean;
|
|
58
|
+
LDouble variance;
|
|
57
59
|
} SegmentStats;
|
|
58
60
|
|
|
59
|
-
VALUE
|
|
60
|
-
|
|
61
|
-
Arena
|
|
62
|
-
void
|
|
63
|
-
void
|
|
64
|
-
|
|
65
|
-
MatrixF
|
|
66
|
-
MatrixI
|
|
67
|
-
void
|
|
68
|
-
|
|
69
|
-
void
|
|
70
|
-
void
|
|
71
|
-
uint32_t
|
|
72
|
-
void
|
|
73
|
-
|
|
74
|
-
VectorF
|
|
75
|
-
void
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
void
|
|
79
|
-
VectorI
|
|
80
|
-
VectorI
|
|
81
|
-
void
|
|
82
|
-
uint32_t
|
|
83
|
-
void
|
|
84
|
-
void
|
|
85
|
-
|
|
86
|
-
|
|
61
|
+
VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
|
|
62
|
+
|
|
63
|
+
Arena *arena_create(size_t);
|
|
64
|
+
void *arena_alloc(Arena*, size_t);
|
|
65
|
+
void arena_destroy(Arena*);
|
|
66
|
+
|
|
67
|
+
MatrixF *matrix_create_f(Arena*, uint32_t, uint32_t);
|
|
68
|
+
MatrixI *matrix_create_i(Arena*, uint32_t, uint32_t);
|
|
69
|
+
void matrix_set_f(MatrixF*, uint32_t, uint32_t, LDouble value);
|
|
70
|
+
LDouble matrix_get_f(MatrixF*, uint32_t, uint32_t);
|
|
71
|
+
void matrix_inspect_f(MatrixF*);
|
|
72
|
+
void matrix_set_i(MatrixI*, uint32_t, uint32_t, uint32_t value);
|
|
73
|
+
uint32_t matrix_get_i(MatrixI*, uint32_t, uint32_t);
|
|
74
|
+
void matrix_inspect_i(MatrixI*);
|
|
75
|
+
|
|
76
|
+
VectorF *vector_create_f(Arena*, uint32_t);
|
|
77
|
+
void vector_set_f(VectorF*, uint32_t offset, LDouble value);
|
|
78
|
+
LDouble vector_get_f(VectorF*, uint32_t offset);
|
|
79
|
+
LDouble vector_get_diff_f(VectorF*, uint32_t, uint32_t);
|
|
80
|
+
void vector_inspect_f(VectorF*);
|
|
81
|
+
VectorI *vector_create_i(Arena*, uint32_t);
|
|
82
|
+
VectorI *vector_dup_i(VectorI*, Arena*);
|
|
83
|
+
void vector_set_i(VectorI*, uint32_t offset, uint32_t value);
|
|
84
|
+
uint32_t vector_get_i(VectorI*, uint32_t offset);
|
|
85
|
+
void vector_downsize_i(VectorI*, uint32_t);
|
|
86
|
+
void vector_inspect_i(VectorI*);
|
|
87
|
+
|
|
88
|
+
LDouble dissimilarity(uint32_t, uint32_t, VectorF*, VectorF*);
|
|
87
89
|
void fill_row(State, uint32_t, uint32_t, uint32_t);
|
|
88
90
|
void smawk(State, RowParams, VectorI*);
|
|
89
91
|
void find_min_from_candidates(State, RowParams, VectorI*);
|
|
90
|
-
VectorI
|
|
92
|
+
VectorI *prune_candidates(State, RowParams, VectorI*);
|
|
91
93
|
void fill_even_positions(State, RowParams, VectorI*);
|
|
92
94
|
SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
|
|
93
|
-
VectorI
|
|
95
|
+
VectorI *backtrack_sizes(State, VectorI*, uint32_t);
|
|
94
96
|
uint32_t find_koptimal(State);
|
|
95
97
|
|
|
96
98
|
void Init_extensions(void) {
|
|
97
|
-
VALUE ckmeans_module
|
|
99
|
+
VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
|
|
98
100
|
VALUE clusterer_class = rb_const_get(ckmeans_module, rb_intern("Clusterer"));
|
|
99
101
|
|
|
100
102
|
rb_define_private_method(clusterer_class, "sorted_group_sizes", rb_ckmeans_sorted_group_sizes, 0);
|
|
101
103
|
}
|
|
102
104
|
|
|
103
|
-
# define ARENA_MIN_CAPACITY
|
|
104
|
-
# define ALLOCATION_FACTOR
|
|
105
|
+
# define ARENA_MIN_CAPACITY 100
|
|
106
|
+
# define ALLOCATION_FACTOR 3
|
|
105
107
|
# define PIx2 (M_PI * 2.0)
|
|
106
108
|
|
|
107
|
-
VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
bool apply_deviation = RTEST(rb_apply_bic_deviation);
|
|
117
|
-
Arena *arena = arena_create(sizeof(int) * xcount * kmax * ALLOCATION_FACTOR);
|
|
109
|
+
VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
|
|
110
|
+
{
|
|
111
|
+
uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
|
|
112
|
+
uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
|
|
113
|
+
uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
|
|
114
|
+
bool apply_deviation = RTEST(rb_iv_get(self, "@apply_bic_deviation"));
|
|
115
|
+
VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
|
|
116
|
+
size_t capacity = sizeof(LDouble) * (xcount + 1) * (kmax + 1) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
|
|
117
|
+
Arena *arena = arena_create(capacity);
|
|
118
118
|
|
|
119
119
|
if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
|
|
120
120
|
|
|
@@ -125,7 +125,7 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
|
|
|
125
125
|
VectorF *xsumsq = vector_create_f(arena, xcount);
|
|
126
126
|
|
|
127
127
|
for (uint32_t i = 0; i < xcount; i++) {
|
|
128
|
-
|
|
128
|
+
LDouble xi = NUM2DBL(rb_ary_entry(rb_xsorted, i));
|
|
129
129
|
vector_set_f(xsorted, i, xi);
|
|
130
130
|
}
|
|
131
131
|
|
|
@@ -143,17 +143,17 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
|
|
|
143
143
|
};
|
|
144
144
|
|
|
145
145
|
|
|
146
|
-
|
|
147
|
-
|
|
146
|
+
LDouble shift = vector_get_f(xsorted, xcount / 2);
|
|
147
|
+
LDouble diff_initial = vector_get_f(xsorted, 0) - shift;
|
|
148
148
|
|
|
149
149
|
vector_set_f(xsum, 0, diff_initial);
|
|
150
150
|
vector_set_f(xsumsq, 0, diff_initial * diff_initial);
|
|
151
151
|
|
|
152
152
|
for (uint32_t i = 1; i < xcount; i++) {
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
153
|
+
LDouble xi = vector_get_f(xsorted, i);
|
|
154
|
+
LDouble xsum_prev = vector_get_f(xsum, i - 1);
|
|
155
|
+
LDouble xsumsq_prev = vector_get_f(xsumsq, i - 1);
|
|
156
|
+
LDouble diff = xi - shift;
|
|
157
157
|
|
|
158
158
|
vector_set_f(xsum, i, xsum_prev + diff);
|
|
159
159
|
vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
|
|
@@ -168,7 +168,8 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
|
|
|
168
168
|
|
|
169
169
|
uint32_t koptimal = find_koptimal(state);
|
|
170
170
|
|
|
171
|
-
VectorI *sizes = backtrack_sizes(state, koptimal);
|
|
171
|
+
VectorI *sizes = vector_create_i(arena, koptimal);
|
|
172
|
+
backtrack_sizes(state, sizes, koptimal);
|
|
172
173
|
|
|
173
174
|
/* printf("XSORTED \t"); vector_inspect_f(xsorted); */
|
|
174
175
|
/* printf("K OPTIMAL: %lld\n", koptimal); */
|
|
@@ -176,8 +177,8 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
|
|
|
176
177
|
/* printf("FINAL COST\n"); matrix_inspect_f(cost); */
|
|
177
178
|
/* printf("FINAL SPLITS\n"); matrix_inspect_i(splits); */
|
|
178
179
|
|
|
179
|
-
VALUE response = rb_ary_new2(sizes->nvalues);
|
|
180
|
-
for (uint32_t i = 0; i < sizes->nvalues; i++) {
|
|
180
|
+
VALUE response = rb_ary_new2(sizes->size);
|
|
181
|
+
for (uint32_t i = 0; i < sizes->size; i++) {
|
|
181
182
|
VALUE size = LONG2NUM(vector_get_i(sizes, i));
|
|
182
183
|
rb_ary_store(response, i, size);
|
|
183
184
|
}
|
|
@@ -189,29 +190,30 @@ VALUE rb_ckmeans_sorted_group_sizes(VALUE self) {
|
|
|
189
190
|
|
|
190
191
|
uint32_t find_koptimal(State state)
|
|
191
192
|
{
|
|
192
|
-
uint32_t kmin
|
|
193
|
-
uint32_t kmax
|
|
194
|
-
uint32_t xcount
|
|
195
|
-
uint32_t kopt
|
|
196
|
-
uint32_t xindex_max
|
|
197
|
-
VectorF *xsorted
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
193
|
+
uint32_t kmin = state.kmin;
|
|
194
|
+
uint32_t kmax = state.kmax;
|
|
195
|
+
uint32_t xcount = state.xcount;
|
|
196
|
+
uint32_t kopt = kmin;
|
|
197
|
+
uint32_t xindex_max = state.xcount - 1;
|
|
198
|
+
VectorF *xsorted = state.xsorted;
|
|
199
|
+
LDouble x0 = vector_get_f(xsorted, 0);
|
|
200
|
+
LDouble xn = vector_get_f(xsorted, xindex_max);
|
|
201
|
+
LDouble max_bic = 0.0;
|
|
202
|
+
LDouble xcount_log = log((LDouble) xcount);
|
|
203
|
+
|
|
204
|
+
VectorI *sizes = vector_create_i(state.arena, kmax);
|
|
203
205
|
for (uint32_t k = kmin; k <= kmax; k++) {
|
|
204
206
|
uint32_t index_right, index_left = 0;
|
|
205
|
-
|
|
206
|
-
|
|
207
|
+
LDouble bin_left, bin_right, loglikelihood = 0.0;
|
|
208
|
+
backtrack_sizes(state, sizes, k);
|
|
207
209
|
|
|
208
210
|
for (uint32_t kb = 0; kb < k; kb++) {
|
|
209
|
-
uint32_t npoints
|
|
210
|
-
index_right
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
bin_left
|
|
214
|
-
bin_right
|
|
211
|
+
uint32_t npoints = vector_get_i(sizes, kb);
|
|
212
|
+
index_right = index_left + npoints - 1;
|
|
213
|
+
LDouble xleft = vector_get_f(xsorted, index_left);
|
|
214
|
+
LDouble xright = vector_get_f(xsorted, index_right);
|
|
215
|
+
bin_left = xleft;
|
|
216
|
+
bin_right = xright;
|
|
215
217
|
|
|
216
218
|
if (xleft == xright) {
|
|
217
219
|
bin_left = index_left == 0
|
|
@@ -222,18 +224,18 @@ uint32_t find_koptimal(State state)
|
|
|
222
224
|
: xn;
|
|
223
225
|
}
|
|
224
226
|
|
|
225
|
-
|
|
226
|
-
SegmentStats stats
|
|
227
|
-
|
|
228
|
-
|
|
227
|
+
LDouble bin_width = bin_right - bin_left;
|
|
228
|
+
SegmentStats stats = shifted_data_variance(xsorted, index_left, index_right);
|
|
229
|
+
LDouble mean = stats.mean;
|
|
230
|
+
LDouble variance = stats.variance;
|
|
229
231
|
|
|
230
232
|
if (variance > 0) {
|
|
231
233
|
for (uint32_t i = index_left; i <= index_right; i++) {
|
|
232
|
-
|
|
234
|
+
LDouble xi = vector_get_f(xsorted, i);
|
|
233
235
|
loglikelihood += -(xi - mean) * (xi - mean) / (2.0 * variance);
|
|
234
236
|
}
|
|
235
237
|
loglikelihood += npoints * (
|
|
236
|
-
(log(npoints / (
|
|
238
|
+
(state.apply_deviation ? 0.0 : log(npoints / (LDouble) xcount)) -
|
|
237
239
|
(0.5 * log(PIx2 * variance))
|
|
238
240
|
);
|
|
239
241
|
} else {
|
|
@@ -243,24 +245,23 @@ uint32_t find_koptimal(State state)
|
|
|
243
245
|
index_left = index_right + 1;
|
|
244
246
|
}
|
|
245
247
|
|
|
246
|
-
|
|
248
|
+
LDouble bic = (2.0 * loglikelihood) - (((3 * k) - 1) * xcount_log);
|
|
247
249
|
|
|
248
250
|
if (k == kmin) {
|
|
249
251
|
max_bic = bic;
|
|
250
|
-
kopt
|
|
252
|
+
kopt = kmin;
|
|
251
253
|
} else if (bic > max_bic) {
|
|
252
254
|
max_bic = bic;
|
|
253
|
-
kopt
|
|
255
|
+
kopt = k;
|
|
254
256
|
}
|
|
255
257
|
}
|
|
256
258
|
|
|
257
259
|
return kopt;
|
|
258
260
|
}
|
|
259
261
|
|
|
260
|
-
VectorI *backtrack_sizes(State state, uint32_t k)
|
|
262
|
+
VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
|
|
261
263
|
{
|
|
262
264
|
MatrixI *splits = state.splits;
|
|
263
|
-
VectorI *sizes = vector_create_i(state.arena, k);
|
|
264
265
|
uint32_t xcount = state.xcount;
|
|
265
266
|
uint32_t right = xcount - 1;
|
|
266
267
|
uint32_t left = 0;
|
|
@@ -280,15 +281,15 @@ VectorI *backtrack_sizes(State state, uint32_t k)
|
|
|
280
281
|
SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t right)
|
|
281
282
|
{
|
|
282
283
|
const uint32_t n = right - left + 1;
|
|
283
|
-
|
|
284
|
-
|
|
284
|
+
LDouble sum = 0.0;
|
|
285
|
+
LDouble sumsq = 0.0;
|
|
285
286
|
SegmentStats stats = { .mean = 0.0, .variance = 0.0 };
|
|
286
287
|
|
|
287
288
|
if (right >= left) {
|
|
288
|
-
const
|
|
289
|
+
const LDouble median = vector_get_f(xsorted, (left + right) / 2);
|
|
289
290
|
|
|
290
291
|
for (uint32_t i = left; i <= right; i++) {
|
|
291
|
-
const
|
|
292
|
+
const LDouble sumi = vector_get_f(xsorted, i) - median;
|
|
292
293
|
|
|
293
294
|
sum += sumi;
|
|
294
295
|
sumsq += sumi * sumi;
|
|
@@ -341,7 +342,7 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
|
|
|
341
342
|
uint32_t imin = rparams.imin;
|
|
342
343
|
uint32_t imax = rparams.imax;
|
|
343
344
|
uint32_t istep = rparams.istep;
|
|
344
|
-
uint32_t n = split_candidates->nvalues;
|
|
345
|
+
uint32_t n = split_candidates->size;
|
|
345
346
|
uint32_t istepx2 = istep * 2;
|
|
346
347
|
uint32_t jl = vector_get_i(split_candidates, 0);
|
|
347
348
|
VectorF *xsum = state.xsum;
|
|
@@ -351,22 +352,22 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
|
|
|
351
352
|
for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
|
|
352
353
|
while (vector_get_i(split_candidates, r) < jl) r++;
|
|
353
354
|
|
|
354
|
-
uint32_t rcandidate
|
|
355
|
+
uint32_t rcandidate = vector_get_i(split_candidates, r);
|
|
355
356
|
uint32_t cost_base_row = row - 1;
|
|
356
357
|
uint32_t cost_base_col = rcandidate - 1;
|
|
357
|
-
|
|
358
|
+
LDouble cost =
|
|
358
359
|
matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissimilarity(rcandidate, i, xsum, xsumsq);
|
|
359
360
|
|
|
360
361
|
matrix_set_f(state.cost, row, i, cost);
|
|
361
362
|
matrix_set_i(state.splits, row, i, rcandidate);
|
|
362
363
|
|
|
363
|
-
uint32_t jh
|
|
364
|
+
uint32_t jh =
|
|
364
365
|
(i + istep) <= imax
|
|
365
366
|
? matrix_get_i(splits, row, i + istep)
|
|
366
367
|
: vector_get_i(split_candidates, n - 1);
|
|
367
368
|
|
|
368
|
-
uint32_t jmax
|
|
369
|
-
|
|
369
|
+
uint32_t jmax = jh < i ? jh : i;
|
|
370
|
+
LDouble sjimin = dissimilarity(jmax, i, xsum, xsumsq);
|
|
370
371
|
|
|
371
372
|
for (++r; r < n && vector_get_i(split_candidates, r) <= jmax; r++) {
|
|
372
373
|
uint32_t jabs = vector_get_i(split_candidates, r);
|
|
@@ -374,9 +375,9 @@ void fill_even_positions(State state, RowParams rparams, VectorI *split_candidat
|
|
|
374
375
|
if (jabs > i) break;
|
|
375
376
|
if (jabs < matrix_get_i(splits, row - 1, i)) continue;
|
|
376
377
|
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
378
|
+
LDouble cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
|
|
379
|
+
LDouble sj = cost_base + dissimilarity(jabs, i, xsum, xsumsq);
|
|
380
|
+
LDouble cost_prev = matrix_get_f(state.cost, row, i);
|
|
380
381
|
|
|
381
382
|
if (sj <= cost_prev) {
|
|
382
383
|
matrix_set_f(state.cost, row, i, sj);
|
|
@@ -407,19 +408,19 @@ void find_min_from_candidates(State state, RowParams rparams, VectorI *split_can
|
|
|
407
408
|
const uint32_t optimal_split_idx = optimal_split_idx_prev;
|
|
408
409
|
const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
|
|
409
410
|
const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
|
|
410
|
-
const
|
|
411
|
+
const LDouble added_cost = dissimilarity(optimal_split, i, state.xsum, state.xsumsq);
|
|
411
412
|
|
|
412
413
|
matrix_set_f(cost, row, i, cost_prev + added_cost);
|
|
413
414
|
matrix_set_i(splits, row, i, optimal_split);
|
|
414
415
|
|
|
415
|
-
for (uint32_t r = optimal_split_idx + 1; r < split_candidates->nvalues; r++)
|
|
416
|
+
for (uint32_t r = optimal_split_idx + 1; r < split_candidates->size; r++)
|
|
416
417
|
{
|
|
417
418
|
uint32_t split = vector_get_i(split_candidates, r);
|
|
418
419
|
|
|
419
420
|
if (split < matrix_get_i(splits, row - 1, i)) continue;
|
|
420
421
|
if (split > i) break;
|
|
421
422
|
|
|
422
|
-
|
|
423
|
+
LDouble split_cost =
|
|
423
424
|
matrix_get_f(cost, row - 1, split - 1) + dissimilarity(split, i, state.xsum, state.xsumsq);
|
|
424
425
|
|
|
425
426
|
if (split_cost > matrix_get_f(cost, row, i)) continue;
|
|
@@ -437,7 +438,7 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
|
|
|
437
438
|
uint32_t row = rparams.row;
|
|
438
439
|
uint32_t istep = rparams.istep;
|
|
439
440
|
uint32_t n = ((rparams.imax - imin) / istep) + 1;
|
|
440
|
-
uint32_t m = split_candidates->nvalues;
|
|
441
|
+
uint32_t m = split_candidates->size;
|
|
441
442
|
|
|
442
443
|
if (n >= m) return split_candidates;
|
|
443
444
|
|
|
@@ -447,12 +448,12 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
|
|
|
447
448
|
|
|
448
449
|
while (m > n)
|
|
449
450
|
{
|
|
450
|
-
uint32_t i
|
|
451
|
-
uint32_t j
|
|
452
|
-
uint32_t jnext
|
|
453
|
-
|
|
451
|
+
uint32_t i = imin + left * istep;
|
|
452
|
+
uint32_t j = vector_get_i(pruned, right);
|
|
453
|
+
uint32_t jnext = vector_get_i(pruned, right + 1);
|
|
454
|
+
LDouble sl =
|
|
454
455
|
matrix_get_f(state.cost, row - 1, j - 1) + dissimilarity(j, i, state.xsum, state.xsumsq);
|
|
455
|
-
|
|
456
|
+
LDouble snext =
|
|
456
457
|
matrix_get_f(state.cost, row - 1, jnext - 1) + dissimilarity(jnext, i, state.xsum, state.xsumsq);
|
|
457
458
|
|
|
458
459
|
if ((sl < snext) && (left < n - 1)) {
|
|
@@ -483,86 +484,86 @@ VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candida
|
|
|
483
484
|
return pruned;
|
|
484
485
|
}
|
|
485
486
|
|
|
486
|
-
|
|
487
|
-
|
|
487
|
+
inline LDouble dissimilarity(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq) {
|
|
488
|
+
LDouble sji = 0.0;
|
|
488
489
|
|
|
489
490
|
if (j >= i) return sji;
|
|
490
491
|
|
|
491
492
|
if (j > 0) {
|
|
492
|
-
|
|
493
|
-
uint32_t segment_size
|
|
494
|
-
sji
|
|
493
|
+
LDouble segment_diff = vector_get_diff_f(xsum, i, j - 1);
|
|
494
|
+
uint32_t segment_size = i - j + 1;
|
|
495
|
+
sji = vector_get_diff_f(xsumsq, i, j - 1) - (segment_diff * segment_diff / segment_size);
|
|
495
496
|
} else {
|
|
496
|
-
|
|
497
|
-
sji
|
|
497
|
+
LDouble xsumi = vector_get_f(xsum, i);
|
|
498
|
+
sji = vector_get_f(xsumsq, i) - (xsumi * xsumi / (i + 1));
|
|
498
499
|
}
|
|
499
500
|
|
|
500
501
|
return (sji > 0) ? sji : 0.0;
|
|
501
502
|
}
|
|
502
503
|
|
|
503
|
-
VectorF *vector_create_f(Arena *arena, uint32_t nvalues) {
|
|
504
|
+
inline VectorF *vector_create_f(Arena *arena, uint32_t size) {
|
|
504
505
|
VectorF *v;
|
|
505
506
|
|
|
506
|
-
v
|
|
507
|
-
v->values
|
|
508
|
-
v->
|
|
507
|
+
v = arena_alloc(arena, sizeof(*v));
|
|
508
|
+
v->values = arena_alloc(arena, sizeof(*(v->values)) * size);
|
|
509
|
+
v->size = size;
|
|
509
510
|
|
|
510
511
|
return v;
|
|
511
512
|
}
|
|
512
513
|
|
|
513
|
-
VectorI *vector_create_i(Arena *arena, uint32_t nvalues) {
|
|
514
|
+
inline VectorI *vector_create_i(Arena *arena, uint32_t size) {
|
|
514
515
|
VectorI *v;
|
|
515
516
|
|
|
516
|
-
v
|
|
517
|
-
v->values
|
|
518
|
-
v->
|
|
517
|
+
v = arena_alloc(arena, sizeof(*v));
|
|
518
|
+
v->values = arena_alloc(arena, sizeof(*(v->values)) * size);
|
|
519
|
+
v->size = size;
|
|
519
520
|
|
|
520
521
|
return v;
|
|
521
522
|
}
|
|
522
523
|
|
|
523
|
-
VectorI *vector_dup_i(VectorI *v, Arena *arena)
|
|
524
|
+
inline VectorI *vector_dup_i(VectorI *v, Arena *arena)
|
|
524
525
|
{
|
|
525
|
-
VectorI *vdup = vector_create_i(arena, v->nvalues);
|
|
526
|
+
VectorI *vdup = vector_create_i(arena, v->size);
|
|
526
527
|
|
|
527
|
-
memcpy(vdup->values, v->values, sizeof(*(v->values)) * v->nvalues);
|
|
528
|
+
memcpy(vdup->values, v->values, sizeof(*(v->values)) * v->size);
|
|
528
529
|
|
|
529
530
|
return vdup;
|
|
530
531
|
}
|
|
531
532
|
|
|
532
|
-
void vector_set_f(VectorF *v, uint32_t offset, long double value) {
|
|
533
|
+
inline void vector_set_f(VectorF *v, uint32_t offset, LDouble value) {
|
|
533
534
|
*(v->values + offset) = value;
|
|
534
535
|
}
|
|
535
536
|
|
|
536
|
-
void vector_set_i(VectorI *v, uint32_t offset, uint32_t value) {
|
|
537
|
+
inline void vector_set_i(VectorI *v, uint32_t offset, uint32_t value) {
|
|
537
538
|
*(v->values + offset) = value;
|
|
538
539
|
}
|
|
539
540
|
|
|
540
|
-
uint32_t vector_get_i(VectorI *v, uint32_t offset) {
|
|
541
|
+
inline uint32_t vector_get_i(VectorI *v, uint32_t offset) {
|
|
541
542
|
return *(v->values + offset);
|
|
542
543
|
}
|
|
543
544
|
|
|
544
|
-
void vector_downsize_i(VectorI *v, uint32_t new_size) {
|
|
545
|
-
v->nvalues = new_size;
|
|
545
|
+
inline void vector_downsize_i(VectorI *v, uint32_t new_size) {
|
|
546
|
+
v->size = new_size;
|
|
546
547
|
}
|
|
547
548
|
|
|
548
549
|
void vector_inspect_i(VectorI *v) {
|
|
549
|
-
for (uint32_t i = 0; i < v->nvalues - 1; i++)
|
|
550
|
+
for (uint32_t i = 0; i < v->size - 1; i++)
|
|
550
551
|
printf("%u, ", vector_get_i(v, i));
|
|
551
|
-
printf("%u\n", vector_get_i(v, v->nvalues - 1));
|
|
552
|
+
printf("%u\n", vector_get_i(v, v->size - 1));
|
|
552
553
|
}
|
|
553
554
|
|
|
554
|
-
|
|
555
|
+
inline LDouble vector_get_f(VectorF *v, uint32_t offset) {
|
|
555
556
|
return *(v->values + offset);
|
|
556
557
|
}
|
|
557
558
|
|
|
558
|
-
|
|
559
|
+
inline LDouble vector_get_diff_f(VectorF *v, uint32_t i, uint32_t j) {
|
|
559
560
|
return *(v->values + i) - *(v->values + j);
|
|
560
561
|
}
|
|
561
562
|
|
|
562
563
|
void vector_inspect_f(VectorF *v) {
|
|
563
|
-
for (uint32_t i = 0; i < v->nvalues - 1; i++)
|
|
564
|
+
for (uint32_t i = 0; i < v->size - 1; i++)
|
|
564
565
|
printf("%Lf, ", vector_get_f(v, i));
|
|
565
|
-
printf("%Lf\n", vector_get_f(v, v->nvalues - 1));
|
|
566
|
+
printf("%Lf\n", vector_get_f(v, v->size - 1));
|
|
566
567
|
}
|
|
567
568
|
|
|
568
569
|
MatrixF *matrix_create_f(Arena *arena, uint32_t nrows, uint32_t ncols) {
|
|
@@ -587,12 +588,12 @@ MatrixI *matrix_create_i(Arena *arena, uint32_t nrows, uint32_t ncols) {
|
|
|
587
588
|
return m;
|
|
588
589
|
}
|
|
589
590
|
|
|
590
|
-
void matrix_set_f(MatrixF *m, uint32_t i, uint32_t j, long double value) {
|
|
591
|
+
inline void matrix_set_f(MatrixF *m, uint32_t i, uint32_t j, LDouble value) {
|
|
591
592
|
uint32_t offset = i * m->ncols + j;
|
|
592
593
|
*(m->values + offset) = value;
|
|
593
594
|
}
|
|
594
595
|
|
|
595
|
-
|
|
596
|
+
inline LDouble matrix_get_f(MatrixF *m, uint32_t i, uint32_t j) {
|
|
596
597
|
uint32_t offset = i * m->ncols + j;
|
|
597
598
|
return *(m->values + offset);
|
|
598
599
|
}
|
|
@@ -600,7 +601,7 @@ long double matrix_get_f(MatrixF *m, uint32_t i, uint32_t j) {
|
|
|
600
601
|
void matrix_inspect_f(MatrixF *m) {
|
|
601
602
|
for (uint32_t i = 0; i < m->nrows; i++) {
|
|
602
603
|
for (uint32_t j = 0; j < m->ncols - 1; j++) {
|
|
603
|
-
|
|
604
|
+
LDouble value = matrix_get_f(m, i, j);
|
|
604
605
|
|
|
605
606
|
printf("%Lf, ", value);
|
|
606
607
|
}
|
|
@@ -616,17 +617,17 @@ void matrix_inspect_i(MatrixI *m) {
|
|
|
616
617
|
}
|
|
617
618
|
}
|
|
618
619
|
|
|
619
|
-
void matrix_set_i(MatrixI *m, uint32_t i, uint32_t j, uint32_t value) {
|
|
620
|
+
inline void matrix_set_i(MatrixI *m, uint32_t i, uint32_t j, uint32_t value) {
|
|
620
621
|
uint32_t offset = i * m->ncols + j;
|
|
621
622
|
*(m->values + offset) = value;
|
|
622
623
|
}
|
|
623
624
|
|
|
624
|
-
uint32_t matrix_get_i(MatrixI *m, uint32_t i, uint32_t j) {
|
|
625
|
+
inline uint32_t matrix_get_i(MatrixI *m, uint32_t i, uint32_t j) {
|
|
625
626
|
uint32_t offset = i * m->ncols + j;
|
|
626
627
|
return *(m->values + offset);
|
|
627
628
|
}
|
|
628
629
|
|
|
629
|
-
Arena *arena_create(uint32_t capacity) {
|
|
630
|
+
Arena *arena_create(size_t capacity) {
|
|
630
631
|
if (capacity < ARENA_MIN_CAPACITY) {
|
|
631
632
|
capacity = ARENA_MIN_CAPACITY;
|
|
632
633
|
}
|
|
@@ -654,7 +655,7 @@ Arena *arena_create(uint32_t capacity) {
|
|
|
654
655
|
return arena;
|
|
655
656
|
}
|
|
656
657
|
|
|
657
|
-
void *arena_alloc(Arena *arena, uint32_t size) {
|
|
658
|
+
void *arena_alloc(Arena *arena, size_t size) {
|
|
658
659
|
size = (size + 7) & ~7;
|
|
659
660
|
|
|
660
661
|
if (arena->offset + size > arena->capacity) {
|
|
@@ -669,7 +670,8 @@ void *arena_alloc(Arena *arena, uint32_t size) {
|
|
|
669
670
|
}
|
|
670
671
|
|
|
671
672
|
void arena_destroy(Arena *arena) {
|
|
672
|
-
/*
|
|
673
|
+
/* double leftover = ((double) arena->capacity - arena->offset) / arena->capacity * 100; */
|
|
674
|
+
/* printf("[Arena Destroy] Capacity: %zu, offset: %zu, left: %2.2f%%\n", arena->capacity, arena->offset, leftover); */
|
|
673
675
|
free(arena->buffer);
|
|
674
676
|
free(arena);
|
|
675
677
|
}
|
data/lib/ckmeans/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ckmeans
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.2
|
|
4
|
+
version: 1.0.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vlad Lebedev
|
|
8
8
|
bindir: exe
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2025-
|
|
10
|
+
date: 2025-05-01 00:00:00.000000000 Z
|
|
11
11
|
dependencies: []
|
|
12
12
|
description: Repeatable clustering of unidimensional data
|
|
13
13
|
email:
|
|
@@ -17,11 +17,13 @@ extensions:
|
|
|
17
17
|
- ext/ckmeans/extconf.rb
|
|
18
18
|
extra_rdoc_files: []
|
|
19
19
|
files:
|
|
20
|
+
- ".dockerignore"
|
|
20
21
|
- ".rspec"
|
|
21
22
|
- ".rubocop.yml"
|
|
22
23
|
- ".rubocop_todo.yml"
|
|
23
24
|
- ".ruby-version"
|
|
24
25
|
- CHANGELOG.md
|
|
26
|
+
- Dockerfile
|
|
25
27
|
- LICENSE
|
|
26
28
|
- README.md
|
|
27
29
|
- Rakefile
|