ckmeans 2.0.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +33 -14
- data/ext/ckmeans/algorithm.inc +281 -0
- data/ext/ckmeans/dissimilarity.h +65 -0
- data/ext/ckmeans/extensions.c +139 -339
- data/lib/ckmeans/clusterer.rb +25 -7
- data/lib/ckmeans/version.rb +1 -1
- data/lib/ckmedian/clusterer.rb +28 -7
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 231337d4b73a838b8a9326936c4ae4c003db103108b6622c22a09a6a44bf4e31
|
|
4
|
+
data.tar.gz: c0d97cd2fd4b1fd6693305900e9e329e218993fb735b9d24cd1c7281eda14e8a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 04be90532ac9498a184025d849b14a0d34fb61d00352c818a6e7495ae753666689ecef0d542fc6e95f118e5c83fcc29560baee2b339762158d47811a1d4164cd
|
|
7
|
+
data.tar.gz: '019cc8ff7f3fa2648faa03997fc88489718e944d13760b1e28eed3c4e48d48da56fd87ced9e5d4499e9c6cfb962cba6c02889dd0d3a53fd88ccb0a0396e00551'
|
data/README.md
CHANGED
|
@@ -18,33 +18,52 @@ gem install ckmeans
|
|
|
18
18
|
|
|
19
19
|
## Usage
|
|
20
20
|
|
|
21
|
-
###
|
|
21
|
+
### Basic Clustering
|
|
22
22
|
|
|
23
23
|
```rb
|
|
24
|
-
# Fixed cluster count
|
|
25
|
-
Ckmeans::Clusterer(data,
|
|
26
|
-
Ckmedian::Clusterer(data,
|
|
24
|
+
# Fixed cluster count (K known in advance)
|
|
25
|
+
Ckmeans::Clusterer.new(data, 3).clusters
|
|
26
|
+
Ckmedian::Clusterer.new(data, 3).clusters
|
|
27
|
+
|
|
28
|
+
# Automatic K selection (tries K from kmin to kmax, picks optimal)
|
|
29
|
+
Ckmeans::Clusterer.new(data, 1, 10).clusters
|
|
30
|
+
Ckmedian::Clusterer.new(data, 1, 10).clusters
|
|
27
31
|
```
|
|
28
32
|
|
|
29
|
-
###
|
|
33
|
+
### Choosing Between Ckmeans and Ckmedian
|
|
34
|
+
|
|
35
|
+
- **Ckmeans** - Minimizes squared distances (L2). Good for normally distributed data.
|
|
36
|
+
- **Ckmedian** - Minimizes absolute distances (L1). More robust to outliers and data bursts.
|
|
30
37
|
|
|
31
38
|
```rb
|
|
32
|
-
|
|
33
|
-
|
|
39
|
+
# For clean numerical data
|
|
40
|
+
temperatures = [20.1, 20.2, 25.5, 25.6, 30.1, 30.2]
|
|
41
|
+
Ckmeans::Clusterer.new(temperatures, 1, 5).clusters
|
|
42
|
+
# => [[20.1, 20.2], [25.5, 25.6], [30.1, 30.2]]
|
|
43
|
+
|
|
44
|
+
# For data with outliers (e.g., photo timestamps with bursts)
|
|
45
|
+
timestamps = photos.map(&:taken_at).map(&:to_i)
|
|
46
|
+
Ckmedian::Clusterer.new(timestamps, 1, 20).clusters
|
|
34
47
|
```
|
|
35
48
|
|
|
36
|
-
###
|
|
49
|
+
### Stable Estimation (Recommended for Edge Cases)
|
|
37
50
|
|
|
38
|
-
For
|
|
39
|
-
For relatively small sets or sets with many duplicates use Gaussian Mixture Model (GMM)-based estimation.
|
|
40
|
-
It works slower but is more resilient for various data patterns like big numbers of duplicates or clusters with different
|
|
41
|
-
numbers of elements.
|
|
51
|
+
By default, both algorithms use a fast heuristic for estimating K. For datasets with many duplicates, tight clusters, or outliers, use `:stable` for more robust estimation:
|
|
42
52
|
|
|
43
53
|
```rb
|
|
44
|
-
|
|
45
|
-
|
|
54
|
+
# Stable estimation (uses statistical mixture models)
|
|
55
|
+
Ckmeans::Clusterer.new(data, 1, 10, :stable).clusters
|
|
56
|
+
Ckmedian::Clusterer.new(data, 1, 10, :stable).clusters
|
|
46
57
|
```
|
|
47
58
|
|
|
59
|
+
**When to use `:stable`:**
|
|
60
|
+
- Small to medium datasets (< 1000 points)
|
|
61
|
+
- Many duplicate values
|
|
62
|
+
- Clusters with very different sizes
|
|
63
|
+
- Photo/event timeline clustering (bursts and gaps)
|
|
64
|
+
|
|
65
|
+
**Expert users:** `:stable` is an alias for `:gmm` (Gaussian Mixture Model) in Ckmeans and `:lmm` (Laplace Mixture Model) in Ckmedian.
|
|
66
|
+
|
|
48
67
|
## License
|
|
49
68
|
|
|
50
69
|
The gem is available as open source under the terms of the [LGPL v3 License](https://opensource.org/license/lgpl-3-0).
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
/* SMAWK algorithm implementation template
|
|
2
|
+
*
|
|
3
|
+
* This file is designed to be included multiple times with different DISSIM_SUFFIX and DISSIM macro definitions
|
|
4
|
+
* to generate L1 and L2 specific versions of the clustering algorithm.
|
|
5
|
+
*
|
|
6
|
+
* Before including this file, define:
|
|
7
|
+
* DISSIM_SUFFIX - suffix for function names (e.g., l1, l2)
|
|
8
|
+
* DISSIM(j, i, xsum, xsumsq) - macro that calls the dissimilarity function
|
|
9
|
+
*
|
|
10
|
+
* Example:
|
|
11
|
+
* #define DISSIM_SUFFIX l2
|
|
12
|
+
* #define DISSIM(j, i, xsum, xsumsq) dissimilarity_l2(j, i, xsum, xsumsq)
|
|
13
|
+
* #include "algorithm.inc"
|
|
14
|
+
* #undef DISSIM
|
|
15
|
+
* #undef DISSIM_SUFFIX
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
#ifndef DISSIM_SUFFIX
|
|
19
|
+
#error "DISSIM_SUFFIX must be defined before including algorithm.inc"
|
|
20
|
+
#endif
|
|
21
|
+
|
|
22
|
+
#ifndef DISSIM
|
|
23
|
+
#error "DISSIM must be defined before including algorithm.inc"
|
|
24
|
+
#endif
|
|
25
|
+
|
|
26
|
+
/* Helper macros for token pasting */
|
|
27
|
+
#define CONCAT_IMPL(a, b) a##_##b
|
|
28
|
+
#define CONCAT(a, b) CONCAT_IMPL(a, b)
|
|
29
|
+
#define FUNC_NAME(name) CONCAT(name, DISSIM_SUFFIX)
|
|
30
|
+
|
|
31
|
+
/* Fills columns imin, imin + 2 * istep, ... (up to imax) of row `rparams.row`
 * in state.cost / state.splits.
 *
 * For each such column the candidate list is scanned from a lower bound
 * (`lower`, carried over from the previous column) up to an upper bound taken
 * from the split already stored at column `col + istep` of the same row, or
 * from the last candidate when that column lies beyond imax. The cheapest
 * candidate seen is written back to the matrices.
 *
 * NOTE(review): this presumably relies on the in-between columns having been
 * filled by the recursive smawk call that precedes it -- confirm with caller.
 */
static inline void FUNC_NAME(fill_even_positions)(State state, RowParams rparams, VectorI *split_candidates)
{
    const uint32_t row = rparams.row;
    const uint32_t col_last = rparams.imax;
    const uint32_t step = rparams.istep;
    const uint32_t stride = step * 2;
    const uint32_t ncand = split_candidates->size;
    VectorF *const sums = state.xsum;
    VectorF *const sums_sq = state.xsumsq;
    MatrixF *const costs = state.cost;
    MatrixI *const splits = state.splits;

    uint32_t lower = vector_get_i(split_candidates, 0);
    uint32_t cursor = 0;

    for (uint32_t col = rparams.imin; col <= col_last; col += stride) {
        /* Skip candidates below the lower bound inherited from the previous column. */
        for (; vector_get_i(split_candidates, cursor) < lower; cursor++) {
        }

        /* Seed the cell with the first admissible candidate. */
        const uint32_t seed = vector_get_i(split_candidates, cursor);
        const LDouble seed_cost =
            matrix_get_f(costs, row - 1, seed - 1) + DISSIM(seed, col, sums, sums_sq);

        matrix_set_f(costs, row, col, seed_cost);
        matrix_set_i(splits, row, col, seed);

        /* Upper bound: split of the next already-filled column, else the last candidate. */
        uint32_t upper;
        if ((col + step) <= col_last) {
            upper = matrix_get_i(splits, row, col + step);
        } else {
            upper = vector_get_i(split_candidates, ncand - 1);
        }

        /* A split can never lie to the right of the column it belongs to. */
        const uint32_t scan_limit = (upper < col) ? upper : col;
        const LDouble tail_cost = DISSIM(scan_limit, col, sums, sums_sq);

        for (++cursor; cursor < ncand && vector_get_i(split_candidates, cursor) <= scan_limit; cursor++) {
            const uint32_t cand = vector_get_i(split_candidates, cursor);

            if (cand > col) break;
            if (cand < matrix_get_i(splits, row - 1, col)) continue;

            const LDouble base = matrix_get_f(costs, row - 1, cand - 1);
            const LDouble total = base + DISSIM(cand, col, sums, sums_sq);
            const LDouble best = matrix_get_f(costs, row, col);

            if (total <= best) {
                matrix_set_f(costs, row, col, total);
                matrix_set_i(splits, row, col, cand);
            } else if (base + tail_cost > best) {
                /* Even with the cheapest possible tail this candidate loses: stop scanning. */
                break;
            }
        }

        /* Step back so the next column re-examines the candidate the scan stopped on. */
        cursor--;
        lower = upper;
    }
}
|
|
85
|
+
|
|
86
|
+
/* Direct minimum search: for every column i in imin, imin + istep, ..., imax
 * of row `rparams.row`, picks the candidate split with the lowest total cost
 *     cost(row - 1, split - 1) + DISSIM(split, i)
 * and stores the cost and the split in state.cost / state.splits.
 *
 * The scan for each column starts at the candidate index that won for the
 * previous column (index 0 for the first column). Candidates smaller than
 * the split stored at (row - 1, i) are skipped, and the scan stops at the
 * first candidate greater than i.
 *
 * The seed cost is kept as LDouble: storing it in an integer would truncate
 * the fractional part and could make the seed look cheaper than it is,
 * wrongly rejecting better candidates in the comparison below.
 */
static inline void FUNC_NAME(find_min_from_candidates)(State state, RowParams rparams, VectorI *split_candidates)
{
    const uint32_t row = rparams.row;
    const uint32_t imin = rparams.imin;
    const uint32_t imax = rparams.imax;
    const uint32_t istep = rparams.istep;
    MatrixF *const cost = state.cost;
    MatrixI *const splits = state.splits;

    uint32_t optimal_split_idx_prev = 0;

    for (uint32_t i = imin; i <= imax; i += istep)
    {
        const uint32_t optimal_split_idx = optimal_split_idx_prev;
        const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
        const LDouble cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
        const LDouble added_cost = DISSIM(optimal_split, i, state.xsum, state.xsumsq);

        /* Seed the cell with the candidate carried over from the previous column. */
        matrix_set_f(cost, row, i, cost_prev + added_cost);
        matrix_set_i(splits, row, i, optimal_split);

        for (uint32_t r = optimal_split_idx + 1; r < split_candidates->size; r++)
        {
            uint32_t split = vector_get_i(split_candidates, r);

            if (split < matrix_get_i(splits, row - 1, i)) continue;
            if (split > i) break;

            LDouble split_cost = matrix_get_f(cost, row - 1, split - 1) + DISSIM(split, i, state.xsum, state.xsumsq);

            /* Ties go to the later candidate (strict '>' rejects only worse ones). */
            if (split_cost > matrix_get_f(cost, row, i)) continue;

            matrix_set_f(cost, row, i, split_cost);
            matrix_set_i(splits, row, i, split);
            optimal_split_idx_prev = r;
        }
    }
}
|
|
124
|
+
|
|
125
|
+
/* Reduces the candidate list so that it holds no more entries than there are
 * columns in the range described by rparams (imin..imax in steps of istep).
 *
 * Returns split_candidates itself, untouched, when it is already short
 * enough. Otherwise works on a copy obtained from vector_dup_i (allocated
 * from state.arena), discarding one candidate per iteration until the count
 * equals the number of columns, and returns that copy downsized to the
 * surviving count.
 *
 * Adjacent candidates are compared by cost(row - 1, j - 1) + DISSIM(j, col)
 * at the column associated with the current number of kept entries.
 */
static inline VectorI *FUNC_NAME(prune_candidates)(State state, RowParams rparams, VectorI *split_candidates)
{
    const uint32_t first_col = rparams.imin;
    const uint32_t row = rparams.row;
    const uint32_t step = rparams.istep;
    const uint32_t ncols = ((rparams.imax - first_col) / step) + 1;
    uint32_t remaining = split_candidates->size;

    if (ncols >= remaining) return split_candidates;

    VectorI *pruned = vector_dup_i(split_candidates, state.arena);
    uint32_t kept = 0; /* number of entries accepted so far (write position) */
    uint32_t scan = 0; /* read position in the not-yet-decided part */

    while (remaining > ncols) {
        const uint32_t col = first_col + kept * step;
        const uint32_t cand = vector_get_i(pruned, scan);
        const uint32_t cand_next = vector_get_i(pruned, scan + 1);
        const LDouble cost_here =
            matrix_get_f(state.cost, row - 1, cand - 1) + DISSIM(cand, col, state.xsum, state.xsumsq);
        const LDouble cost_next =
            matrix_get_f(state.cost, row - 1, cand_next - 1) + DISSIM(cand_next, col, state.xsum, state.xsumsq);

        if (cost_here < cost_next) {
            if (kept < ncols - 1) {
                /* Current candidate wins and there is room: accept it and advance. */
                vector_set_i(pruned, kept, cand);
                kept++;
                scan++;
                continue;
            }
            if (kept == ncols - 1) {
                /* Last slot: drop the successor by overwriting it with the winner. */
                scan++;
                remaining--;
                vector_set_i(pruned, scan, cand);
                continue;
            }
        }

        /* Current candidate loses: drop it, re-exposing the last accepted one if any. */
        if (kept > 0) {
            kept--;
            vector_set_i(pruned, scan, vector_get_i(pruned, kept));
        } else {
            scan++;
        }
        remaining--;
    }

    /* Compact the undecided tail right behind the accepted entries. */
    for (uint32_t dst = kept; dst < remaining; dst++, scan++) {
        vector_set_i(pruned, dst, vector_get_i(pruned, scan));
    }

    vector_downsize_i(pruned, remaining);

    return pruned;
}
|
|
174
|
+
|
|
175
|
+
/* Recursive driver for one row of the cost/splits matrices.
 *
 * Base case: when the column range holds a single column (imax == imin) the
 * minimum is searched directly among the candidates.
 * Otherwise the candidate list is pruned, the routine recurses on every
 * second column (starting at imin + istep, with the step doubled), and the
 * remaining columns are filled in afterwards from the full candidate list.
 */
void FUNC_NAME(smawk)(State state, RowParams rparams, VectorI *split_candidates)
{
    const uint32_t first = rparams.imin;
    const uint32_t last = rparams.imax;
    const uint32_t step = rparams.istep;

    if (last - first == 0) {
        FUNC_NAME(find_min_from_candidates)(state, rparams, split_candidates);
        return;
    }

    VectorI *odd_candidates = FUNC_NAME(prune_candidates)(state, rparams, split_candidates);

    const uint32_t odd_step = step * 2;
    const uint32_t odd_first = first + step;
    /* Largest column <= last reachable from odd_first in odd_step increments. */
    const uint32_t odd_last = odd_first + ((last - odd_first) / odd_step * odd_step);
    RowParams odd_params = {
        .row = rparams.row,
        .imin = odd_first,
        .imax = odd_last,
        .istep = odd_step
    };

    FUNC_NAME(smawk)(state, odd_params, odd_candidates);
    FUNC_NAME(fill_even_positions)(state, rparams, split_candidates);
}
|
|
194
|
+
|
|
195
|
+
/* Fills row `q` of the cost/splits matrices for columns imin..imax.
 *
 * Builds the initial candidate list q, q + 1, ..., imax (allocated from
 * state.arena) and hands it to the smawk routine with a column step of 1.
 */
void FUNC_NAME(fill_row)(State state, uint32_t q, uint32_t imin, uint32_t imax)
{
    const uint32_t count = imax - q + 1;
    VectorI *candidates = vector_create_i(state.arena, count);

    uint32_t value = q;
    for (uint32_t slot = 0; slot < count; slot++, value++) {
        vector_set_i(candidates, slot, value);
    }

    RowParams row_params = { .row = q, .imin = imin, .imax = imax, .istep = 1 };

    FUNC_NAME(smawk)(state, row_params, candidates);
}
|
|
205
|
+
|
|
206
|
+
/* Ruby-facing entry point: computes the cluster sizes for the receiver.
 *
 * Reads @xcount, @kmin, @kmax and @xsorted from `self`, builds the cost and
 * splits matrices row by row, asks `find_koptimal` for the number of
 * clusters, backtracks the sizes and returns them as a Ruby Array of
 * Integers.
 *
 * Parameters:
 *   self          - object carrying the instance variables listed above
 *   find_koptimal - strategy that picks the cluster count from the filled State
 *
 * Raises:
 *   ArgumentError - when @xcount or @kmax is zero (the computation below
 *                   indexes element 0 and evaluates kmax - 1 on unsigned values)
 *   NoMemoryError - when the arena cannot be allocated
 *
 * NOTE(review): NUM2DBL raises on non-numeric entries after the arena has
 * been created, which would skip arena_destroy -- presumably the Ruby side
 * validates @xsorted beforehand; confirm.
 */
VALUE FUNC_NAME(rb_sorted_group_sizes)(VALUE self, FnFindKOptimal *find_koptimal)
{
    uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
    uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
    uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
    VALUE rb_xsorted = rb_iv_get(self, "@xsorted");

    /* Reject degenerate input before anything is allocated: xsorted[0] is read
     * unconditionally and `kmax - 1` would wrap around for kmax == 0. */
    if (xcount == 0) rb_raise(rb_eArgError, "xcount must be greater than zero");
    if (kmax == 0) rb_raise(rb_eArgError, "kmax must be greater than zero");

    size_t capacity = sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
    Arena *arena = arena_create(capacity);

    if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");

    MatrixF *cost = matrix_create_f(arena, kmax, xcount);
    MatrixI *splits = matrix_create_i(arena, kmax, xcount);
    VectorF *xsorted = vector_create_f(arena, xcount);
    VectorF *xsum = vector_create_f(arena, xcount);
    VectorF *xsumsq = vector_create_f(arena, xcount);

    for (uint32_t i = 0; i < xcount; i++) {
        LDouble xi = NUM2DBL(rb_ary_entry(rb_xsorted, i));
        vector_set_f(xsorted, i, xi);
    }

    State state = {
        .arena = arena,
        .xcount = xcount,
        .kmin = kmin,
        .kmax = kmax,
        .xsorted = xsorted,
        .cost = cost,
        .splits = splits,
        .xsum = xsum,
        .xsumsq = xsumsq
    };

    /* Prefix sums are accumulated relative to the middle element of the sorted data. */
    LDouble shift = vector_get_f(xsorted, xcount / 2);
    LDouble diff_initial = vector_get_f(xsorted, 0) - shift;

    vector_set_f(xsum, 0, diff_initial);
    vector_set_f(xsumsq, 0, diff_initial * diff_initial);

    /* A single point has zero dissimilarity; set the first cell explicitly
     * instead of relying on the allocator's initial contents. */
    matrix_set_f(cost, 0, 0, 0.0);
    matrix_set_i(splits, 0, 0, 0);

    for (uint32_t i = 1; i < xcount; i++) {
        LDouble xi = vector_get_f(xsorted, i);
        LDouble xsum_prev = vector_get_f(xsum, i - 1);
        LDouble xsumsq_prev = vector_get_f(xsumsq, i - 1);
        LDouble diff = xi - shift;

        vector_set_f(xsum, i, xsum_prev + diff);
        vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
        /* Row 0: one cluster covering elements 0..i. */
        matrix_set_f(cost, 0, i, DISSIM(0, i, xsum, xsumsq));
        matrix_set_i(splits, 0, i, 0);
    }

    for (uint32_t q = 1; q <= kmax - 1; q++) {
        /* The last row only needs its final column; earlier rows start at column q. */
        uint32_t imin = (q < kmax - 1) ? ((q > 1) ? q : 1) : xcount - 1;
        FUNC_NAME(fill_row)(state, q, imin, xcount - 1);
    }

    uint32_t koptimal = find_koptimal(state);

    VectorI *sizes = vector_create_i(arena, koptimal);
    backtrack_sizes(state, sizes, koptimal);

    /* Copy the result into a Ruby Array before the arena (and `sizes`) is released. */
    VALUE response = rb_ary_new2(sizes->size);
    for (uint32_t i = 0; i < sizes->size; i++) {
        VALUE size = UINT2NUM(vector_get_i(sizes, i));
        rb_ary_store(response, i, size);
    }

    arena_destroy(arena);

    return response;
}
|
|
278
|
+
|
|
279
|
+
#undef CONCAT_IMPL
|
|
280
|
+
#undef CONCAT
|
|
281
|
+
#undef FUNC_NAME
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
#ifndef DISSIMILARITY_H
|
|
2
|
+
#define DISSIMILARITY_H
|
|
3
|
+
|
|
4
|
+
/* L2 aka Euclidean aka Mean dissimilarity criteria */
|
|
5
|
+
/* L2 (squared-deviation) dissimilarity of the element range j..i, derived
 * from the running sums in `xsum` and the running sums of squares in `xsumsq`:
 *     sum_of_squares(j..i) - sum(j..i)^2 / count
 *
 * Returns 0.0 when j >= i, and never returns a value below 0.0 (results that
 * are not strictly positive are reported as 0.0).
 */
static inline LDouble dissimilarity_l2(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict xsumsq)
{
    if (j >= i) return 0.0;

    LDouble sji;

    if (j == 0) {
        /* Range starts at the first element: the running sums are the range sums. */
        const LDouble total = vector_get_f(xsum, i);
        sji = vector_get_f(xsumsq, i) - (total * total / (i + 1));
    } else {
        const LDouble range_sum = vector_get_diff_f(xsum, i, j - 1);
        const uint32_t count = i - j + 1;
        sji = vector_get_diff_f(xsumsq, i, j - 1) - (range_sum * range_sum / count);
    }

    if (sji > 0) return sji;

    return 0.0;
}
|
|
22
|
+
|
|
23
|
+
/* L1 aka Manhattan aka Median dissimilarity criteria */
|
|
24
|
+
/* L1 (absolute-deviation) dissimilarity of the element range j..i, derived
 * from the running sums in `xsum` only: the sum of the part above the middle
 * index minus the sum of the part below it. For an odd element count the
 * middle element itself is left out of both parts.
 *
 * The fourth parameter is unused; it exists so that both dissimilarity
 * functions share one signature.
 *
 * Returns 0.0 when j >= i, and clamps negative results to 0.0.
 */
static inline LDouble dissimilarity_l1(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict _xsumsq)
{
    (void) _xsumsq;

    if (j >= i) return 0.0;

    const uint32_t count = i - j + 1;
    const uint32_t median_idx = (i + j) >> 1;
    /* Last index of the lower part: excludes the middle element when count is odd. */
    const uint32_t lower_end = ((count % 2) == 1) ? median_idx - 1 : median_idx;

    LDouble sji;

    if (j > 0) {
        sji =
            - vector_get_f(xsum, lower_end)
            + vector_get_f(xsum, j - 1)
            + vector_get_f(xsum, i)
            - vector_get_f(xsum, median_idx);
    } else {
        /* Range starts at the first element: there is no prefix to add back. */
        sji =
            - vector_get_f(xsum, lower_end)
            + vector_get_f(xsum, i)
            - vector_get_f(xsum, median_idx);
    }

    if (sji < 0) return 0.0;

    return sji;
}
|
|
64
|
+
|
|
65
|
+
#endif /* DISSIMILARITY_H */
|
data/ext/ckmeans/extensions.c
CHANGED
|
@@ -33,8 +33,6 @@ typedef struct VectorI {
|
|
|
33
33
|
uint32_t *values;
|
|
34
34
|
} VectorI;
|
|
35
35
|
|
|
36
|
-
typedef LDouble (FnDissim)(uint32_t, uint32_t, VectorF*, VectorF*);
|
|
37
|
-
|
|
38
36
|
typedef struct State {
|
|
39
37
|
uint32_t xcount;
|
|
40
38
|
uint32_t kmin;
|
|
@@ -45,7 +43,6 @@ typedef struct State {
|
|
|
45
43
|
MatrixI *splits;
|
|
46
44
|
VectorF *xsum;
|
|
47
45
|
VectorF *xsumsq;
|
|
48
|
-
FnDissim *dissim;
|
|
49
46
|
} State;
|
|
50
47
|
|
|
51
48
|
typedef struct RowParams {
|
|
@@ -55,6 +52,8 @@ typedef struct RowParams {
|
|
|
55
52
|
uint32_t istep;
|
|
56
53
|
} RowParams;
|
|
57
54
|
|
|
55
|
+
typedef uint32_t (FnFindKOptimal)(State);
|
|
56
|
+
|
|
58
57
|
typedef struct {
|
|
59
58
|
LDouble mean;
|
|
60
59
|
LDouble variance;
|
|
@@ -62,7 +61,6 @@ typedef struct {
|
|
|
62
61
|
|
|
63
62
|
VALUE rb_ckmeans_sorted_group_sizes(VALUE self);
|
|
64
63
|
VALUE rb_ckmedian_sorted_group_sizes(VALUE self);
|
|
65
|
-
VALUE rb_sorted_group_sizes(VALUE self, FnDissim*);
|
|
66
64
|
|
|
67
65
|
Arena *arena_create(size_t);
|
|
68
66
|
void *arena_alloc(Arena*, size_t);
|
|
@@ -89,17 +87,11 @@ uint32_t vector_get_i(VectorI*, uint32_t offset);
|
|
|
89
87
|
void vector_downsize_i(VectorI*, uint32_t);
|
|
90
88
|
void vector_inspect_i(VectorI*);
|
|
91
89
|
|
|
92
|
-
LDouble dissimilarity_l2(uint32_t, uint32_t, VectorF*, VectorF*);
|
|
93
|
-
LDouble dissimilarity_l1(uint32_t, uint32_t, VectorF*, VectorF*);
|
|
94
|
-
void fill_row(State, uint32_t, uint32_t, uint32_t);
|
|
95
|
-
void smawk(State, RowParams, VectorI*);
|
|
96
|
-
void find_min_from_candidates(State, RowParams, VectorI*);
|
|
97
|
-
VectorI *prune_candidates(State, RowParams, VectorI*);
|
|
98
|
-
void fill_even_positions(State, RowParams, VectorI*);
|
|
99
90
|
SegmentStats shifted_data_variance(VectorF*, uint32_t, uint32_t);
|
|
100
91
|
VectorI *backtrack_sizes(State, VectorI*, uint32_t);
|
|
101
92
|
uint32_t find_koptimal_fast(State);
|
|
102
93
|
uint32_t find_koptimal_gmm(State);
|
|
94
|
+
uint32_t find_koptimal_lmm(State);
|
|
103
95
|
|
|
104
96
|
void Init_extensions(void) {
|
|
105
97
|
VALUE ckmeans_module = rb_const_get(rb_cObject, rb_intern("Ckmeans"));
|
|
@@ -115,96 +107,34 @@ void Init_extensions(void) {
|
|
|
115
107
|
# define ALLOCATION_FACTOR 3
|
|
116
108
|
# define PIx2 (M_PI * 2.0)
|
|
117
109
|
|
|
110
|
+
#include "dissimilarity.h"
|
|
111
|
+
|
|
112
|
+
/* L2-specific versions of all hot-path functions */
|
|
113
|
+
#define DISSIM_SUFFIX l2
|
|
114
|
+
#define DISSIM(j, i, xsum, xsumsq) dissimilarity_l2(j, i, xsum, xsumsq)
|
|
115
|
+
#include "algorithm.inc"
|
|
116
|
+
#undef DISSIM
|
|
117
|
+
#undef DISSIM_SUFFIX
|
|
118
|
+
|
|
119
|
+
/* L1-specific versions of all hot-path functions */
|
|
120
|
+
#define DISSIM_SUFFIX l1
|
|
121
|
+
#define DISSIM(j, i, xsum, xsumsq) dissimilarity_l1(j, i, xsum, xsumsq)
|
|
122
|
+
#include "algorithm.inc"
|
|
123
|
+
#undef DISSIM
|
|
124
|
+
#undef DISSIM_SUFFIX
|
|
125
|
+
|
|
118
126
|
VALUE rb_ckmeans_sorted_group_sizes(VALUE self)
|
|
119
127
|
{
|
|
120
|
-
|
|
128
|
+
bool use_stable = RTEST(rb_iv_get(self, "@use_stable_estimation"));
|
|
129
|
+
FnFindKOptimal *find_k = use_stable ? find_koptimal_gmm : find_koptimal_fast;
|
|
130
|
+
return rb_sorted_group_sizes_l2(self, find_k);
|
|
121
131
|
}
|
|
122
132
|
|
|
123
133
|
VALUE rb_ckmedian_sorted_group_sizes(VALUE self)
|
|
124
134
|
{
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
VALUE rb_sorted_group_sizes(VALUE self, FnDissim *criteria)
|
|
129
|
-
{
|
|
130
|
-
uint32_t xcount = NUM2UINT(rb_iv_get(self, "@xcount"));
|
|
131
|
-
uint32_t kmin = NUM2UINT(rb_iv_get(self, "@kmin"));
|
|
132
|
-
uint32_t kmax = NUM2UINT(rb_iv_get(self, "@kmax"));
|
|
133
|
-
bool use_gmm = RTEST(rb_iv_get(self, "@use_gmm"));
|
|
134
|
-
VALUE rb_xsorted = rb_iv_get(self, "@xsorted");
|
|
135
|
-
size_t capacity = sizeof(LDouble) * (xcount + 2) * (kmax + 2) * ALLOCATION_FACTOR + ARENA_MIN_CAPACITY;
|
|
136
|
-
Arena *arena = arena_create(capacity);
|
|
137
|
-
|
|
138
|
-
if (arena == NULL) rb_raise(rb_eNoMemError, "Arena Memory Allocation Failed");
|
|
139
|
-
|
|
140
|
-
MatrixF *cost = matrix_create_f(arena, kmax, xcount);
|
|
141
|
-
MatrixI *splits = matrix_create_i(arena, kmax, xcount);
|
|
142
|
-
VectorF *xsorted = vector_create_f(arena, xcount);
|
|
143
|
-
VectorF *xsum = vector_create_f(arena, xcount);
|
|
144
|
-
VectorF *xsumsq = vector_create_f(arena, xcount);
|
|
145
|
-
|
|
146
|
-
for (uint32_t i = 0; i < xcount; i++) {
|
|
147
|
-
LDouble xi = NUM2DBL(rb_ary_entry(rb_xsorted, i));
|
|
148
|
-
vector_set_f(xsorted, i, xi);
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
State state = {
|
|
152
|
-
.arena = arena,
|
|
153
|
-
.xcount = xcount,
|
|
154
|
-
.kmin = kmin,
|
|
155
|
-
.kmax = kmax,
|
|
156
|
-
.xsorted = xsorted,
|
|
157
|
-
.cost = cost,
|
|
158
|
-
.splits = splits,
|
|
159
|
-
.xsum = xsum,
|
|
160
|
-
.xsumsq = xsumsq,
|
|
161
|
-
.dissim = criteria
|
|
162
|
-
};
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
LDouble shift = vector_get_f(xsorted, xcount / 2);
|
|
166
|
-
LDouble diff_initial = vector_get_f(xsorted, 0) - shift;
|
|
167
|
-
|
|
168
|
-
vector_set_f(xsum, 0, diff_initial);
|
|
169
|
-
vector_set_f(xsumsq, 0, diff_initial * diff_initial);
|
|
170
|
-
|
|
171
|
-
for (uint32_t i = 1; i < xcount; i++) {
|
|
172
|
-
LDouble xi = vector_get_f(xsorted, i);
|
|
173
|
-
LDouble xsum_prev = vector_get_f(xsum, i - 1);
|
|
174
|
-
LDouble xsumsq_prev = vector_get_f(xsumsq, i - 1);
|
|
175
|
-
LDouble diff = xi - shift;
|
|
176
|
-
|
|
177
|
-
vector_set_f(xsum, i, xsum_prev + diff);
|
|
178
|
-
vector_set_f(xsumsq, i, xsumsq_prev + diff * diff);
|
|
179
|
-
matrix_set_f(cost, 0, i, criteria(0, i, xsum, xsumsq));
|
|
180
|
-
matrix_set_i(splits, 0, i, 0);
|
|
181
|
-
}
|
|
182
|
-
|
|
183
|
-
for (uint32_t q = 1; q <= kmax - 1; q++) {
|
|
184
|
-
uint32_t imin = (q < kmax - 1) ? ((q > 1) ? q : 1) : xcount - 1;
|
|
185
|
-
fill_row(state, q, imin, xcount - 1);
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
uint32_t koptimal = use_gmm ? find_koptimal_gmm(state) : find_koptimal_fast(state);
|
|
189
|
-
|
|
190
|
-
VectorI *sizes = vector_create_i(arena, koptimal);
|
|
191
|
-
backtrack_sizes(state, sizes, koptimal);
|
|
192
|
-
|
|
193
|
-
/* printf("XSORTED \t"); vector_inspect_f(xsorted); */
|
|
194
|
-
/* printf("K OPTIMAL: %lld\n", koptimal); */
|
|
195
|
-
/* printf("SIZES \t"); vector_inspect_i(sizes); */
|
|
196
|
-
/* printf("FINAL COST\n"); matrix_inspect_f(cost); */
|
|
197
|
-
/* printf("FINAL SPLITS\n"); matrix_inspect_i(splits); */
|
|
198
|
-
|
|
199
|
-
VALUE response = rb_ary_new2(sizes->size);
|
|
200
|
-
for (uint32_t i = 0; i < sizes->size; i++) {
|
|
201
|
-
VALUE size = LONG2NUM(vector_get_i(sizes, i));
|
|
202
|
-
rb_ary_store(response, i, size);
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
arena_destroy(arena);
|
|
206
|
-
|
|
207
|
-
return response;
|
|
135
|
+
bool use_stable = RTEST(rb_iv_get(self, "@use_stable_estimation"));
|
|
136
|
+
FnFindKOptimal *find_k = use_stable ? find_koptimal_lmm : find_koptimal_fast;
|
|
137
|
+
return rb_sorted_group_sizes_l1(self, find_k);
|
|
208
138
|
}
|
|
209
139
|
|
|
210
140
|
uint32_t find_koptimal_fast(State state)
|
|
@@ -372,292 +302,162 @@ uint32_t find_koptimal_gmm(State state)
|
|
|
372
302
|
return kopt;
|
|
373
303
|
}
|
|
374
304
|
|
|
375
|
-
|
|
305
|
+
uint32_t find_koptimal_lmm(State state)
|
|
376
306
|
{
|
|
377
|
-
|
|
307
|
+
uint32_t kmin = state.kmin;
|
|
308
|
+
uint32_t kmax = state.kmax;
|
|
378
309
|
uint32_t xcount = state.xcount;
|
|
379
|
-
uint32_t right = xcount - 1;
|
|
380
|
-
uint32_t left = 0;
|
|
381
310
|
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
left = matrix_get_i(splits, i, right);
|
|
385
|
-
vector_set_i(sizes, i, right - left + 1);
|
|
311
|
+
if (kmin > kmax || xcount < 2) {
|
|
312
|
+
return (kmin < kmax) ? kmin : kmax;
|
|
386
313
|
}
|
|
387
|
-
// Special case outside of the loop removing the need for conditionals
|
|
388
|
-
left = matrix_get_i(splits, 0, right);
|
|
389
|
-
vector_set_i(sizes, 0, right - left + 1);
|
|
390
314
|
|
|
391
|
-
|
|
392
|
-
|
|
315
|
+
Arena *arena = state.arena;
|
|
316
|
+
VectorF *xsorted = state.xsorted;
|
|
317
|
+
uint32_t kopt = kmin;
|
|
318
|
+
LDouble max_bic = 0.0;
|
|
319
|
+
LDouble log_xcount = log((LDouble) xcount);
|
|
320
|
+
VectorF *lambda = vector_create_f(arena, kmax);
|
|
321
|
+
VectorF *mu = vector_create_f(arena, kmax); /* median */
|
|
322
|
+
VectorF *scale = vector_create_f(arena, kmax); /* MAD (mean absolute deviation) */
|
|
323
|
+
VectorF *coeff = vector_create_f(arena, kmax);
|
|
324
|
+
VectorI *sizes = vector_create_i(arena, kmax);
|
|
393
325
|
|
|
394
|
-
|
|
395
|
-
{
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
LDouble sumsq = 0.0;
|
|
399
|
-
SegmentStats stats = { .mean = 0.0, .variance = 0.0 };
|
|
326
|
+
for (uint32_t kouter = kmin; kouter <= kmax; ++kouter)
|
|
327
|
+
{
|
|
328
|
+
uint32_t ileft = 0;
|
|
329
|
+
uint32_t iright;
|
|
400
330
|
|
|
401
|
-
|
|
402
|
-
const LDouble median = vector_get_f(xsorted, (left + right) / 2);
|
|
331
|
+
backtrack_sizes(state, sizes, kouter);
|
|
403
332
|
|
|
404
|
-
for (uint32_t
|
|
405
|
-
|
|
333
|
+
for (uint32_t k = 0; k < kouter; ++k)
|
|
334
|
+
{
|
|
335
|
+
uint32_t size = vector_get_i(sizes, k);
|
|
336
|
+
vector_set_f(lambda, k, size / (LDouble) xcount);
|
|
337
|
+
iright = ileft + size - 1;
|
|
406
338
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
339
|
+
uint32_t median_idx = (ileft + iright) / 2;
|
|
340
|
+
LDouble median;
|
|
341
|
+
if ((size % 2) == 1) {
|
|
342
|
+
median = vector_get_f(xsorted, median_idx);
|
|
343
|
+
} else {
|
|
344
|
+
median = (vector_get_f(xsorted, median_idx) + vector_get_f(xsorted, median_idx + 1)) / 2.0;
|
|
345
|
+
}
|
|
346
|
+
vector_set_f(mu, k, median);
|
|
410
347
|
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
348
|
+
LDouble mad = 0.0;
|
|
349
|
+
for (uint32_t i = ileft; i <= iright; ++i) {
|
|
350
|
+
LDouble xi = vector_get_f(xsorted, i);
|
|
351
|
+
mad += fabs(xi - median);
|
|
352
|
+
}
|
|
353
|
+
mad = mad / size;
|
|
354
|
+
vector_set_f(scale, k, mad);
|
|
416
355
|
|
|
417
|
-
|
|
418
|
-
|
|
356
|
+
/* Handle edge case: MAD = 0 (all points are the same) or size = 1 */
|
|
357
|
+
if (mad == 0 || size == 1) {
|
|
358
|
+
LDouble dmin;
|
|
419
359
|
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
VectorI *split_candidates = vector_create_i(state.arena, size);
|
|
424
|
-
for (uint32_t i = 0; i < size; i++) {
|
|
425
|
-
vector_set_i(split_candidates, i, q + i);
|
|
426
|
-
}
|
|
427
|
-
RowParams rparams = { .row = q, .imin = imin, .imax = imax, .istep = 1 };
|
|
428
|
-
smawk(state, rparams, split_candidates);
|
|
429
|
-
}
|
|
360
|
+
if (ileft > 0 && iright < xcount - 1) {
|
|
361
|
+
LDouble left_diff = vector_get_diff_f(xsorted, ileft, ileft - 1);
|
|
362
|
+
LDouble right_diff = vector_get_diff_f(xsorted, iright + 1, iright);
|
|
430
363
|
|
|
431
|
-
|
|
432
|
-
{
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
if ((imax - imin) <= (0 * istep)) {
|
|
438
|
-
find_min_from_candidates(state, rparams, split_candidates);
|
|
439
|
-
} else {
|
|
440
|
-
VectorI *odd_candidates = prune_candidates(state, rparams, split_candidates);
|
|
441
|
-
/* printf("PRUNED\t"); vector_inspect_i(odd_candidates); */
|
|
442
|
-
uint32_t istepx2 = istep * 2;
|
|
443
|
-
uint32_t imin_odd = imin + istep;
|
|
444
|
-
uint32_t imax_odd = imin_odd + ((imax - imin_odd) / istepx2 * istepx2);
|
|
445
|
-
RowParams rparams_odd = { .row = rparams.row, .imin = imin_odd, .imax = imax_odd, .istep = istepx2 };
|
|
446
|
-
|
|
447
|
-
smawk(state, rparams_odd, odd_candidates);
|
|
448
|
-
fill_even_positions(state, rparams, split_candidates);
|
|
449
|
-
}
|
|
450
|
-
}
|
|
364
|
+
dmin = (left_diff < right_diff) ? left_diff : right_diff;
|
|
365
|
+
} else if (ileft > 0) {
|
|
366
|
+
dmin = vector_get_diff_f(xsorted, ileft, ileft - 1);
|
|
367
|
+
} else {
|
|
368
|
+
dmin = vector_get_diff_f(xsorted, iright + 1, iright);
|
|
369
|
+
}
|
|
451
370
|
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
uint32_t row = rparams.row;
|
|
455
|
-
uint32_t imin = rparams.imin;
|
|
456
|
-
uint32_t imax = rparams.imax;
|
|
457
|
-
uint32_t istep = rparams.istep;
|
|
458
|
-
uint32_t n = split_candidates->size;
|
|
459
|
-
uint32_t istepx2 = istep * 2;
|
|
460
|
-
uint32_t jl = vector_get_i(split_candidates, 0);
|
|
461
|
-
VectorF *const xsum = state.xsum;
|
|
462
|
-
VectorF *const xsumsq = state.xsumsq;
|
|
463
|
-
MatrixI *const splits = state.splits;
|
|
464
|
-
FnDissim *const dissim = state.dissim;
|
|
465
|
-
|
|
466
|
-
for (uint32_t i = imin, r = 0; i <= imax; i += istepx2) {
|
|
467
|
-
while (vector_get_i(split_candidates, r) < jl) r++;
|
|
468
|
-
|
|
469
|
-
uint32_t rcandidate = vector_get_i(split_candidates, r);
|
|
470
|
-
uint32_t cost_base_row = row - 1;
|
|
471
|
-
uint32_t cost_base_col = rcandidate - 1;
|
|
472
|
-
LDouble cost =
|
|
473
|
-
matrix_get_f(state.cost, cost_base_row, cost_base_col) + dissim(rcandidate, i, xsum, xsumsq);
|
|
474
|
-
|
|
475
|
-
matrix_set_f(state.cost, row, i, cost);
|
|
476
|
-
matrix_set_i(state.splits, row, i, rcandidate);
|
|
477
|
-
|
|
478
|
-
uint32_t jh =
|
|
479
|
-
(i + istep) <= imax
|
|
480
|
-
? matrix_get_i(splits, row, i + istep)
|
|
481
|
-
: vector_get_i(split_candidates, n - 1);
|
|
482
|
-
|
|
483
|
-
uint32_t jmax = jh < i ? jh : i;
|
|
484
|
-
LDouble sjimin = dissim(jmax, i, xsum, xsumsq);
|
|
485
|
-
|
|
486
|
-
for (++r; r < n && vector_get_i(split_candidates, r) <= jmax; r++) {
|
|
487
|
-
uint32_t jabs = vector_get_i(split_candidates, r);
|
|
488
|
-
|
|
489
|
-
if (jabs > i) break;
|
|
490
|
-
if (jabs < matrix_get_i(splits, row - 1, i)) continue;
|
|
491
|
-
|
|
492
|
-
LDouble cost_base = matrix_get_f(state.cost, row - 1, jabs - 1);
|
|
493
|
-
LDouble sj = cost_base + dissim(jabs, i, xsum, xsumsq);
|
|
494
|
-
LDouble cost_prev = matrix_get_f(state.cost, row, i);
|
|
495
|
-
|
|
496
|
-
if (sj <= cost_prev) {
|
|
497
|
-
matrix_set_f(state.cost, row, i, sj);
|
|
498
|
-
matrix_set_i(state.splits, row, i, jabs);
|
|
499
|
-
} else if (cost_base + sjimin > cost_prev) {
|
|
500
|
-
break;
|
|
371
|
+
if (mad == 0) vector_set_f(scale, k, dmin / 6.0);
|
|
372
|
+
if (size == 1) vector_set_f(scale, k, dmin);
|
|
501
373
|
}
|
|
502
|
-
}
|
|
503
374
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
{
|
|
511
|
-
const uint32_t row = rparams.row;
|
|
512
|
-
const uint32_t imin = rparams.imin;
|
|
513
|
-
const uint32_t imax = rparams.imax;
|
|
514
|
-
const uint32_t istep = rparams.istep;
|
|
515
|
-
MatrixF *const cost = state.cost;
|
|
516
|
-
MatrixI *const splits = state.splits;
|
|
517
|
-
FnDissim *const dissim = state.dissim;
|
|
518
|
-
|
|
519
|
-
uint32_t optimal_split_idx_prev = 0;
|
|
520
|
-
|
|
521
|
-
for (uint32_t i = imin; i <= imax; i += istep)
|
|
522
|
-
{
|
|
523
|
-
const uint32_t optimal_split_idx = optimal_split_idx_prev;
|
|
524
|
-
const uint32_t optimal_split = vector_get_i(split_candidates, optimal_split_idx);
|
|
525
|
-
const uint32_t cost_prev = matrix_get_f(cost, row - 1, optimal_split - 1);
|
|
526
|
-
const LDouble added_cost = dissim(optimal_split, i, state.xsum, state.xsumsq);
|
|
375
|
+
/* Laplace coefficient: lambda_k / (2 * b_k) */
|
|
376
|
+
LDouble lambda_k = vector_get_f(lambda, k);
|
|
377
|
+
LDouble scale_k = vector_get_f(scale, k);
|
|
378
|
+
vector_set_f(coeff, k, lambda_k / (2.0 * scale_k));
|
|
379
|
+
ileft = iright + 1;
|
|
380
|
+
}
|
|
527
381
|
|
|
528
|
-
|
|
529
|
-
matrix_set_i(splits, row, i, optimal_split);
|
|
382
|
+
LDouble loglikelihood = 0.0;
|
|
530
383
|
|
|
531
|
-
for (uint32_t
|
|
384
|
+
for (uint32_t i = 0; i < xcount; ++i)
|
|
532
385
|
{
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
if (split < matrix_get_i(splits, row - 1, i)) continue;
|
|
536
|
-
if (split > i) break;
|
|
537
|
-
|
|
538
|
-
LDouble split_cost =
|
|
539
|
-
matrix_get_f(cost, row - 1, split - 1) + dissim(split, i, state.xsum, state.xsumsq);
|
|
540
|
-
|
|
541
|
-
if (split_cost > matrix_get_f(cost, row, i)) continue;
|
|
386
|
+
LDouble L = 0.0;
|
|
387
|
+
LDouble xi = vector_get_f(xsorted, i);
|
|
542
388
|
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
389
|
+
for (uint32_t k = 0; k < kouter; ++k)
|
|
390
|
+
{
|
|
391
|
+
LDouble coeff_k = vector_get_f(coeff, k);
|
|
392
|
+
LDouble mu_k = vector_get_f(mu, k);
|
|
393
|
+
LDouble scale_k = vector_get_f(scale, k);
|
|
394
|
+
LDouble x_mu_abs = fabs(xi - mu_k);
|
|
395
|
+
/* Laplace PDF: (1/(2b)) * exp(-|x-μ|/b) */
|
|
396
|
+
L += coeff_k * exp(-x_mu_abs / scale_k);
|
|
397
|
+
}
|
|
398
|
+
loglikelihood += log(L);
|
|
546
399
|
}
|
|
547
|
-
}
|
|
548
|
-
}
|
|
549
|
-
|
|
550
|
-
inline VectorI *prune_candidates(State state, RowParams rparams, VectorI *split_candidates)
|
|
551
|
-
{
|
|
552
|
-
uint32_t imin = rparams.imin;
|
|
553
|
-
uint32_t row = rparams.row;
|
|
554
|
-
uint32_t istep = rparams.istep;
|
|
555
|
-
uint32_t n = ((rparams.imax - imin) / istep) + 1;
|
|
556
|
-
uint32_t m = split_candidates->size;
|
|
557
400
|
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
uint32_t right = 0;
|
|
562
|
-
VectorI *pruned = vector_dup_i(split_candidates, state.arena);
|
|
563
|
-
FnDissim *const dissim = state.dissim;
|
|
401
|
+
/* BIC = 2*logL - (3k-1)*log(n) */
|
|
402
|
+
/* Parameters: k-1 mixing proportions + k medians + k scales = 3k-1 */
|
|
403
|
+
LDouble bic = 2 * loglikelihood - (3 * kouter - 1) * log_xcount;
|
|
564
404
|
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
uint32_t j = vector_get_i(pruned, right);
|
|
569
|
-
uint32_t jnext = vector_get_i(pruned, right + 1);
|
|
570
|
-
LDouble sl =
|
|
571
|
-
matrix_get_f(state.cost, row - 1, j - 1) + dissim(j, i, state.xsum, state.xsumsq);
|
|
572
|
-
LDouble snext =
|
|
573
|
-
matrix_get_f(state.cost, row - 1, jnext - 1) + dissim(jnext, i, state.xsum, state.xsumsq);
|
|
574
|
-
|
|
575
|
-
if ((sl < snext) && (left < n - 1)) {
|
|
576
|
-
vector_set_i(pruned, left, j);
|
|
577
|
-
left++;
|
|
578
|
-
right++;
|
|
579
|
-
} else if ((sl < snext) && (left == n - 1)) {
|
|
580
|
-
right++;
|
|
581
|
-
m--;
|
|
582
|
-
vector_set_i(pruned, right, j);
|
|
405
|
+
if (kouter == kmin) {
|
|
406
|
+
max_bic = bic;
|
|
407
|
+
kopt = kmin;
|
|
583
408
|
} else {
|
|
584
|
-
if (
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
right++;
|
|
409
|
+
if (bic > max_bic) {
|
|
410
|
+
max_bic = bic;
|
|
411
|
+
kopt = kouter;
|
|
588
412
|
}
|
|
589
|
-
|
|
590
|
-
m--;
|
|
591
413
|
}
|
|
592
414
|
}
|
|
593
|
-
|
|
594
|
-
for (uint32_t i = left; i < m; i++) {
|
|
595
|
-
vector_set_i(pruned, i, vector_get_i(pruned, right++));
|
|
596
|
-
}
|
|
597
|
-
|
|
598
|
-
vector_downsize_i(pruned, m);
|
|
599
|
-
|
|
600
|
-
return pruned;
|
|
415
|
+
return kopt;
|
|
601
416
|
}
|
|
602
417
|
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
418
|
+
VectorI *backtrack_sizes(State state, VectorI *sizes, uint32_t k)
|
|
419
|
+
{
|
|
420
|
+
MatrixI *splits = state.splits;
|
|
421
|
+
uint32_t xcount = state.xcount;
|
|
422
|
+
uint32_t right = xcount - 1;
|
|
423
|
+
uint32_t left = 0;
|
|
608
424
|
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
} else {
|
|
614
|
-
LDouble xsumi = vector_get_f(xsum, i);
|
|
615
|
-
sji = vector_get_f(xsumsq, i) - (xsumi * xsumi / (i + 1));
|
|
425
|
+
/* Common case works with `i` remaining unsigned and unconditional assignment of the next `left` and `right` */
|
|
426
|
+
for (uint32_t i = k - 1; i > 0; i--, right = left - 1) {
|
|
427
|
+
left = matrix_get_i(splits, i, right);
|
|
428
|
+
vector_set_i(sizes, i, right - left + 1);
|
|
616
429
|
}
|
|
430
|
+
/* Special case outside of the loop removing the need for conditionals */
|
|
431
|
+
left = matrix_get_i(splits, 0, right);
|
|
432
|
+
vector_set_i(sizes, 0, right - left + 1);
|
|
617
433
|
|
|
618
|
-
return
|
|
434
|
+
return sizes;
|
|
619
435
|
}
|
|
620
436
|
|
|
621
|
-
|
|
622
|
-
inline LDouble dissimilarity_l1(uint32_t j, uint32_t i, VectorF *restrict xsum, VectorF *restrict _xsumsq)
|
|
437
|
+
SegmentStats shifted_data_variance(VectorF *xsorted, uint32_t left, uint32_t right)
|
|
623
438
|
{
|
|
624
|
-
|
|
439
|
+
const uint32_t n = right - left + 1;
|
|
440
|
+
LDouble sum = 0.0;
|
|
441
|
+
LDouble sumsq = 0.0;
|
|
442
|
+
SegmentStats stats = { .mean = 0.0, .variance = 0.0 };
|
|
625
443
|
|
|
626
|
-
if (
|
|
444
|
+
if (right >= left) {
|
|
445
|
+
const LDouble median = vector_get_f(xsorted, (left + right) / 2);
|
|
627
446
|
|
|
628
|
-
|
|
629
|
-
|
|
447
|
+
for (uint32_t i = left; i <= right; i++) {
|
|
448
|
+
const LDouble sumi = vector_get_f(xsorted, i) - median;
|
|
630
449
|
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
- vector_get_f(xsum, median_idx - 1)
|
|
634
|
-
+ vector_get_f(xsum, j - 1)
|
|
635
|
-
+ vector_get_f(xsum, i)
|
|
636
|
-
- vector_get_f(xsum, median_idx);
|
|
637
|
-
} else {
|
|
638
|
-
sji =
|
|
639
|
-
- vector_get_f(xsum, median_idx)
|
|
640
|
-
+ vector_get_f(xsum, j - 1)
|
|
641
|
-
+ vector_get_f(xsum, i)
|
|
642
|
-
- vector_get_f(xsum, median_idx);
|
|
450
|
+
sum += sumi;
|
|
451
|
+
sumsq += sumi * sumi;
|
|
643
452
|
}
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
sji =
|
|
649
|
-
- vector_get_f(xsum, median_idx - 1)
|
|
650
|
-
+ vector_get_f(xsum, i)
|
|
651
|
-
- vector_get_f(xsum, median_idx);
|
|
652
|
-
} else {
|
|
653
|
-
sji =
|
|
654
|
-
- vector_get_f(xsum, median_idx)
|
|
655
|
-
+ vector_get_f(xsum, i)
|
|
656
|
-
- vector_get_f(xsum, median_idx);
|
|
453
|
+
|
|
454
|
+
stats.mean = (sum / n) + median;
|
|
455
|
+
if (n > 1) {
|
|
456
|
+
stats.variance = (sumsq - (sum * sum / n)) / (n - 1);
|
|
657
457
|
}
|
|
658
458
|
}
|
|
659
459
|
|
|
660
|
-
return
|
|
460
|
+
return stats;
|
|
661
461
|
}
|
|
662
462
|
|
|
663
463
|
inline VectorF *vector_create_f(Arena *arena, uint32_t size) {
|
data/lib/ckmeans/clusterer.rb
CHANGED
|
@@ -1,19 +1,37 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Ckmeans
|
|
4
|
-
|
|
4
|
+
# Optimal k-means clustering for univariate (1D) data using dynamic programming.
|
|
5
|
+
# Minimizes within-cluster sum of squared distances (L2 norm).
|
|
6
|
+
class Clusterer
|
|
7
|
+
# Creates a new Ckmeans clusterer.
|
|
8
|
+
#
|
|
9
|
+
# @param entries [Array<Numeric>] The data points to cluster
|
|
10
|
+
# @param kmin [Integer] Minimum number of clusters to consider
|
|
11
|
+
# @param kmax [Integer] Maximum number of clusters to consider (defaults to kmin for fixed K)
|
|
12
|
+
# @param kestimate [Symbol] Method for estimating optimal K:
|
|
13
|
+
# - :fast - Quick heuristic using implicit Gaussian assumption (best for large datasets)
|
|
14
|
+
# - :stable - Model-based estimation using Gaussian Mixture Model (better for duplicates/edge cases)
|
|
15
|
+
# - :gmm - Alias for :stable (Gaussian Mixture Model)
|
|
16
|
+
#
|
|
17
|
+
# @example Fixed number of clusters
|
|
18
|
+
# Ckmeans::Clusterer.new([1, 2, 3, 100, 101], 2).clusters
|
|
19
|
+
# # => [[1, 2, 3], [100, 101]]
|
|
20
|
+
#
|
|
21
|
+
# @example Automatic K selection with stable estimation
|
|
22
|
+
# Ckmeans::Clusterer.new([1, 1, 1, 5, 5, 5, 10, 10, 10], 1, 5, :stable).clusters
|
|
5
23
|
def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
|
|
6
24
|
@xcount = entries.size
|
|
7
25
|
|
|
8
26
|
raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
|
|
9
27
|
raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
|
|
10
28
|
|
|
11
|
-
@kmin
|
|
12
|
-
@unique_xcount
|
|
13
|
-
@kmax
|
|
14
|
-
@xsorted_original
|
|
15
|
-
@xsorted
|
|
16
|
-
@
|
|
29
|
+
@kmin = kmin
|
|
30
|
+
@unique_xcount = entries.uniq.size
|
|
31
|
+
@kmax = [@unique_xcount, kmax].min
|
|
32
|
+
@xsorted_original = entries.sort
|
|
33
|
+
@xsorted = @xsorted_original.map(&:to_f)
|
|
34
|
+
@use_stable_estimation = %i[gmm stable].include?(kestimate)
|
|
17
35
|
end
|
|
18
36
|
|
|
19
37
|
def clusters
|
data/lib/ckmeans/version.rb
CHANGED
data/lib/ckmedian/clusterer.rb
CHANGED
|
@@ -1,18 +1,39 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Ckmedian
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
# Optimal k-median clustering for univariate (1D) data using dynamic programming.
|
|
5
|
+
# Minimizes within-cluster sum of absolute deviations (L1 norm).
|
|
6
|
+
# More robust to outliers than k-means.
|
|
7
|
+
class Clusterer
|
|
8
|
+
# Creates a new Ckmedian clusterer.
|
|
9
|
+
#
|
|
10
|
+
# @param entries [Array<Numeric>] The data points to cluster
|
|
11
|
+
# @param kmin [Integer] Minimum number of clusters to consider
|
|
12
|
+
# @param kmax [Integer] Maximum number of clusters to consider (defaults to kmin for fixed K)
|
|
13
|
+
# @param kestimate [Symbol] Method for estimating optimal K:
|
|
14
|
+
# - :fast - Quick heuristic using implicit Gaussian assumption (best for large datasets)
|
|
15
|
+
# - :stable - Model-based estimation using Laplace Mixture Model (better for outliers/bursts)
|
|
16
|
+
# - :lmm - Alias for :stable (Laplace Mixture Model)
|
|
17
|
+
#
|
|
18
|
+
# @example Fixed number of clusters
|
|
19
|
+
# Ckmedian::Clusterer.new([1, 2, 3, 100, 101], 2).clusters
|
|
20
|
+
# # => [[1, 2, 3], [100, 101]]
|
|
21
|
+
#
|
|
22
|
+
# @example Photo timeline clustering (robust to bursts and outliers)
|
|
23
|
+
# timestamps = photos.map(&:taken_at).map(&:to_i)
|
|
24
|
+
# Ckmedian::Clusterer.new(timestamps, 1, 20, :stable).clusters
|
|
25
|
+
def initialize(entries, kmin, kmax = kmin, kestimate = :fast)
|
|
6
26
|
@xcount = entries.size
|
|
7
27
|
|
|
8
28
|
raise ArgumentError, "Minimum cluster count is bigger than element count" if kmin > @xcount
|
|
9
29
|
raise ArgumentError, "Maximum cluster count is bigger than element count" if kmax > @xcount
|
|
10
30
|
|
|
11
|
-
@kmin
|
|
12
|
-
@unique_xcount
|
|
13
|
-
@kmax
|
|
14
|
-
@xsorted_original
|
|
15
|
-
@xsorted
|
|
31
|
+
@kmin = kmin
|
|
32
|
+
@unique_xcount = entries.uniq.size
|
|
33
|
+
@kmax = [@unique_xcount, kmax].min
|
|
34
|
+
@xsorted_original = entries.sort
|
|
35
|
+
@xsorted = @xsorted_original.map(&:to_f)
|
|
36
|
+
@use_stable_estimation = %i[lmm stable].include?(kestimate)
|
|
16
37
|
end
|
|
17
38
|
|
|
18
39
|
def clusters
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ckmeans
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.
|
|
4
|
+
version: 2.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vlad Lebedev
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-
|
|
11
|
+
date: 2025-12-14 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Repeatable clustering of unidimensional data
|
|
14
14
|
email:
|
|
@@ -28,6 +28,8 @@ files:
|
|
|
28
28
|
- LICENSE
|
|
29
29
|
- README.md
|
|
30
30
|
- Rakefile
|
|
31
|
+
- ext/ckmeans/algorithm.inc
|
|
32
|
+
- ext/ckmeans/dissimilarity.h
|
|
31
33
|
- ext/ckmeans/extconf.rb
|
|
32
34
|
- ext/ckmeans/extensions.c
|
|
33
35
|
- lib/ckmeans.rb
|