clusterkit 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +3236 -0
- data/README.md +227 -7
- data/docs/KNOWN_ISSUES.md +5 -5
- data/docs/RUST_ERROR_HANDLING.md +6 -6
- data/docs/assets/clusterkit-wide.png +0 -0
- data/docs/assets/clusterkit.png +0 -0
- data/docs/assets/visualization.png +0 -0
- data/ext/clusterkit/Cargo.toml +5 -4
- data/ext/clusterkit/extconf.rb +9 -1
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +27 -62
- data/ext/clusterkit/src/clustering.rs +68 -114
- data/ext/clusterkit/src/embedder.rs +48 -131
- data/ext/clusterkit/src/hnsw.rs +579 -0
- data/ext/clusterkit/src/lib.rs +7 -5
- data/ext/clusterkit/src/svd.rs +35 -58
- data/ext/clusterkit/src/utils.rs +159 -9
- data/lib/clusterkit/clustering/hdbscan.rb +4 -17
- data/lib/clusterkit/clustering.rb +4 -23
- data/lib/clusterkit/data_validator.rb +132 -0
- data/lib/clusterkit/dimensionality/pca.rb +12 -12
- data/lib/clusterkit/dimensionality/svd.rb +47 -16
- data/lib/clusterkit/dimensionality/umap.rb +7 -40
- data/lib/clusterkit/hnsw.rb +251 -0
- data/lib/clusterkit/version.rb +1 -1
- data/lib/clusterkit.rb +2 -1
- metadata +40 -20
- data/clusterkit.gemspec +0 -45
|
@@ -1,68 +1,52 @@
|
|
|
1
|
-
use magnus::{function, prelude::*, Error, Value, RArray,
|
|
1
|
+
use magnus::{function, prelude::*, Error, Value, RArray, Ruby};
|
|
2
2
|
use ndarray::{Array1, Array2, ArrayView1, Axis};
|
|
3
3
|
use rand::prelude::*;
|
|
4
|
+
use rand::rngs::StdRng;
|
|
5
|
+
use rand::SeedableRng;
|
|
6
|
+
use crate::utils::ruby_array_to_ndarray2;
|
|
4
7
|
|
|
5
8
|
mod hdbscan_wrapper;
|
|
6
9
|
|
|
7
10
|
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
8
11
|
let clustering_module = parent.define_module("Clustering")?;
|
|
9
|
-
|
|
12
|
+
|
|
10
13
|
clustering_module.define_singleton_method(
|
|
11
14
|
"kmeans_rust",
|
|
12
|
-
function!(kmeans,
|
|
15
|
+
function!(kmeans, 4),
|
|
13
16
|
)?;
|
|
14
|
-
|
|
17
|
+
|
|
15
18
|
clustering_module.define_singleton_method(
|
|
16
19
|
"kmeans_predict_rust",
|
|
17
20
|
function!(kmeans_predict, 2),
|
|
18
21
|
)?;
|
|
19
|
-
|
|
22
|
+
|
|
20
23
|
// Initialize HDBSCAN functions
|
|
21
24
|
hdbscan_wrapper::init(&clustering_module)?;
|
|
22
|
-
|
|
25
|
+
|
|
23
26
|
Ok(())
|
|
24
27
|
}
|
|
25
28
|
|
|
26
29
|
/// Perform K-means clustering
|
|
27
30
|
/// Returns (labels, centroids, inertia)
|
|
28
|
-
fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64), Error> {
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
magnus::exception::arg_error(),
|
|
36
|
-
"Data cannot be empty",
|
|
37
|
-
));
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
// Get dimensions
|
|
41
|
-
let first_row: RArray = rarray.entry::<RArray>(0)?;
|
|
42
|
-
let n_features = first_row.len();
|
|
43
|
-
|
|
31
|
+
fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> Result<(RArray, RArray, f64), Error> {
|
|
32
|
+
let ruby = Ruby::get().unwrap();
|
|
33
|
+
|
|
34
|
+
// Convert Ruby array to ndarray using shared helper
|
|
35
|
+
let data_array = ruby_array_to_ndarray2(data)?;
|
|
36
|
+
let (n_samples, n_features) = data_array.dim();
|
|
37
|
+
|
|
44
38
|
if k > n_samples {
|
|
45
39
|
return Err(Error::new(
|
|
46
|
-
|
|
40
|
+
ruby.exception_arg_error(),
|
|
47
41
|
format!("k ({}) cannot be larger than number of samples ({})", k, n_samples),
|
|
48
42
|
));
|
|
49
43
|
}
|
|
50
|
-
|
|
51
|
-
// Convert to ndarray
|
|
52
|
-
let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
|
|
53
|
-
for i in 0..n_samples {
|
|
54
|
-
let row: RArray = rarray.entry(i as isize)?;
|
|
55
|
-
for j in 0..n_features {
|
|
56
|
-
let val: f64 = row.entry(j as isize)?;
|
|
57
|
-
data_array[[i, j]] = val;
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
|
|
44
|
+
|
|
61
45
|
// Initialize centroids using K-means++
|
|
62
|
-
let mut centroids = kmeans_plusplus(&data_array, k)?;
|
|
46
|
+
let mut centroids = kmeans_plusplus(&data_array, k, random_seed)?;
|
|
63
47
|
let mut labels = vec![0usize; n_samples];
|
|
64
48
|
let mut prev_labels = vec![0usize; n_samples];
|
|
65
|
-
|
|
49
|
+
|
|
66
50
|
// K-means iterations
|
|
67
51
|
for iteration in 0..max_iter {
|
|
68
52
|
// Assign points to nearest centroid
|
|
@@ -71,7 +55,7 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
|
|
|
71
55
|
let point = data_array.row(i);
|
|
72
56
|
let mut min_dist = f64::INFINITY;
|
|
73
57
|
let mut best_cluster = 0;
|
|
74
|
-
|
|
58
|
+
|
|
75
59
|
for (j, centroid) in centroids.axis_iter(Axis(0)).enumerate() {
|
|
76
60
|
let dist = euclidean_distance(&point, &centroid);
|
|
77
61
|
if dist < min_dist {
|
|
@@ -79,38 +63,38 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
|
|
|
79
63
|
best_cluster = j;
|
|
80
64
|
}
|
|
81
65
|
}
|
|
82
|
-
|
|
66
|
+
|
|
83
67
|
if labels[i] != best_cluster {
|
|
84
68
|
changed = true;
|
|
85
69
|
}
|
|
86
70
|
labels[i] = best_cluster;
|
|
87
71
|
}
|
|
88
|
-
|
|
72
|
+
|
|
89
73
|
// Check for convergence
|
|
90
74
|
if !changed && iteration > 0 {
|
|
91
75
|
break;
|
|
92
76
|
}
|
|
93
|
-
|
|
77
|
+
|
|
94
78
|
// Update centroids
|
|
95
79
|
for j in 0..k {
|
|
96
80
|
let mut sum = Array1::<f64>::zeros(n_features);
|
|
97
81
|
let mut count = 0;
|
|
98
|
-
|
|
82
|
+
|
|
99
83
|
for i in 0..n_samples {
|
|
100
84
|
if labels[i] == j {
|
|
101
85
|
sum += &data_array.row(i);
|
|
102
86
|
count += 1;
|
|
103
87
|
}
|
|
104
88
|
}
|
|
105
|
-
|
|
89
|
+
|
|
106
90
|
if count > 0 {
|
|
107
91
|
centroids.row_mut(j).assign(&(sum / count as f64));
|
|
108
92
|
}
|
|
109
93
|
}
|
|
110
|
-
|
|
94
|
+
|
|
111
95
|
prev_labels.clone_from(&labels);
|
|
112
96
|
}
|
|
113
|
-
|
|
97
|
+
|
|
114
98
|
// Calculate inertia (sum of squared distances to nearest centroid)
|
|
115
99
|
let mut inertia = 0.0;
|
|
116
100
|
for i in 0..n_samples {
|
|
@@ -118,75 +102,43 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
|
|
|
118
102
|
let centroid = centroids.row(labels[i]);
|
|
119
103
|
inertia += euclidean_distance(&point, &centroid).powi(2);
|
|
120
104
|
}
|
|
121
|
-
|
|
105
|
+
|
|
122
106
|
// Convert results to Ruby arrays
|
|
123
|
-
let
|
|
124
|
-
let labels_array = RArray::new();
|
|
107
|
+
let labels_array = ruby.ary_new();
|
|
125
108
|
for label in labels {
|
|
126
|
-
labels_array.push(
|
|
109
|
+
labels_array.push(ruby.integer_from_i64(label as i64))?;
|
|
127
110
|
}
|
|
128
|
-
|
|
129
|
-
let centroids_array =
|
|
111
|
+
|
|
112
|
+
let centroids_array = ruby.ary_new();
|
|
130
113
|
for i in 0..k {
|
|
131
|
-
let row_array =
|
|
114
|
+
let row_array = ruby.ary_new();
|
|
132
115
|
for j in 0..n_features {
|
|
133
116
|
row_array.push(centroids[[i, j]])?;
|
|
134
117
|
}
|
|
135
118
|
centroids_array.push(row_array)?;
|
|
136
119
|
}
|
|
137
|
-
|
|
120
|
+
|
|
138
121
|
Ok((labels_array, centroids_array, inertia))
|
|
139
122
|
}
|
|
140
123
|
|
|
141
124
|
/// Predict cluster labels for new data given centroids
|
|
142
125
|
fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
let
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
return Err(Error::new(
|
|
152
|
-
magnus::exception::arg_error(),
|
|
153
|
-
"Data cannot be empty",
|
|
154
|
-
));
|
|
155
|
-
}
|
|
156
|
-
|
|
157
|
-
// Get dimensions
|
|
158
|
-
let first_row: RArray = data_array.entry::<RArray>(0)?;
|
|
159
|
-
let n_features = first_row.len();
|
|
160
|
-
|
|
161
|
-
// Convert data to ndarray
|
|
162
|
-
let mut data_matrix = Array2::<f64>::zeros((n_samples, n_features));
|
|
163
|
-
for i in 0..n_samples {
|
|
164
|
-
let row: RArray = data_array.entry(i as isize)?;
|
|
165
|
-
for j in 0..n_features {
|
|
166
|
-
let val: f64 = row.entry(j as isize)?;
|
|
167
|
-
data_matrix[[i, j]] = val;
|
|
168
|
-
}
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
// Convert centroids to ndarray
|
|
172
|
-
let mut centroids_matrix = Array2::<f64>::zeros((k, n_features));
|
|
173
|
-
for i in 0..k {
|
|
174
|
-
let row: RArray = centroids_array.entry(i as isize)?;
|
|
175
|
-
for j in 0..n_features {
|
|
176
|
-
let val: f64 = row.entry(j as isize)?;
|
|
177
|
-
centroids_matrix[[i, j]] = val;
|
|
178
|
-
}
|
|
179
|
-
}
|
|
180
|
-
|
|
126
|
+
let ruby = Ruby::get().unwrap();
|
|
127
|
+
|
|
128
|
+
// Convert inputs using shared helpers
|
|
129
|
+
let data_matrix = ruby_array_to_ndarray2(data)?;
|
|
130
|
+
let centroids_matrix = ruby_array_to_ndarray2(centroids)?;
|
|
131
|
+
|
|
132
|
+
let (n_samples, _) = data_matrix.dim();
|
|
133
|
+
|
|
181
134
|
// Predict labels
|
|
182
|
-
let
|
|
183
|
-
|
|
184
|
-
|
|
135
|
+
let labels_array = ruby.ary_new();
|
|
136
|
+
|
|
185
137
|
for i in 0..n_samples {
|
|
186
138
|
let point = data_matrix.row(i);
|
|
187
139
|
let mut min_dist = f64::INFINITY;
|
|
188
140
|
let mut best_cluster = 0;
|
|
189
|
-
|
|
141
|
+
|
|
190
142
|
for (j, centroid) in centroids_matrix.axis_iter(Axis(0)).enumerate() {
|
|
191
143
|
let dist = euclidean_distance(&point, &centroid);
|
|
192
144
|
if dist < min_dist {
|
|
@@ -194,30 +146,37 @@ fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
|
|
|
194
146
|
best_cluster = j;
|
|
195
147
|
}
|
|
196
148
|
}
|
|
197
|
-
|
|
198
|
-
labels_array.push(
|
|
149
|
+
|
|
150
|
+
labels_array.push(ruby.integer_from_i64(best_cluster as i64))?;
|
|
199
151
|
}
|
|
200
|
-
|
|
152
|
+
|
|
201
153
|
Ok(labels_array)
|
|
202
154
|
}
|
|
203
155
|
|
|
204
156
|
/// K-means++ initialization
|
|
205
|
-
fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
|
|
157
|
+
fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Result<Array2<f64>, Error> {
|
|
206
158
|
let n_samples = data.nrows();
|
|
207
159
|
let n_features = data.ncols();
|
|
208
|
-
|
|
209
|
-
|
|
160
|
+
|
|
161
|
+
// Use seeded RNG if seed is provided, otherwise use thread_rng
|
|
162
|
+
let mut rng: Box<dyn RngCore> = match random_seed {
|
|
163
|
+
Some(seed) => {
|
|
164
|
+
let seed_u64 = seed as u64;
|
|
165
|
+
Box::new(StdRng::seed_from_u64(seed_u64))
|
|
166
|
+
},
|
|
167
|
+
None => Box::new(thread_rng()),
|
|
168
|
+
};
|
|
169
|
+
|
|
210
170
|
let mut centroids = Array2::<f64>::zeros((k, n_features));
|
|
211
|
-
|
|
171
|
+
|
|
212
172
|
// Choose first centroid randomly
|
|
213
173
|
let first_idx = rng.gen_range(0..n_samples);
|
|
214
174
|
centroids.row_mut(0).assign(&data.row(first_idx));
|
|
215
|
-
|
|
175
|
+
|
|
216
176
|
// Choose remaining centroids
|
|
217
177
|
for i in 1..k {
|
|
218
178
|
let mut distances = vec![f64::INFINITY; n_samples];
|
|
219
|
-
|
|
220
|
-
// Calculate distance to nearest centroid for each point
|
|
179
|
+
|
|
221
180
|
for j in 0..n_samples {
|
|
222
181
|
for c in 0..i {
|
|
223
182
|
let dist = euclidean_distance(&data.row(j), &centroids.row(c));
|
|
@@ -226,25 +185,20 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
|
|
|
226
185
|
}
|
|
227
186
|
}
|
|
228
187
|
}
|
|
229
|
-
|
|
230
|
-
// Convert distances to probabilities
|
|
188
|
+
|
|
231
189
|
let total: f64 = distances.iter().map(|d| d * d).sum();
|
|
232
190
|
if total == 0.0 {
|
|
233
|
-
// All points are identical or we've selected duplicates
|
|
234
|
-
// Just use sequential points as centroids
|
|
235
191
|
if i < n_samples {
|
|
236
192
|
centroids.row_mut(i).assign(&data.row(i));
|
|
237
193
|
} else {
|
|
238
|
-
// Reuse first point if we run out
|
|
239
194
|
centroids.row_mut(i).assign(&data.row(0));
|
|
240
195
|
}
|
|
241
196
|
continue;
|
|
242
197
|
}
|
|
243
|
-
|
|
244
|
-
// Choose next centroid with probability proportional to squared distance
|
|
198
|
+
|
|
245
199
|
let mut cumsum = 0.0;
|
|
246
200
|
let rand_val: f64 = rng.gen::<f64>() * total;
|
|
247
|
-
|
|
201
|
+
|
|
248
202
|
for j in 0..n_samples {
|
|
249
203
|
cumsum += distances[j] * distances[j];
|
|
250
204
|
if cumsum >= rand_val {
|
|
@@ -253,7 +207,7 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
|
|
|
253
207
|
}
|
|
254
208
|
}
|
|
255
209
|
}
|
|
256
|
-
|
|
210
|
+
|
|
257
211
|
Ok(centroids)
|
|
258
212
|
}
|
|
259
213
|
|
|
@@ -264,4 +218,4 @@ fn euclidean_distance(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
|
|
|
264
218
|
.map(|(x, y)| (x - y).powi(2))
|
|
265
219
|
.sum::<f64>()
|
|
266
220
|
.sqrt()
|
|
267
|
-
}
|
|
221
|
+
}
|