clusterkit 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +3236 -0
- data/README.md +227 -7
- data/docs/KNOWN_ISSUES.md +5 -5
- data/docs/RUST_ERROR_HANDLING.md +6 -6
- data/docs/assets/clusterkit-wide.png +0 -0
- data/docs/assets/clusterkit.png +0 -0
- data/docs/assets/visualization.png +0 -0
- data/ext/clusterkit/Cargo.toml +5 -4
- data/ext/clusterkit/extconf.rb +9 -1
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +27 -62
- data/ext/clusterkit/src/clustering.rs +68 -114
- data/ext/clusterkit/src/embedder.rs +48 -131
- data/ext/clusterkit/src/hnsw.rs +579 -0
- data/ext/clusterkit/src/lib.rs +7 -5
- data/ext/clusterkit/src/svd.rs +35 -58
- data/ext/clusterkit/src/utils.rs +159 -9
- data/lib/clusterkit/clustering/hdbscan.rb +4 -17
- data/lib/clusterkit/clustering.rb +4 -23
- data/lib/clusterkit/data_validator.rb +132 -0
- data/lib/clusterkit/dimensionality/pca.rb +12 -12
- data/lib/clusterkit/dimensionality/svd.rb +47 -16
- data/lib/clusterkit/dimensionality/umap.rb +7 -40
- data/lib/clusterkit/hnsw.rb +251 -0
- data/lib/clusterkit/version.rb +1 -1
- data/lib/clusterkit.rb +2 -1
- metadata +40 -20
- data/clusterkit.gemspec +0 -45
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
use magnus::{Error, RArray, RHash, Value, TryConvert, Integer,
|
|
1
|
+
use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object, Ruby};
|
|
2
2
|
use magnus::value::ReprValue;
|
|
3
3
|
use hnsw_rs::prelude::*;
|
|
4
4
|
use annembed::prelude::*;
|
|
@@ -7,6 +7,7 @@ use std::io::{Write, Read};
|
|
|
7
7
|
use std::cell::RefCell;
|
|
8
8
|
use bincode;
|
|
9
9
|
use serde::{Serialize, Deserialize};
|
|
10
|
+
use crate::utils::ruby_array_to_vec_vec_f32;
|
|
10
11
|
|
|
11
12
|
// Simple struct to serialize UMAP results
|
|
12
13
|
#[derive(Serialize, Deserialize)]
|
|
@@ -20,7 +21,8 @@ struct SavedUMAPModel {
|
|
|
20
21
|
}
|
|
21
22
|
|
|
22
23
|
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
23
|
-
let
|
|
24
|
+
let ruby = Ruby::get().unwrap();
|
|
25
|
+
let umap_class = parent.define_class("RustUMAP", ruby.class_object())?;
|
|
24
26
|
|
|
25
27
|
umap_class.define_singleton_method("new", magnus::function!(RustUMAP::new, 1))?;
|
|
26
28
|
umap_class.define_singleton_method("load_model", magnus::function!(RustUMAP::load_model, 1))?;
|
|
@@ -39,15 +41,15 @@ struct RustUMAP {
|
|
|
39
41
|
random_seed: Option<u64>,
|
|
40
42
|
nb_grad_batch: usize,
|
|
41
43
|
nb_sampling_by_edge: usize,
|
|
42
|
-
// Store the training data and embeddings for transform approximation
|
|
43
|
-
// Use RefCell for interior mutability
|
|
44
44
|
training_data: RefCell<Option<Vec<Vec<f32>>>>,
|
|
45
45
|
training_embeddings: RefCell<Option<Vec<Vec<f64>>>>,
|
|
46
46
|
}
|
|
47
47
|
|
|
48
48
|
impl RustUMAP {
|
|
49
49
|
fn new(options: RHash) -> Result<Self, Error> {
|
|
50
|
-
let
|
|
50
|
+
let ruby = Ruby::get().unwrap();
|
|
51
|
+
|
|
52
|
+
let n_components = match options.lookup::<_, Value>(ruby.to_symbol("n_components")) {
|
|
51
53
|
Ok(val) => {
|
|
52
54
|
if val.is_nil() {
|
|
53
55
|
2
|
|
@@ -60,7 +62,7 @@ impl RustUMAP {
|
|
|
60
62
|
Err(_) => 2,
|
|
61
63
|
};
|
|
62
64
|
|
|
63
|
-
let n_neighbors = match options.lookup::<_, Value>(
|
|
65
|
+
let n_neighbors = match options.lookup::<_, Value>(ruby.to_symbol("n_neighbors")) {
|
|
64
66
|
Ok(val) => {
|
|
65
67
|
if val.is_nil() {
|
|
66
68
|
15
|
|
@@ -73,7 +75,7 @@ impl RustUMAP {
|
|
|
73
75
|
Err(_) => 15,
|
|
74
76
|
};
|
|
75
77
|
|
|
76
|
-
let random_seed = match options.lookup::<_, Value>(
|
|
78
|
+
let random_seed = match options.lookup::<_, Value>(ruby.to_symbol("random_seed")) {
|
|
77
79
|
Ok(val) => {
|
|
78
80
|
if val.is_nil() {
|
|
79
81
|
None
|
|
@@ -86,10 +88,10 @@ impl RustUMAP {
|
|
|
86
88
|
Err(_) => None,
|
|
87
89
|
};
|
|
88
90
|
|
|
89
|
-
let nb_grad_batch = match options.lookup::<_, Value>(
|
|
91
|
+
let nb_grad_batch = match options.lookup::<_, Value>(ruby.to_symbol("nb_grad_batch")) {
|
|
90
92
|
Ok(val) => {
|
|
91
93
|
if val.is_nil() {
|
|
92
|
-
10
|
|
94
|
+
10
|
|
93
95
|
} else {
|
|
94
96
|
Integer::try_convert(val)
|
|
95
97
|
.map(|i| i.to_u32().unwrap_or(10) as usize)
|
|
@@ -98,11 +100,11 @@ impl RustUMAP {
|
|
|
98
100
|
}
|
|
99
101
|
Err(_) => 10,
|
|
100
102
|
};
|
|
101
|
-
|
|
102
|
-
let nb_sampling_by_edge = match options.lookup::<_, Value>(
|
|
103
|
+
|
|
104
|
+
let nb_sampling_by_edge = match options.lookup::<_, Value>(ruby.to_symbol("nb_sampling_by_edge")) {
|
|
103
105
|
Ok(val) => {
|
|
104
106
|
if val.is_nil() {
|
|
105
|
-
8
|
|
107
|
+
8
|
|
106
108
|
} else {
|
|
107
109
|
Integer::try_convert(val)
|
|
108
110
|
.map(|i| i.to_u32().unwrap_or(8) as usize)
|
|
@@ -124,61 +126,10 @@ impl RustUMAP {
|
|
|
124
126
|
}
|
|
125
127
|
|
|
126
128
|
fn fit_transform(&self, data: Value) -> Result<RArray, Error> {
|
|
127
|
-
|
|
128
|
-
let ruby_array = RArray::try_convert(data)?;
|
|
129
|
-
let mut rust_data: Vec<Vec<f64>> = Vec::new();
|
|
130
|
-
|
|
131
|
-
// Get array length
|
|
132
|
-
let array_len = ruby_array.len();
|
|
133
|
-
|
|
134
|
-
for i in 0..array_len {
|
|
135
|
-
let row = ruby_array.entry::<Value>(i as isize)?;
|
|
136
|
-
let row_array = RArray::try_convert(row).map_err(|_| {
|
|
137
|
-
Error::new(
|
|
138
|
-
magnus::exception::type_error(),
|
|
139
|
-
"Expected array of arrays (2D array)",
|
|
140
|
-
)
|
|
141
|
-
})?;
|
|
142
|
-
|
|
143
|
-
let mut rust_row: Vec<f64> = Vec::new();
|
|
144
|
-
let row_len = row_array.len();
|
|
145
|
-
|
|
146
|
-
for j in 0..row_len {
|
|
147
|
-
let val = row_array.entry::<Value>(j as isize)?;
|
|
148
|
-
let float_val = if let Ok(f) = Float::try_convert(val) {
|
|
149
|
-
f.to_f64()
|
|
150
|
-
} else if let Ok(i) = Integer::try_convert(val) {
|
|
151
|
-
i.to_i64()? as f64
|
|
152
|
-
} else {
|
|
153
|
-
return Err(Error::new(
|
|
154
|
-
magnus::exception::type_error(),
|
|
155
|
-
"All values must be numeric",
|
|
156
|
-
));
|
|
157
|
-
};
|
|
158
|
-
rust_row.push(float_val);
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
if !rust_data.is_empty() && rust_row.len() != rust_data[0].len() {
|
|
162
|
-
return Err(Error::new(
|
|
163
|
-
magnus::exception::arg_error(),
|
|
164
|
-
"All rows must have the same length",
|
|
165
|
-
));
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
rust_data.push(rust_row);
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
if rust_data.is_empty() {
|
|
172
|
-
return Err(Error::new(
|
|
173
|
-
magnus::exception::arg_error(),
|
|
174
|
-
"Input data cannot be empty",
|
|
175
|
-
));
|
|
176
|
-
}
|
|
129
|
+
let ruby = Ruby::get().unwrap();
|
|
177
130
|
|
|
178
|
-
// Convert to Vec<Vec<f32>>
|
|
179
|
-
let data_f32: Vec<Vec<f32>> = rust_data.iter()
|
|
180
|
-
.map(|row| row.iter().map(|&x| x as f32).collect())
|
|
181
|
-
.collect();
|
|
131
|
+
// Convert Ruby array to Rust Vec<Vec<f32>> using shared helper
|
|
132
|
+
let data_f32 = ruby_array_to_vec_vec_f32(data)?;
|
|
182
133
|
|
|
183
134
|
// Build HNSW graph
|
|
184
135
|
let ef_c = 50;
|
|
@@ -201,9 +152,7 @@ impl RustUMAP {
|
|
|
201
152
|
.enumerate()
|
|
202
153
|
.map(|(i, v)| (v, i))
|
|
203
154
|
.collect();
|
|
204
|
-
|
|
205
|
-
// Use serial_insert for reproducibility when seed is provided,
|
|
206
|
-
// parallel_insert for performance when no seed
|
|
155
|
+
|
|
207
156
|
if self.random_seed.is_some() {
|
|
208
157
|
hnsw.serial_insert(&data_with_id);
|
|
209
158
|
} else {
|
|
@@ -212,36 +161,34 @@ impl RustUMAP {
|
|
|
212
161
|
|
|
213
162
|
// Create KGraph from HNSW
|
|
214
163
|
let kgraph: annembed::fromhnsw::kgraph::KGraph<f32> = annembed::fromhnsw::kgraph::kgraph_from_hnsw_all(&hnsw, self.n_neighbors)
|
|
215
|
-
.map_err(|e| Error::new(
|
|
164
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
216
165
|
|
|
217
166
|
// Set up embedding parameters
|
|
218
167
|
let mut embed_params = EmbedderParams::default();
|
|
219
168
|
embed_params.asked_dim = self.n_components;
|
|
220
|
-
embed_params.nb_grad_batch = self.nb_grad_batch;
|
|
169
|
+
embed_params.nb_grad_batch = self.nb_grad_batch;
|
|
221
170
|
embed_params.scale_rho = 1.;
|
|
222
171
|
embed_params.beta = 1.;
|
|
223
172
|
embed_params.b = 1.;
|
|
224
173
|
embed_params.grad_step = 1.;
|
|
225
|
-
embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge;
|
|
226
|
-
// Enable diffusion map initialization (annembed now has fallback to random if it fails)
|
|
174
|
+
embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge;
|
|
227
175
|
embed_params.dmap_init = true;
|
|
228
|
-
embed_params.random_seed = self.random_seed;
|
|
176
|
+
embed_params.random_seed = self.random_seed;
|
|
229
177
|
|
|
230
178
|
// Create embedder and perform embedding
|
|
231
179
|
let mut embedder = Embedder::new(&kgraph, embed_params);
|
|
232
180
|
|
|
233
181
|
let embed_result = embedder.embed()
|
|
234
|
-
.map_err(|e| Error::new(
|
|
182
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(),
|
|
235
183
|
format!("Embedding failed: {}", e)))?;
|
|
236
184
|
|
|
237
185
|
if embed_result == 0 {
|
|
238
|
-
return Err(Error::new(
|
|
186
|
+
return Err(Error::new(ruby.exception_runtime_error(), "No points were embedded"));
|
|
239
187
|
}
|
|
240
188
|
|
|
241
189
|
// Get embedded data
|
|
242
190
|
let embedded_array = embedder.get_embedded_reindexed();
|
|
243
191
|
|
|
244
|
-
// Store results in a simpler format
|
|
245
192
|
let mut embeddings = Vec::new();
|
|
246
193
|
for i in 0..embedded_array.nrows() {
|
|
247
194
|
let mut row = Vec::new();
|
|
@@ -250,13 +197,15 @@ impl RustUMAP {
|
|
|
250
197
|
}
|
|
251
198
|
embeddings.push(row);
|
|
252
199
|
}
|
|
200
|
+
|
|
253
201
|
// Store the training data and embeddings for future transforms
|
|
254
202
|
*self.training_data.borrow_mut() = Some(data_f32.clone());
|
|
255
203
|
*self.training_embeddings.borrow_mut() = Some(embeddings.clone());
|
|
204
|
+
|
|
256
205
|
// Convert result back to Ruby array
|
|
257
|
-
let result = RArray::new();
|
|
206
|
+
let result = ruby.ary_new();
|
|
258
207
|
for embedding in &embeddings {
|
|
259
|
-
let row = RArray::new();
|
|
208
|
+
let row = ruby.ary_new();
|
|
260
209
|
for &val in embedding {
|
|
261
210
|
row.push(val)?;
|
|
262
211
|
}
|
|
@@ -265,16 +214,15 @@ impl RustUMAP {
|
|
|
265
214
|
Ok(result)
|
|
266
215
|
}
|
|
267
216
|
|
|
268
|
-
// Save the full model (training data + embeddings + params) for future transforms
|
|
269
217
|
fn save_model(&self, path: String) -> Result<(), Error> {
|
|
270
|
-
|
|
218
|
+
let ruby = Ruby::get().unwrap();
|
|
271
219
|
let training_data = self.training_data.borrow();
|
|
272
220
|
let training_embeddings = self.training_embeddings.borrow();
|
|
273
221
|
|
|
274
222
|
let training_data_ref = training_data.as_ref()
|
|
275
|
-
.ok_or_else(|| Error::new(
|
|
223
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model to save. Run fit_transform first."))?;
|
|
276
224
|
let training_embeddings_ref = training_embeddings.as_ref()
|
|
277
|
-
.ok_or_else(|| Error::new(
|
|
225
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings to save."))?;
|
|
278
226
|
|
|
279
227
|
let saved_model = SavedUMAPModel {
|
|
280
228
|
n_components: self.n_components,
|
|
@@ -286,28 +234,29 @@ impl RustUMAP {
|
|
|
286
234
|
};
|
|
287
235
|
|
|
288
236
|
let serialized = bincode::serialize(&saved_model)
|
|
289
|
-
.map_err(|e| Error::new(
|
|
237
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
290
238
|
|
|
291
239
|
let mut file = File::create(&path)
|
|
292
|
-
.map_err(|e| Error::new(
|
|
240
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
293
241
|
|
|
294
242
|
file.write_all(&serialized)
|
|
295
|
-
.map_err(|e| Error::new(
|
|
243
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
296
244
|
|
|
297
245
|
Ok(())
|
|
298
246
|
}
|
|
299
247
|
|
|
300
|
-
// Load a full model for transforming new data
|
|
301
248
|
fn load_model(path: String) -> Result<Self, Error> {
|
|
249
|
+
let ruby = Ruby::get().unwrap();
|
|
250
|
+
|
|
302
251
|
let mut file = File::open(&path)
|
|
303
|
-
.map_err(|e| Error::new(
|
|
252
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
304
253
|
|
|
305
254
|
let mut buffer = Vec::new();
|
|
306
255
|
file.read_to_end(&mut buffer)
|
|
307
|
-
.map_err(|e| Error::new(
|
|
256
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
308
257
|
|
|
309
258
|
let saved_model: SavedUMAPModel = bincode::deserialize(&buffer)
|
|
310
|
-
.map_err(|e| Error::new(
|
|
259
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
311
260
|
|
|
312
261
|
Ok(RustUMAP {
|
|
313
262
|
n_components: saved_model.n_components,
|
|
@@ -320,66 +269,36 @@ impl RustUMAP {
|
|
|
320
269
|
})
|
|
321
270
|
}
|
|
322
271
|
|
|
323
|
-
// Transform new data using k-NN approximation with the training data
|
|
324
272
|
fn transform(&self, data: Value) -> Result<RArray, Error> {
|
|
325
|
-
|
|
273
|
+
let ruby = Ruby::get().unwrap();
|
|
326
274
|
let training_data = self.training_data.borrow();
|
|
327
275
|
let training_embeddings = self.training_embeddings.borrow();
|
|
328
276
|
|
|
329
277
|
let training_data_ref = training_data.as_ref()
|
|
330
|
-
.ok_or_else(|| Error::new(
|
|
278
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model loaded. Load a model or run fit_transform first."))?;
|
|
331
279
|
let training_embeddings_ref = training_embeddings.as_ref()
|
|
332
|
-
.ok_or_else(|| Error::new(
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
let ruby_array = RArray::try_convert(data)?;
|
|
336
|
-
let mut new_data: Vec<Vec<f32>> = Vec::new();
|
|
337
|
-
|
|
338
|
-
for i in 0..ruby_array.len() {
|
|
339
|
-
let row = ruby_array.entry::<Value>(i as isize)?;
|
|
340
|
-
let row_array = RArray::try_convert(row)?;
|
|
341
|
-
let mut rust_row: Vec<f32> = Vec::new();
|
|
342
|
-
|
|
343
|
-
for j in 0..row_array.len() {
|
|
344
|
-
let val = row_array.entry::<Value>(j as isize)?;
|
|
345
|
-
let float_val = if let Ok(f) = Float::try_convert(val) {
|
|
346
|
-
f.to_f64() as f32
|
|
347
|
-
} else if let Ok(i) = Integer::try_convert(val) {
|
|
348
|
-
i.to_i64()? as f32
|
|
349
|
-
} else {
|
|
350
|
-
return Err(Error::new(
|
|
351
|
-
magnus::exception::type_error(),
|
|
352
|
-
"All values must be numeric",
|
|
353
|
-
));
|
|
354
|
-
};
|
|
355
|
-
rust_row.push(float_val);
|
|
356
|
-
}
|
|
357
|
-
new_data.push(rust_row);
|
|
358
|
-
}
|
|
280
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings available."))?;
|
|
281
|
+
|
|
282
|
+
let new_data = ruby_array_to_vec_vec_f32(data)?;
|
|
359
283
|
|
|
360
|
-
// For each new point, find k nearest neighbors in training data
|
|
361
|
-
// and average their embeddings (weighted by distance)
|
|
362
284
|
let k = self.n_neighbors.min(training_data_ref.len());
|
|
363
|
-
let result = RArray::new();
|
|
285
|
+
let result = ruby.ary_new();
|
|
364
286
|
|
|
365
287
|
for new_point in &new_data {
|
|
366
|
-
// Calculate distances to all training points
|
|
367
288
|
let mut distances: Vec<(f32, usize)> = Vec::new();
|
|
368
289
|
for (idx, train_point) in training_data_ref.iter().enumerate() {
|
|
369
290
|
let dist = euclidean_distance(new_point, train_point);
|
|
370
291
|
distances.push((dist, idx));
|
|
371
292
|
}
|
|
372
293
|
|
|
373
|
-
// Sort by distance and take k nearest
|
|
374
294
|
distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
|
375
295
|
let k_nearest = &distances[..k];
|
|
376
296
|
|
|
377
|
-
// Weighted average of k nearest embeddings
|
|
378
297
|
let mut avg_embedding = vec![0.0; self.n_components];
|
|
379
298
|
let mut total_weight = 0.0;
|
|
380
299
|
|
|
381
300
|
for &(dist, idx) in k_nearest {
|
|
382
|
-
let weight = 1.0 / (dist as f64 + 0.001);
|
|
301
|
+
let weight = 1.0 / (dist as f64 + 0.001);
|
|
383
302
|
total_weight += weight;
|
|
384
303
|
|
|
385
304
|
for (i, &val) in training_embeddings_ref[idx].iter().enumerate() {
|
|
@@ -387,13 +306,11 @@ impl RustUMAP {
|
|
|
387
306
|
}
|
|
388
307
|
}
|
|
389
308
|
|
|
390
|
-
// Normalize
|
|
391
309
|
for val in &mut avg_embedding {
|
|
392
310
|
*val /= total_weight;
|
|
393
311
|
}
|
|
394
312
|
|
|
395
|
-
|
|
396
|
-
let row = RArray::new();
|
|
313
|
+
let row = ruby.ary_new();
|
|
397
314
|
for val in avg_embedding {
|
|
398
315
|
row.push(val)?;
|
|
399
316
|
}
|
|
@@ -410,4 +327,4 @@ fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
|
|
|
410
327
|
.map(|(x, y)| (x - y).powi(2))
|
|
411
328
|
.sum::<f32>()
|
|
412
329
|
.sqrt()
|
|
413
|
-
}
|
|
330
|
+
}
|