clusterkit 0.2.4 → 0.2.6.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +12 -12
- data/ext/clusterkit/Cargo.toml +2 -2
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +23 -36
- data/ext/clusterkit/src/clustering.rs +47 -53
- data/ext/clusterkit/src/embedder.rs +65 -54
- data/ext/clusterkit/src/hnsw.rs +181 -215
- data/ext/clusterkit/src/lib.rs +5 -5
- data/ext/clusterkit/src/svd.rs +31 -33
- data/ext/clusterkit/src/utils.rs +24 -21
- data/lib/clusterkit/version.rb +1 -1
- data/lib/clusterkit.rb +1 -1
- metadata +17 -4
- data/clusterkit.gemspec +0 -45
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object};
|
|
1
|
+
use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object, Ruby};
|
|
2
2
|
use magnus::value::ReprValue;
|
|
3
3
|
use hnsw_rs::prelude::*;
|
|
4
4
|
use annembed::prelude::*;
|
|
@@ -21,7 +21,8 @@ struct SavedUMAPModel {
|
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
24
|
-
let
|
|
24
|
+
let ruby = Ruby::get().unwrap();
|
|
25
|
+
let umap_class = parent.define_class("RustUMAP", ruby.class_object())?;
|
|
25
26
|
|
|
26
27
|
umap_class.define_singleton_method("new", magnus::function!(RustUMAP::new, 1))?;
|
|
27
28
|
umap_class.define_singleton_method("load_model", magnus::function!(RustUMAP::load_model, 1))?;
|
|
@@ -40,15 +41,15 @@ struct RustUMAP {
|
|
|
40
41
|
random_seed: Option<u64>,
|
|
41
42
|
nb_grad_batch: usize,
|
|
42
43
|
nb_sampling_by_edge: usize,
|
|
43
|
-
// Store the training data and embeddings for transform approximation
|
|
44
|
-
// Use RefCell for interior mutability
|
|
45
44
|
training_data: RefCell<Option<Vec<Vec<f32>>>>,
|
|
46
45
|
training_embeddings: RefCell<Option<Vec<Vec<f64>>>>,
|
|
47
46
|
}
|
|
48
47
|
|
|
49
48
|
impl RustUMAP {
|
|
50
49
|
fn new(options: RHash) -> Result<Self, Error> {
|
|
51
|
-
let
|
|
50
|
+
let ruby = Ruby::get().unwrap();
|
|
51
|
+
|
|
52
|
+
let n_components = match options.lookup::<_, Value>(ruby.to_symbol("n_components")) {
|
|
52
53
|
Ok(val) => {
|
|
53
54
|
if val.is_nil() {
|
|
54
55
|
2
|
|
@@ -61,7 +62,7 @@ impl RustUMAP {
|
|
|
61
62
|
Err(_) => 2,
|
|
62
63
|
};
|
|
63
64
|
|
|
64
|
-
let n_neighbors = match options.lookup::<_, Value>(
|
|
65
|
+
let n_neighbors = match options.lookup::<_, Value>(ruby.to_symbol("n_neighbors")) {
|
|
65
66
|
Ok(val) => {
|
|
66
67
|
if val.is_nil() {
|
|
67
68
|
15
|
|
@@ -74,7 +75,7 @@ impl RustUMAP {
|
|
|
74
75
|
Err(_) => 15,
|
|
75
76
|
};
|
|
76
77
|
|
|
77
|
-
let random_seed = match options.lookup::<_, Value>(
|
|
78
|
+
let random_seed = match options.lookup::<_, Value>(ruby.to_symbol("random_seed")) {
|
|
78
79
|
Ok(val) => {
|
|
79
80
|
if val.is_nil() {
|
|
80
81
|
None
|
|
@@ -87,10 +88,10 @@ impl RustUMAP {
|
|
|
87
88
|
Err(_) => None,
|
|
88
89
|
};
|
|
89
90
|
|
|
90
|
-
let nb_grad_batch = match options.lookup::<_, Value>(
|
|
91
|
+
let nb_grad_batch = match options.lookup::<_, Value>(ruby.to_symbol("nb_grad_batch")) {
|
|
91
92
|
Ok(val) => {
|
|
92
93
|
if val.is_nil() {
|
|
93
|
-
10
|
|
94
|
+
10
|
|
94
95
|
} else {
|
|
95
96
|
Integer::try_convert(val)
|
|
96
97
|
.map(|i| i.to_u32().unwrap_or(10) as usize)
|
|
@@ -99,11 +100,11 @@ impl RustUMAP {
|
|
|
99
100
|
}
|
|
100
101
|
Err(_) => 10,
|
|
101
102
|
};
|
|
102
|
-
|
|
103
|
-
let nb_sampling_by_edge = match options.lookup::<_, Value>(
|
|
103
|
+
|
|
104
|
+
let nb_sampling_by_edge = match options.lookup::<_, Value>(ruby.to_symbol("nb_sampling_by_edge")) {
|
|
104
105
|
Ok(val) => {
|
|
105
106
|
if val.is_nil() {
|
|
106
|
-
8
|
|
107
|
+
8
|
|
107
108
|
} else {
|
|
108
109
|
Integer::try_convert(val)
|
|
109
110
|
.map(|i| i.to_u32().unwrap_or(8) as usize)
|
|
@@ -125,6 +126,8 @@ impl RustUMAP {
|
|
|
125
126
|
}
|
|
126
127
|
|
|
127
128
|
fn fit_transform(&self, data: Value) -> Result<RArray, Error> {
|
|
129
|
+
let ruby = Ruby::get().unwrap();
|
|
130
|
+
|
|
128
131
|
// Convert Ruby array to Rust Vec<Vec<f32>> using shared helper
|
|
129
132
|
let data_f32 = ruby_array_to_vec_vec_f32(data)?;
|
|
130
133
|
|
|
@@ -149,9 +152,7 @@ impl RustUMAP {
|
|
|
149
152
|
.enumerate()
|
|
150
153
|
.map(|(i, v)| (v, i))
|
|
151
154
|
.collect();
|
|
152
|
-
|
|
153
|
-
// Use serial_insert for reproducibility when seed is provided,
|
|
154
|
-
// parallel_insert for performance when no seed
|
|
155
|
+
|
|
155
156
|
if self.random_seed.is_some() {
|
|
156
157
|
hnsw.serial_insert(&data_with_id);
|
|
157
158
|
} else {
|
|
@@ -160,36 +161,53 @@ impl RustUMAP {
|
|
|
160
161
|
|
|
161
162
|
// Create KGraph from HNSW
|
|
162
163
|
let kgraph: annembed::fromhnsw::kgraph::KGraph<f32> = annembed::fromhnsw::kgraph::kgraph_from_hnsw_all(&hnsw, self.n_neighbors)
|
|
163
|
-
.map_err(|e| Error::new(
|
|
164
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
164
165
|
|
|
165
166
|
// Set up embedding parameters
|
|
166
167
|
let mut embed_params = EmbedderParams::default();
|
|
167
168
|
embed_params.asked_dim = self.n_components;
|
|
168
|
-
embed_params.nb_grad_batch = self.nb_grad_batch;
|
|
169
|
+
embed_params.nb_grad_batch = self.nb_grad_batch;
|
|
169
170
|
embed_params.scale_rho = 1.;
|
|
170
171
|
embed_params.beta = 1.;
|
|
171
172
|
embed_params.b = 1.;
|
|
172
173
|
embed_params.grad_step = 1.;
|
|
173
|
-
embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge;
|
|
174
|
-
// Enable diffusion map initialization (annembed now has fallback to random if it fails)
|
|
174
|
+
embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge;
|
|
175
175
|
embed_params.dmap_init = true;
|
|
176
|
-
embed_params.random_seed = self.random_seed;
|
|
176
|
+
embed_params.random_seed = self.random_seed;
|
|
177
177
|
|
|
178
178
|
// Create embedder and perform embedding
|
|
179
179
|
let mut embedder = Embedder::new(&kgraph, embed_params);
|
|
180
180
|
|
|
181
|
-
let embed_result =
|
|
182
|
-
.
|
|
183
|
-
|
|
181
|
+
let embed_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
|
182
|
+
embedder.embed()
|
|
183
|
+
}));
|
|
184
|
+
|
|
185
|
+
let embed_result = match embed_result {
|
|
186
|
+
Ok(Ok(result)) => result,
|
|
187
|
+
Ok(Err(e)) => {
|
|
188
|
+
return Err(Error::new(ruby.exception_runtime_error(),
|
|
189
|
+
format!("Embedding failed: {}", e)));
|
|
190
|
+
}
|
|
191
|
+
Err(panic_info) => {
|
|
192
|
+
let msg = if let Some(s) = panic_info.downcast_ref::<String>() {
|
|
193
|
+
s.clone()
|
|
194
|
+
} else if let Some(s) = panic_info.downcast_ref::<&str>() {
|
|
195
|
+
s.to_string()
|
|
196
|
+
} else {
|
|
197
|
+
"unknown panic".to_string()
|
|
198
|
+
};
|
|
199
|
+
return Err(Error::new(ruby.exception_runtime_error(),
|
|
200
|
+
format!("Embedding panicked: {}", msg)));
|
|
201
|
+
}
|
|
202
|
+
};
|
|
184
203
|
|
|
185
204
|
if embed_result == 0 {
|
|
186
|
-
return Err(Error::new(
|
|
205
|
+
return Err(Error::new(ruby.exception_runtime_error(), "No points were embedded"));
|
|
187
206
|
}
|
|
188
207
|
|
|
189
208
|
// Get embedded data
|
|
190
209
|
let embedded_array = embedder.get_embedded_reindexed();
|
|
191
210
|
|
|
192
|
-
// Store results in a simpler format
|
|
193
211
|
let mut embeddings = Vec::new();
|
|
194
212
|
for i in 0..embedded_array.nrows() {
|
|
195
213
|
let mut row = Vec::new();
|
|
@@ -198,13 +216,15 @@ impl RustUMAP {
|
|
|
198
216
|
}
|
|
199
217
|
embeddings.push(row);
|
|
200
218
|
}
|
|
219
|
+
|
|
201
220
|
// Store the training data and embeddings for future transforms
|
|
202
221
|
*self.training_data.borrow_mut() = Some(data_f32.clone());
|
|
203
222
|
*self.training_embeddings.borrow_mut() = Some(embeddings.clone());
|
|
223
|
+
|
|
204
224
|
// Convert result back to Ruby array
|
|
205
|
-
let result =
|
|
225
|
+
let result = ruby.ary_new();
|
|
206
226
|
for embedding in &embeddings {
|
|
207
|
-
let row =
|
|
227
|
+
let row = ruby.ary_new();
|
|
208
228
|
for &val in embedding {
|
|
209
229
|
row.push(val)?;
|
|
210
230
|
}
|
|
@@ -213,16 +233,15 @@ impl RustUMAP {
|
|
|
213
233
|
Ok(result)
|
|
214
234
|
}
|
|
215
235
|
|
|
216
|
-
// Save the full model (training data + embeddings + params) for future transforms
|
|
217
236
|
fn save_model(&self, path: String) -> Result<(), Error> {
|
|
218
|
-
|
|
237
|
+
let ruby = Ruby::get().unwrap();
|
|
219
238
|
let training_data = self.training_data.borrow();
|
|
220
239
|
let training_embeddings = self.training_embeddings.borrow();
|
|
221
240
|
|
|
222
241
|
let training_data_ref = training_data.as_ref()
|
|
223
|
-
.ok_or_else(|| Error::new(
|
|
242
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model to save. Run fit_transform first."))?;
|
|
224
243
|
let training_embeddings_ref = training_embeddings.as_ref()
|
|
225
|
-
.ok_or_else(|| Error::new(
|
|
244
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings to save."))?;
|
|
226
245
|
|
|
227
246
|
let saved_model = SavedUMAPModel {
|
|
228
247
|
n_components: self.n_components,
|
|
@@ -234,28 +253,29 @@ impl RustUMAP {
|
|
|
234
253
|
};
|
|
235
254
|
|
|
236
255
|
let serialized = bincode::serialize(&saved_model)
|
|
237
|
-
.map_err(|e| Error::new(
|
|
256
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
238
257
|
|
|
239
258
|
let mut file = File::create(&path)
|
|
240
|
-
.map_err(|e| Error::new(
|
|
259
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
241
260
|
|
|
242
261
|
file.write_all(&serialized)
|
|
243
|
-
.map_err(|e| Error::new(
|
|
262
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
244
263
|
|
|
245
264
|
Ok(())
|
|
246
265
|
}
|
|
247
266
|
|
|
248
|
-
// Load a full model for transforming new data
|
|
249
267
|
fn load_model(path: String) -> Result<Self, Error> {
|
|
268
|
+
let ruby = Ruby::get().unwrap();
|
|
269
|
+
|
|
250
270
|
let mut file = File::open(&path)
|
|
251
|
-
.map_err(|e| Error::new(
|
|
271
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
252
272
|
|
|
253
273
|
let mut buffer = Vec::new();
|
|
254
274
|
file.read_to_end(&mut buffer)
|
|
255
|
-
.map_err(|e| Error::new(
|
|
275
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
256
276
|
|
|
257
277
|
let saved_model: SavedUMAPModel = bincode::deserialize(&buffer)
|
|
258
|
-
.map_err(|e| Error::new(
|
|
278
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
259
279
|
|
|
260
280
|
Ok(RustUMAP {
|
|
261
281
|
n_components: saved_model.n_components,
|
|
@@ -268,43 +288,36 @@ impl RustUMAP {
|
|
|
268
288
|
})
|
|
269
289
|
}
|
|
270
290
|
|
|
271
|
-
// Transform new data using k-NN approximation with the training data
|
|
272
291
|
fn transform(&self, data: Value) -> Result<RArray, Error> {
|
|
273
|
-
|
|
292
|
+
let ruby = Ruby::get().unwrap();
|
|
274
293
|
let training_data = self.training_data.borrow();
|
|
275
294
|
let training_embeddings = self.training_embeddings.borrow();
|
|
276
295
|
|
|
277
296
|
let training_data_ref = training_data.as_ref()
|
|
278
|
-
.ok_or_else(|| Error::new(
|
|
297
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model loaded. Load a model or run fit_transform first."))?;
|
|
279
298
|
let training_embeddings_ref = training_embeddings.as_ref()
|
|
280
|
-
.ok_or_else(|| Error::new(
|
|
299
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings available."))?;
|
|
281
300
|
|
|
282
|
-
// Convert input data to Rust format using shared helper
|
|
283
301
|
let new_data = ruby_array_to_vec_vec_f32(data)?;
|
|
284
302
|
|
|
285
|
-
// For each new point, find k nearest neighbors in training data
|
|
286
|
-
// and average their embeddings (weighted by distance)
|
|
287
303
|
let k = self.n_neighbors.min(training_data_ref.len());
|
|
288
|
-
let result =
|
|
304
|
+
let result = ruby.ary_new();
|
|
289
305
|
|
|
290
306
|
for new_point in &new_data {
|
|
291
|
-
// Calculate distances to all training points
|
|
292
307
|
let mut distances: Vec<(f32, usize)> = Vec::new();
|
|
293
308
|
for (idx, train_point) in training_data_ref.iter().enumerate() {
|
|
294
309
|
let dist = euclidean_distance(new_point, train_point);
|
|
295
310
|
distances.push((dist, idx));
|
|
296
311
|
}
|
|
297
312
|
|
|
298
|
-
// Sort by distance and take k nearest
|
|
299
313
|
distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
|
300
314
|
let k_nearest = &distances[..k];
|
|
301
315
|
|
|
302
|
-
// Weighted average of k nearest embeddings
|
|
303
316
|
let mut avg_embedding = vec![0.0; self.n_components];
|
|
304
317
|
let mut total_weight = 0.0;
|
|
305
318
|
|
|
306
319
|
for &(dist, idx) in k_nearest {
|
|
307
|
-
let weight = 1.0 / (dist as f64 + 0.001);
|
|
320
|
+
let weight = 1.0 / (dist as f64 + 0.001);
|
|
308
321
|
total_weight += weight;
|
|
309
322
|
|
|
310
323
|
for (i, &val) in training_embeddings_ref[idx].iter().enumerate() {
|
|
@@ -312,13 +325,11 @@ impl RustUMAP {
|
|
|
312
325
|
}
|
|
313
326
|
}
|
|
314
327
|
|
|
315
|
-
// Normalize
|
|
316
328
|
for val in &mut avg_embedding {
|
|
317
329
|
*val /= total_weight;
|
|
318
330
|
}
|
|
319
331
|
|
|
320
|
-
|
|
321
|
-
let row = RArray::new();
|
|
332
|
+
let row = ruby.ary_new();
|
|
322
333
|
for val in avg_embedding {
|
|
323
334
|
row.push(val)?;
|
|
324
335
|
}
|
|
@@ -335,4 +346,4 @@ fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
|
|
|
335
346
|
.map(|(x, y)| (x - y).powi(2))
|
|
336
347
|
.sum::<f32>()
|
|
337
348
|
.sqrt()
|
|
338
|
-
}
|
|
349
|
+
}
|