clusterkit 0.2.3 → 0.2.5
This diff compares the contents of two publicly available versions of the package, each of which has been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +3236 -0
- data/ext/clusterkit/Cargo.toml +2 -1
- data/ext/clusterkit/extconf.rb +9 -1
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +23 -36
- data/ext/clusterkit/src/clustering.rs +47 -53
- data/ext/clusterkit/src/embedder.rs +44 -52
- data/ext/clusterkit/src/hnsw.rs +181 -215
- data/ext/clusterkit/src/lib.rs +5 -5
- data/ext/clusterkit/src/svd.rs +31 -33
- data/ext/clusterkit/src/utils.rs +24 -21
- data/lib/clusterkit/version.rb +1 -1
- data/lib/clusterkit.rb +1 -1
- metadata +18 -4
- data/clusterkit.gemspec +0 -45
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object};
|
|
1
|
+
use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object, Ruby};
|
|
2
2
|
use magnus::value::ReprValue;
|
|
3
3
|
use hnsw_rs::prelude::*;
|
|
4
4
|
use annembed::prelude::*;
|
|
@@ -21,7 +21,8 @@ struct SavedUMAPModel {
|
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
24
|
-
let
|
|
24
|
+
let ruby = Ruby::get().unwrap();
|
|
25
|
+
let umap_class = parent.define_class("RustUMAP", ruby.class_object())?;
|
|
25
26
|
|
|
26
27
|
umap_class.define_singleton_method("new", magnus::function!(RustUMAP::new, 1))?;
|
|
27
28
|
umap_class.define_singleton_method("load_model", magnus::function!(RustUMAP::load_model, 1))?;
|
|
@@ -40,15 +41,15 @@ struct RustUMAP {
|
|
|
40
41
|
random_seed: Option<u64>,
|
|
41
42
|
nb_grad_batch: usize,
|
|
42
43
|
nb_sampling_by_edge: usize,
|
|
43
|
-
// Store the training data and embeddings for transform approximation
|
|
44
|
-
// Use RefCell for interior mutability
|
|
45
44
|
training_data: RefCell<Option<Vec<Vec<f32>>>>,
|
|
46
45
|
training_embeddings: RefCell<Option<Vec<Vec<f64>>>>,
|
|
47
46
|
}
|
|
48
47
|
|
|
49
48
|
impl RustUMAP {
|
|
50
49
|
fn new(options: RHash) -> Result<Self, Error> {
|
|
51
|
-
let
|
|
50
|
+
let ruby = Ruby::get().unwrap();
|
|
51
|
+
|
|
52
|
+
let n_components = match options.lookup::<_, Value>(ruby.to_symbol("n_components")) {
|
|
52
53
|
Ok(val) => {
|
|
53
54
|
if val.is_nil() {
|
|
54
55
|
2
|
|
@@ -61,7 +62,7 @@ impl RustUMAP {
|
|
|
61
62
|
Err(_) => 2,
|
|
62
63
|
};
|
|
63
64
|
|
|
64
|
-
let n_neighbors = match options.lookup::<_, Value>(
|
|
65
|
+
let n_neighbors = match options.lookup::<_, Value>(ruby.to_symbol("n_neighbors")) {
|
|
65
66
|
Ok(val) => {
|
|
66
67
|
if val.is_nil() {
|
|
67
68
|
15
|
|
@@ -74,7 +75,7 @@ impl RustUMAP {
|
|
|
74
75
|
Err(_) => 15,
|
|
75
76
|
};
|
|
76
77
|
|
|
77
|
-
let random_seed = match options.lookup::<_, Value>(
|
|
78
|
+
let random_seed = match options.lookup::<_, Value>(ruby.to_symbol("random_seed")) {
|
|
78
79
|
Ok(val) => {
|
|
79
80
|
if val.is_nil() {
|
|
80
81
|
None
|
|
@@ -87,10 +88,10 @@ impl RustUMAP {
|
|
|
87
88
|
Err(_) => None,
|
|
88
89
|
};
|
|
89
90
|
|
|
90
|
-
let nb_grad_batch = match options.lookup::<_, Value>(
|
|
91
|
+
let nb_grad_batch = match options.lookup::<_, Value>(ruby.to_symbol("nb_grad_batch")) {
|
|
91
92
|
Ok(val) => {
|
|
92
93
|
if val.is_nil() {
|
|
93
|
-
10
|
|
94
|
+
10
|
|
94
95
|
} else {
|
|
95
96
|
Integer::try_convert(val)
|
|
96
97
|
.map(|i| i.to_u32().unwrap_or(10) as usize)
|
|
@@ -99,11 +100,11 @@ impl RustUMAP {
|
|
|
99
100
|
}
|
|
100
101
|
Err(_) => 10,
|
|
101
102
|
};
|
|
102
|
-
|
|
103
|
-
let nb_sampling_by_edge = match options.lookup::<_, Value>(
|
|
103
|
+
|
|
104
|
+
let nb_sampling_by_edge = match options.lookup::<_, Value>(ruby.to_symbol("nb_sampling_by_edge")) {
|
|
104
105
|
Ok(val) => {
|
|
105
106
|
if val.is_nil() {
|
|
106
|
-
8
|
|
107
|
+
8
|
|
107
108
|
} else {
|
|
108
109
|
Integer::try_convert(val)
|
|
109
110
|
.map(|i| i.to_u32().unwrap_or(8) as usize)
|
|
@@ -125,6 +126,8 @@ impl RustUMAP {
|
|
|
125
126
|
}
|
|
126
127
|
|
|
127
128
|
fn fit_transform(&self, data: Value) -> Result<RArray, Error> {
|
|
129
|
+
let ruby = Ruby::get().unwrap();
|
|
130
|
+
|
|
128
131
|
// Convert Ruby array to Rust Vec<Vec<f32>> using shared helper
|
|
129
132
|
let data_f32 = ruby_array_to_vec_vec_f32(data)?;
|
|
130
133
|
|
|
@@ -149,9 +152,7 @@ impl RustUMAP {
|
|
|
149
152
|
.enumerate()
|
|
150
153
|
.map(|(i, v)| (v, i))
|
|
151
154
|
.collect();
|
|
152
|
-
|
|
153
|
-
// Use serial_insert for reproducibility when seed is provided,
|
|
154
|
-
// parallel_insert for performance when no seed
|
|
155
|
+
|
|
155
156
|
if self.random_seed.is_some() {
|
|
156
157
|
hnsw.serial_insert(&data_with_id);
|
|
157
158
|
} else {
|
|
@@ -160,36 +161,34 @@ impl RustUMAP {
|
|
|
160
161
|
|
|
161
162
|
// Create KGraph from HNSW
|
|
162
163
|
let kgraph: annembed::fromhnsw::kgraph::KGraph<f32> = annembed::fromhnsw::kgraph::kgraph_from_hnsw_all(&hnsw, self.n_neighbors)
|
|
163
|
-
.map_err(|e| Error::new(
|
|
164
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
164
165
|
|
|
165
166
|
// Set up embedding parameters
|
|
166
167
|
let mut embed_params = EmbedderParams::default();
|
|
167
168
|
embed_params.asked_dim = self.n_components;
|
|
168
|
-
embed_params.nb_grad_batch = self.nb_grad_batch;
|
|
169
|
+
embed_params.nb_grad_batch = self.nb_grad_batch;
|
|
169
170
|
embed_params.scale_rho = 1.;
|
|
170
171
|
embed_params.beta = 1.;
|
|
171
172
|
embed_params.b = 1.;
|
|
172
173
|
embed_params.grad_step = 1.;
|
|
173
|
-
embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge;
|
|
174
|
-
// Enable diffusion map initialization (annembed now has fallback to random if it fails)
|
|
174
|
+
embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge;
|
|
175
175
|
embed_params.dmap_init = true;
|
|
176
|
-
embed_params.random_seed = self.random_seed;
|
|
176
|
+
embed_params.random_seed = self.random_seed;
|
|
177
177
|
|
|
178
178
|
// Create embedder and perform embedding
|
|
179
179
|
let mut embedder = Embedder::new(&kgraph, embed_params);
|
|
180
180
|
|
|
181
181
|
let embed_result = embedder.embed()
|
|
182
|
-
.map_err(|e| Error::new(
|
|
182
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(),
|
|
183
183
|
format!("Embedding failed: {}", e)))?;
|
|
184
184
|
|
|
185
185
|
if embed_result == 0 {
|
|
186
|
-
return Err(Error::new(
|
|
186
|
+
return Err(Error::new(ruby.exception_runtime_error(), "No points were embedded"));
|
|
187
187
|
}
|
|
188
188
|
|
|
189
189
|
// Get embedded data
|
|
190
190
|
let embedded_array = embedder.get_embedded_reindexed();
|
|
191
191
|
|
|
192
|
-
// Store results in a simpler format
|
|
193
192
|
let mut embeddings = Vec::new();
|
|
194
193
|
for i in 0..embedded_array.nrows() {
|
|
195
194
|
let mut row = Vec::new();
|
|
@@ -198,13 +197,15 @@ impl RustUMAP {
|
|
|
198
197
|
}
|
|
199
198
|
embeddings.push(row);
|
|
200
199
|
}
|
|
200
|
+
|
|
201
201
|
// Store the training data and embeddings for future transforms
|
|
202
202
|
*self.training_data.borrow_mut() = Some(data_f32.clone());
|
|
203
203
|
*self.training_embeddings.borrow_mut() = Some(embeddings.clone());
|
|
204
|
+
|
|
204
205
|
// Convert result back to Ruby array
|
|
205
|
-
let result =
|
|
206
|
+
let result = ruby.ary_new();
|
|
206
207
|
for embedding in &embeddings {
|
|
207
|
-
let row =
|
|
208
|
+
let row = ruby.ary_new();
|
|
208
209
|
for &val in embedding {
|
|
209
210
|
row.push(val)?;
|
|
210
211
|
}
|
|
@@ -213,16 +214,15 @@ impl RustUMAP {
|
|
|
213
214
|
Ok(result)
|
|
214
215
|
}
|
|
215
216
|
|
|
216
|
-
// Save the full model (training data + embeddings + params) for future transforms
|
|
217
217
|
fn save_model(&self, path: String) -> Result<(), Error> {
|
|
218
|
-
|
|
218
|
+
let ruby = Ruby::get().unwrap();
|
|
219
219
|
let training_data = self.training_data.borrow();
|
|
220
220
|
let training_embeddings = self.training_embeddings.borrow();
|
|
221
221
|
|
|
222
222
|
let training_data_ref = training_data.as_ref()
|
|
223
|
-
.ok_or_else(|| Error::new(
|
|
223
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model to save. Run fit_transform first."))?;
|
|
224
224
|
let training_embeddings_ref = training_embeddings.as_ref()
|
|
225
|
-
.ok_or_else(|| Error::new(
|
|
225
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings to save."))?;
|
|
226
226
|
|
|
227
227
|
let saved_model = SavedUMAPModel {
|
|
228
228
|
n_components: self.n_components,
|
|
@@ -234,28 +234,29 @@ impl RustUMAP {
|
|
|
234
234
|
};
|
|
235
235
|
|
|
236
236
|
let serialized = bincode::serialize(&saved_model)
|
|
237
|
-
.map_err(|e| Error::new(
|
|
237
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
238
238
|
|
|
239
239
|
let mut file = File::create(&path)
|
|
240
|
-
.map_err(|e| Error::new(
|
|
240
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
241
241
|
|
|
242
242
|
file.write_all(&serialized)
|
|
243
|
-
.map_err(|e| Error::new(
|
|
243
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
244
244
|
|
|
245
245
|
Ok(())
|
|
246
246
|
}
|
|
247
247
|
|
|
248
|
-
// Load a full model for transforming new data
|
|
249
248
|
fn load_model(path: String) -> Result<Self, Error> {
|
|
249
|
+
let ruby = Ruby::get().unwrap();
|
|
250
|
+
|
|
250
251
|
let mut file = File::open(&path)
|
|
251
|
-
.map_err(|e| Error::new(
|
|
252
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
252
253
|
|
|
253
254
|
let mut buffer = Vec::new();
|
|
254
255
|
file.read_to_end(&mut buffer)
|
|
255
|
-
.map_err(|e| Error::new(
|
|
256
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
256
257
|
|
|
257
258
|
let saved_model: SavedUMAPModel = bincode::deserialize(&buffer)
|
|
258
|
-
.map_err(|e| Error::new(
|
|
259
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
259
260
|
|
|
260
261
|
Ok(RustUMAP {
|
|
261
262
|
n_components: saved_model.n_components,
|
|
@@ -268,43 +269,36 @@ impl RustUMAP {
|
|
|
268
269
|
})
|
|
269
270
|
}
|
|
270
271
|
|
|
271
|
-
// Transform new data using k-NN approximation with the training data
|
|
272
272
|
fn transform(&self, data: Value) -> Result<RArray, Error> {
|
|
273
|
-
|
|
273
|
+
let ruby = Ruby::get().unwrap();
|
|
274
274
|
let training_data = self.training_data.borrow();
|
|
275
275
|
let training_embeddings = self.training_embeddings.borrow();
|
|
276
276
|
|
|
277
277
|
let training_data_ref = training_data.as_ref()
|
|
278
|
-
.ok_or_else(|| Error::new(
|
|
278
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model loaded. Load a model or run fit_transform first."))?;
|
|
279
279
|
let training_embeddings_ref = training_embeddings.as_ref()
|
|
280
|
-
.ok_or_else(|| Error::new(
|
|
280
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings available."))?;
|
|
281
281
|
|
|
282
|
-
// Convert input data to Rust format using shared helper
|
|
283
282
|
let new_data = ruby_array_to_vec_vec_f32(data)?;
|
|
284
283
|
|
|
285
|
-
// For each new point, find k nearest neighbors in training data
|
|
286
|
-
// and average their embeddings (weighted by distance)
|
|
287
284
|
let k = self.n_neighbors.min(training_data_ref.len());
|
|
288
|
-
let result =
|
|
285
|
+
let result = ruby.ary_new();
|
|
289
286
|
|
|
290
287
|
for new_point in &new_data {
|
|
291
|
-
// Calculate distances to all training points
|
|
292
288
|
let mut distances: Vec<(f32, usize)> = Vec::new();
|
|
293
289
|
for (idx, train_point) in training_data_ref.iter().enumerate() {
|
|
294
290
|
let dist = euclidean_distance(new_point, train_point);
|
|
295
291
|
distances.push((dist, idx));
|
|
296
292
|
}
|
|
297
293
|
|
|
298
|
-
// Sort by distance and take k nearest
|
|
299
294
|
distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
|
300
295
|
let k_nearest = &distances[..k];
|
|
301
296
|
|
|
302
|
-
// Weighted average of k nearest embeddings
|
|
303
297
|
let mut avg_embedding = vec![0.0; self.n_components];
|
|
304
298
|
let mut total_weight = 0.0;
|
|
305
299
|
|
|
306
300
|
for &(dist, idx) in k_nearest {
|
|
307
|
-
let weight = 1.0 / (dist as f64 + 0.001);
|
|
301
|
+
let weight = 1.0 / (dist as f64 + 0.001);
|
|
308
302
|
total_weight += weight;
|
|
309
303
|
|
|
310
304
|
for (i, &val) in training_embeddings_ref[idx].iter().enumerate() {
|
|
@@ -312,13 +306,11 @@ impl RustUMAP {
|
|
|
312
306
|
}
|
|
313
307
|
}
|
|
314
308
|
|
|
315
|
-
// Normalize
|
|
316
309
|
for val in &mut avg_embedding {
|
|
317
310
|
*val /= total_weight;
|
|
318
311
|
}
|
|
319
312
|
|
|
320
|
-
|
|
321
|
-
let row = RArray::new();
|
|
313
|
+
let row = ruby.ary_new();
|
|
322
314
|
for val in avg_embedding {
|
|
323
315
|
row.push(val)?;
|
|
324
316
|
}
|
|
@@ -335,4 +327,4 @@ fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
|
|
|
335
327
|
.map(|(x, y)| (x - y).powi(2))
|
|
336
328
|
.sum::<f32>()
|
|
337
329
|
.sqrt()
|
|
338
|
-
}
|
|
330
|
+
}
|