clusterkit 0.1.0 → 0.1.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Float, Module, Object};
1
+ use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object, Ruby};
2
2
  use magnus::value::ReprValue;
3
3
  use hnsw_rs::prelude::*;
4
4
  use annembed::prelude::*;
@@ -7,6 +7,7 @@ use std::io::{Write, Read};
7
7
  use std::cell::RefCell;
8
8
  use bincode;
9
9
  use serde::{Serialize, Deserialize};
10
+ use crate::utils::ruby_array_to_vec_vec_f32;
10
11
 
11
12
  // Simple struct to serialize UMAP results
12
13
  #[derive(Serialize, Deserialize)]
@@ -20,7 +21,8 @@ struct SavedUMAPModel {
20
21
  }
21
22
 
22
23
  pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
23
- let umap_class = parent.define_class("RustUMAP", magnus::class::object())?;
24
+ let ruby = Ruby::get().unwrap();
25
+ let umap_class = parent.define_class("RustUMAP", ruby.class_object())?;
24
26
 
25
27
  umap_class.define_singleton_method("new", magnus::function!(RustUMAP::new, 1))?;
26
28
  umap_class.define_singleton_method("load_model", magnus::function!(RustUMAP::load_model, 1))?;
@@ -39,15 +41,15 @@ struct RustUMAP {
39
41
  random_seed: Option<u64>,
40
42
  nb_grad_batch: usize,
41
43
  nb_sampling_by_edge: usize,
42
- // Store the training data and embeddings for transform approximation
43
- // Use RefCell for interior mutability
44
44
  training_data: RefCell<Option<Vec<Vec<f32>>>>,
45
45
  training_embeddings: RefCell<Option<Vec<Vec<f64>>>>,
46
46
  }
47
47
 
48
48
  impl RustUMAP {
49
49
  fn new(options: RHash) -> Result<Self, Error> {
50
- let n_components = match options.lookup::<_, Value>(magnus::Symbol::new("n_components")) {
50
+ let ruby = Ruby::get().unwrap();
51
+
52
+ let n_components = match options.lookup::<_, Value>(ruby.to_symbol("n_components")) {
51
53
  Ok(val) => {
52
54
  if val.is_nil() {
53
55
  2
@@ -60,7 +62,7 @@ impl RustUMAP {
60
62
  Err(_) => 2,
61
63
  };
62
64
 
63
- let n_neighbors = match options.lookup::<_, Value>(magnus::Symbol::new("n_neighbors")) {
65
+ let n_neighbors = match options.lookup::<_, Value>(ruby.to_symbol("n_neighbors")) {
64
66
  Ok(val) => {
65
67
  if val.is_nil() {
66
68
  15
@@ -73,7 +75,7 @@ impl RustUMAP {
73
75
  Err(_) => 15,
74
76
  };
75
77
 
76
- let random_seed = match options.lookup::<_, Value>(magnus::Symbol::new("random_seed")) {
78
+ let random_seed = match options.lookup::<_, Value>(ruby.to_symbol("random_seed")) {
77
79
  Ok(val) => {
78
80
  if val.is_nil() {
79
81
  None
@@ -86,10 +88,10 @@ impl RustUMAP {
86
88
  Err(_) => None,
87
89
  };
88
90
 
89
- let nb_grad_batch = match options.lookup::<_, Value>(magnus::Symbol::new("nb_grad_batch")) {
91
+ let nb_grad_batch = match options.lookup::<_, Value>(ruby.to_symbol("nb_grad_batch")) {
90
92
  Ok(val) => {
91
93
  if val.is_nil() {
92
- 10 // Default value
94
+ 10
93
95
  } else {
94
96
  Integer::try_convert(val)
95
97
  .map(|i| i.to_u32().unwrap_or(10) as usize)
@@ -98,11 +100,11 @@ impl RustUMAP {
98
100
  }
99
101
  Err(_) => 10,
100
102
  };
101
-
102
- let nb_sampling_by_edge = match options.lookup::<_, Value>(magnus::Symbol::new("nb_sampling_by_edge")) {
103
+
104
+ let nb_sampling_by_edge = match options.lookup::<_, Value>(ruby.to_symbol("nb_sampling_by_edge")) {
103
105
  Ok(val) => {
104
106
  if val.is_nil() {
105
- 8 // Default value
107
+ 8
106
108
  } else {
107
109
  Integer::try_convert(val)
108
110
  .map(|i| i.to_u32().unwrap_or(8) as usize)
@@ -124,61 +126,10 @@ impl RustUMAP {
124
126
  }
125
127
 
126
128
  fn fit_transform(&self, data: Value) -> Result<RArray, Error> {
127
- // Convert Ruby array to Rust Vec<Vec<f64>>
128
- let ruby_array = RArray::try_convert(data)?;
129
- let mut rust_data: Vec<Vec<f64>> = Vec::new();
130
-
131
- // Get array length
132
- let array_len = ruby_array.len();
133
-
134
- for i in 0..array_len {
135
- let row = ruby_array.entry::<Value>(i as isize)?;
136
- let row_array = RArray::try_convert(row).map_err(|_| {
137
- Error::new(
138
- magnus::exception::type_error(),
139
- "Expected array of arrays (2D array)",
140
- )
141
- })?;
142
-
143
- let mut rust_row: Vec<f64> = Vec::new();
144
- let row_len = row_array.len();
145
-
146
- for j in 0..row_len {
147
- let val = row_array.entry::<Value>(j as isize)?;
148
- let float_val = if let Ok(f) = Float::try_convert(val) {
149
- f.to_f64()
150
- } else if let Ok(i) = Integer::try_convert(val) {
151
- i.to_i64()? as f64
152
- } else {
153
- return Err(Error::new(
154
- magnus::exception::type_error(),
155
- "All values must be numeric",
156
- ));
157
- };
158
- rust_row.push(float_val);
159
- }
160
-
161
- if !rust_data.is_empty() && rust_row.len() != rust_data[0].len() {
162
- return Err(Error::new(
163
- magnus::exception::arg_error(),
164
- "All rows must have the same length",
165
- ));
166
- }
167
-
168
- rust_data.push(rust_row);
169
- }
170
-
171
- if rust_data.is_empty() {
172
- return Err(Error::new(
173
- magnus::exception::arg_error(),
174
- "Input data cannot be empty",
175
- ));
176
- }
129
+ let ruby = Ruby::get().unwrap();
177
130
 
178
- // Convert to Vec<Vec<f32>> for HNSW
179
- let data_f32: Vec<Vec<f32>> = rust_data.iter()
180
- .map(|row| row.iter().map(|&x| x as f32).collect())
181
- .collect();
131
+ // Convert Ruby array to Rust Vec<Vec<f32>> using shared helper
132
+ let data_f32 = ruby_array_to_vec_vec_f32(data)?;
182
133
 
183
134
  // Build HNSW graph
184
135
  let ef_c = 50;
@@ -201,9 +152,7 @@ impl RustUMAP {
201
152
  .enumerate()
202
153
  .map(|(i, v)| (v, i))
203
154
  .collect();
204
-
205
- // Use serial_insert for reproducibility when seed is provided,
206
- // parallel_insert for performance when no seed
155
+
207
156
  if self.random_seed.is_some() {
208
157
  hnsw.serial_insert(&data_with_id);
209
158
  } else {
@@ -212,36 +161,34 @@ impl RustUMAP {
212
161
 
213
162
  // Create KGraph from HNSW
214
163
  let kgraph: annembed::fromhnsw::kgraph::KGraph<f32> = annembed::fromhnsw::kgraph::kgraph_from_hnsw_all(&hnsw, self.n_neighbors)
215
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
164
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
216
165
 
217
166
  // Set up embedding parameters
218
167
  let mut embed_params = EmbedderParams::default();
219
168
  embed_params.asked_dim = self.n_components;
220
- embed_params.nb_grad_batch = self.nb_grad_batch; // Configurable from Ruby
169
+ embed_params.nb_grad_batch = self.nb_grad_batch;
221
170
  embed_params.scale_rho = 1.;
222
171
  embed_params.beta = 1.;
223
172
  embed_params.b = 1.;
224
173
  embed_params.grad_step = 1.;
225
- embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge; // Configurable from Ruby
226
- // Enable diffusion map initialization (annembed now has fallback to random if it fails)
174
+ embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge;
227
175
  embed_params.dmap_init = true;
228
- embed_params.random_seed = self.random_seed; // Pass seed through to annembed
176
+ embed_params.random_seed = self.random_seed;
229
177
 
230
178
  // Create embedder and perform embedding
231
179
  let mut embedder = Embedder::new(&kgraph, embed_params);
232
180
 
233
181
  let embed_result = embedder.embed()
234
- .map_err(|e| Error::new(magnus::exception::runtime_error(),
182
+ .map_err(|e| Error::new(ruby.exception_runtime_error(),
235
183
  format!("Embedding failed: {}", e)))?;
236
184
 
237
185
  if embed_result == 0 {
238
- return Err(Error::new(magnus::exception::runtime_error(), "No points were embedded"));
186
+ return Err(Error::new(ruby.exception_runtime_error(), "No points were embedded"));
239
187
  }
240
188
 
241
189
  // Get embedded data
242
190
  let embedded_array = embedder.get_embedded_reindexed();
243
191
 
244
- // Store results in a simpler format
245
192
  let mut embeddings = Vec::new();
246
193
  for i in 0..embedded_array.nrows() {
247
194
  let mut row = Vec::new();
@@ -250,13 +197,15 @@ impl RustUMAP {
250
197
  }
251
198
  embeddings.push(row);
252
199
  }
200
+
253
201
  // Store the training data and embeddings for future transforms
254
202
  *self.training_data.borrow_mut() = Some(data_f32.clone());
255
203
  *self.training_embeddings.borrow_mut() = Some(embeddings.clone());
204
+
256
205
  // Convert result back to Ruby array
257
- let result = RArray::new();
206
+ let result = ruby.ary_new();
258
207
  for embedding in &embeddings {
259
- let row = RArray::new();
208
+ let row = ruby.ary_new();
260
209
  for &val in embedding {
261
210
  row.push(val)?;
262
211
  }
@@ -265,16 +214,15 @@ impl RustUMAP {
265
214
  Ok(result)
266
215
  }
267
216
 
268
- // Save the full model (training data + embeddings + params) for future transforms
269
217
  fn save_model(&self, path: String) -> Result<(), Error> {
270
- // Check if we have training data
218
+ let ruby = Ruby::get().unwrap();
271
219
  let training_data = self.training_data.borrow();
272
220
  let training_embeddings = self.training_embeddings.borrow();
273
221
 
274
222
  let training_data_ref = training_data.as_ref()
275
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No model to save. Run fit_transform first."))?;
223
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model to save. Run fit_transform first."))?;
276
224
  let training_embeddings_ref = training_embeddings.as_ref()
277
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No embeddings to save."))?;
225
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings to save."))?;
278
226
 
279
227
  let saved_model = SavedUMAPModel {
280
228
  n_components: self.n_components,
@@ -286,28 +234,29 @@ impl RustUMAP {
286
234
  };
287
235
 
288
236
  let serialized = bincode::serialize(&saved_model)
289
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
237
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
290
238
 
291
239
  let mut file = File::create(&path)
292
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
240
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
293
241
 
294
242
  file.write_all(&serialized)
295
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
243
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
296
244
 
297
245
  Ok(())
298
246
  }
299
247
 
300
- // Load a full model for transforming new data
301
248
  fn load_model(path: String) -> Result<Self, Error> {
249
+ let ruby = Ruby::get().unwrap();
250
+
302
251
  let mut file = File::open(&path)
303
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
252
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
304
253
 
305
254
  let mut buffer = Vec::new();
306
255
  file.read_to_end(&mut buffer)
307
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
256
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
308
257
 
309
258
  let saved_model: SavedUMAPModel = bincode::deserialize(&buffer)
310
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
259
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
311
260
 
312
261
  Ok(RustUMAP {
313
262
  n_components: saved_model.n_components,
@@ -320,66 +269,36 @@ impl RustUMAP {
320
269
  })
321
270
  }
322
271
 
323
- // Transform new data using k-NN approximation with the training data
324
272
  fn transform(&self, data: Value) -> Result<RArray, Error> {
325
- // Get training data
273
+ let ruby = Ruby::get().unwrap();
326
274
  let training_data = self.training_data.borrow();
327
275
  let training_embeddings = self.training_embeddings.borrow();
328
276
 
329
277
  let training_data_ref = training_data.as_ref()
330
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No model loaded. Load a model or run fit_transform first."))?;
278
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model loaded. Load a model or run fit_transform first."))?;
331
279
  let training_embeddings_ref = training_embeddings.as_ref()
332
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No embeddings available."))?;
333
-
334
- // Convert input data to Rust format
335
- let ruby_array = RArray::try_convert(data)?;
336
- let mut new_data: Vec<Vec<f32>> = Vec::new();
337
-
338
- for i in 0..ruby_array.len() {
339
- let row = ruby_array.entry::<Value>(i as isize)?;
340
- let row_array = RArray::try_convert(row)?;
341
- let mut rust_row: Vec<f32> = Vec::new();
342
-
343
- for j in 0..row_array.len() {
344
- let val = row_array.entry::<Value>(j as isize)?;
345
- let float_val = if let Ok(f) = Float::try_convert(val) {
346
- f.to_f64() as f32
347
- } else if let Ok(i) = Integer::try_convert(val) {
348
- i.to_i64()? as f32
349
- } else {
350
- return Err(Error::new(
351
- magnus::exception::type_error(),
352
- "All values must be numeric",
353
- ));
354
- };
355
- rust_row.push(float_val);
356
- }
357
- new_data.push(rust_row);
358
- }
280
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings available."))?;
281
+
282
+ let new_data = ruby_array_to_vec_vec_f32(data)?;
359
283
 
360
- // For each new point, find k nearest neighbors in training data
361
- // and average their embeddings (weighted by distance)
362
284
  let k = self.n_neighbors.min(training_data_ref.len());
363
- let result = RArray::new();
285
+ let result = ruby.ary_new();
364
286
 
365
287
  for new_point in &new_data {
366
- // Calculate distances to all training points
367
288
  let mut distances: Vec<(f32, usize)> = Vec::new();
368
289
  for (idx, train_point) in training_data_ref.iter().enumerate() {
369
290
  let dist = euclidean_distance(new_point, train_point);
370
291
  distances.push((dist, idx));
371
292
  }
372
293
 
373
- // Sort by distance and take k nearest
374
294
  distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
375
295
  let k_nearest = &distances[..k];
376
296
 
377
- // Weighted average of k nearest embeddings
378
297
  let mut avg_embedding = vec![0.0; self.n_components];
379
298
  let mut total_weight = 0.0;
380
299
 
381
300
  for &(dist, idx) in k_nearest {
382
- let weight = 1.0 / (dist as f64 + 0.001); // Inverse distance weighting
301
+ let weight = 1.0 / (dist as f64 + 0.001);
383
302
  total_weight += weight;
384
303
 
385
304
  for (i, &val) in training_embeddings_ref[idx].iter().enumerate() {
@@ -387,13 +306,11 @@ impl RustUMAP {
387
306
  }
388
307
  }
389
308
 
390
- // Normalize
391
309
  for val in &mut avg_embedding {
392
310
  *val /= total_weight;
393
311
  }
394
312
 
395
- // Convert to Ruby array
396
- let row = RArray::new();
313
+ let row = ruby.ary_new();
397
314
  for val in avg_embedding {
398
315
  row.push(val)?;
399
316
  }
@@ -410,4 +327,4 @@ fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
410
327
  .map(|(x, y)| (x - y).powi(2))
411
328
  .sum::<f32>()
412
329
  .sqrt()
413
- }
330
+ }