clusterkit 0.2.4 → 0.2.6.pre

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object};
1
+ use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object, Ruby};
2
2
  use magnus::value::ReprValue;
3
3
  use hnsw_rs::prelude::*;
4
4
  use annembed::prelude::*;
@@ -21,7 +21,8 @@ struct SavedUMAPModel {
21
21
  }
22
22
 
23
23
  pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
24
- let umap_class = parent.define_class("RustUMAP", magnus::class::object())?;
24
+ let ruby = Ruby::get().unwrap();
25
+ let umap_class = parent.define_class("RustUMAP", ruby.class_object())?;
25
26
 
26
27
  umap_class.define_singleton_method("new", magnus::function!(RustUMAP::new, 1))?;
27
28
  umap_class.define_singleton_method("load_model", magnus::function!(RustUMAP::load_model, 1))?;
@@ -40,15 +41,15 @@ struct RustUMAP {
40
41
  random_seed: Option<u64>,
41
42
  nb_grad_batch: usize,
42
43
  nb_sampling_by_edge: usize,
43
- // Store the training data and embeddings for transform approximation
44
- // Use RefCell for interior mutability
45
44
  training_data: RefCell<Option<Vec<Vec<f32>>>>,
46
45
  training_embeddings: RefCell<Option<Vec<Vec<f64>>>>,
47
46
  }
48
47
 
49
48
  impl RustUMAP {
50
49
  fn new(options: RHash) -> Result<Self, Error> {
51
- let n_components = match options.lookup::<_, Value>(magnus::Symbol::new("n_components")) {
50
+ let ruby = Ruby::get().unwrap();
51
+
52
+ let n_components = match options.lookup::<_, Value>(ruby.to_symbol("n_components")) {
52
53
  Ok(val) => {
53
54
  if val.is_nil() {
54
55
  2
@@ -61,7 +62,7 @@ impl RustUMAP {
61
62
  Err(_) => 2,
62
63
  };
63
64
 
64
- let n_neighbors = match options.lookup::<_, Value>(magnus::Symbol::new("n_neighbors")) {
65
+ let n_neighbors = match options.lookup::<_, Value>(ruby.to_symbol("n_neighbors")) {
65
66
  Ok(val) => {
66
67
  if val.is_nil() {
67
68
  15
@@ -74,7 +75,7 @@ impl RustUMAP {
74
75
  Err(_) => 15,
75
76
  };
76
77
 
77
- let random_seed = match options.lookup::<_, Value>(magnus::Symbol::new("random_seed")) {
78
+ let random_seed = match options.lookup::<_, Value>(ruby.to_symbol("random_seed")) {
78
79
  Ok(val) => {
79
80
  if val.is_nil() {
80
81
  None
@@ -87,10 +88,10 @@ impl RustUMAP {
87
88
  Err(_) => None,
88
89
  };
89
90
 
90
- let nb_grad_batch = match options.lookup::<_, Value>(magnus::Symbol::new("nb_grad_batch")) {
91
+ let nb_grad_batch = match options.lookup::<_, Value>(ruby.to_symbol("nb_grad_batch")) {
91
92
  Ok(val) => {
92
93
  if val.is_nil() {
93
- 10 // Default value
94
+ 10
94
95
  } else {
95
96
  Integer::try_convert(val)
96
97
  .map(|i| i.to_u32().unwrap_or(10) as usize)
@@ -99,11 +100,11 @@ impl RustUMAP {
99
100
  }
100
101
  Err(_) => 10,
101
102
  };
102
-
103
- let nb_sampling_by_edge = match options.lookup::<_, Value>(magnus::Symbol::new("nb_sampling_by_edge")) {
103
+
104
+ let nb_sampling_by_edge = match options.lookup::<_, Value>(ruby.to_symbol("nb_sampling_by_edge")) {
104
105
  Ok(val) => {
105
106
  if val.is_nil() {
106
- 8 // Default value
107
+ 8
107
108
  } else {
108
109
  Integer::try_convert(val)
109
110
  .map(|i| i.to_u32().unwrap_or(8) as usize)
@@ -125,6 +126,8 @@ impl RustUMAP {
125
126
  }
126
127
 
127
128
  fn fit_transform(&self, data: Value) -> Result<RArray, Error> {
129
+ let ruby = Ruby::get().unwrap();
130
+
128
131
  // Convert Ruby array to Rust Vec<Vec<f32>> using shared helper
129
132
  let data_f32 = ruby_array_to_vec_vec_f32(data)?;
130
133
 
@@ -149,9 +152,7 @@ impl RustUMAP {
149
152
  .enumerate()
150
153
  .map(|(i, v)| (v, i))
151
154
  .collect();
152
-
153
- // Use serial_insert for reproducibility when seed is provided,
154
- // parallel_insert for performance when no seed
155
+
155
156
  if self.random_seed.is_some() {
156
157
  hnsw.serial_insert(&data_with_id);
157
158
  } else {
@@ -160,36 +161,53 @@ impl RustUMAP {
160
161
 
161
162
  // Create KGraph from HNSW
162
163
  let kgraph: annembed::fromhnsw::kgraph::KGraph<f32> = annembed::fromhnsw::kgraph::kgraph_from_hnsw_all(&hnsw, self.n_neighbors)
163
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
164
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
164
165
 
165
166
  // Set up embedding parameters
166
167
  let mut embed_params = EmbedderParams::default();
167
168
  embed_params.asked_dim = self.n_components;
168
- embed_params.nb_grad_batch = self.nb_grad_batch; // Configurable from Ruby
169
+ embed_params.nb_grad_batch = self.nb_grad_batch;
169
170
  embed_params.scale_rho = 1.;
170
171
  embed_params.beta = 1.;
171
172
  embed_params.b = 1.;
172
173
  embed_params.grad_step = 1.;
173
- embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge; // Configurable from Ruby
174
- // Enable diffusion map initialization (annembed now has fallback to random if it fails)
174
+ embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge;
175
175
  embed_params.dmap_init = true;
176
- embed_params.random_seed = self.random_seed; // Pass seed through to annembed
176
+ embed_params.random_seed = self.random_seed;
177
177
 
178
178
  // Create embedder and perform embedding
179
179
  let mut embedder = Embedder::new(&kgraph, embed_params);
180
180
 
181
- let embed_result = embedder.embed()
182
- .map_err(|e| Error::new(magnus::exception::runtime_error(),
183
- format!("Embedding failed: {}", e)))?;
181
+ let embed_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
182
+ embedder.embed()
183
+ }));
184
+
185
+ let embed_result = match embed_result {
186
+ Ok(Ok(result)) => result,
187
+ Ok(Err(e)) => {
188
+ return Err(Error::new(ruby.exception_runtime_error(),
189
+ format!("Embedding failed: {}", e)));
190
+ }
191
+ Err(panic_info) => {
192
+ let msg = if let Some(s) = panic_info.downcast_ref::<String>() {
193
+ s.clone()
194
+ } else if let Some(s) = panic_info.downcast_ref::<&str>() {
195
+ s.to_string()
196
+ } else {
197
+ "unknown panic".to_string()
198
+ };
199
+ return Err(Error::new(ruby.exception_runtime_error(),
200
+ format!("Embedding panicked: {}", msg)));
201
+ }
202
+ };
184
203
 
185
204
  if embed_result == 0 {
186
- return Err(Error::new(magnus::exception::runtime_error(), "No points were embedded"));
205
+ return Err(Error::new(ruby.exception_runtime_error(), "No points were embedded"));
187
206
  }
188
207
 
189
208
  // Get embedded data
190
209
  let embedded_array = embedder.get_embedded_reindexed();
191
210
 
192
- // Store results in a simpler format
193
211
  let mut embeddings = Vec::new();
194
212
  for i in 0..embedded_array.nrows() {
195
213
  let mut row = Vec::new();
@@ -198,13 +216,15 @@ impl RustUMAP {
198
216
  }
199
217
  embeddings.push(row);
200
218
  }
219
+
201
220
  // Store the training data and embeddings for future transforms
202
221
  *self.training_data.borrow_mut() = Some(data_f32.clone());
203
222
  *self.training_embeddings.borrow_mut() = Some(embeddings.clone());
223
+
204
224
  // Convert result back to Ruby array
205
- let result = RArray::new();
225
+ let result = ruby.ary_new();
206
226
  for embedding in &embeddings {
207
- let row = RArray::new();
227
+ let row = ruby.ary_new();
208
228
  for &val in embedding {
209
229
  row.push(val)?;
210
230
  }
@@ -213,16 +233,15 @@ impl RustUMAP {
213
233
  Ok(result)
214
234
  }
215
235
 
216
- // Save the full model (training data + embeddings + params) for future transforms
217
236
  fn save_model(&self, path: String) -> Result<(), Error> {
218
- // Check if we have training data
237
+ let ruby = Ruby::get().unwrap();
219
238
  let training_data = self.training_data.borrow();
220
239
  let training_embeddings = self.training_embeddings.borrow();
221
240
 
222
241
  let training_data_ref = training_data.as_ref()
223
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No model to save. Run fit_transform first."))?;
242
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model to save. Run fit_transform first."))?;
224
243
  let training_embeddings_ref = training_embeddings.as_ref()
225
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No embeddings to save."))?;
244
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings to save."))?;
226
245
 
227
246
  let saved_model = SavedUMAPModel {
228
247
  n_components: self.n_components,
@@ -234,28 +253,29 @@ impl RustUMAP {
234
253
  };
235
254
 
236
255
  let serialized = bincode::serialize(&saved_model)
237
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
256
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
238
257
 
239
258
  let mut file = File::create(&path)
240
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
259
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
241
260
 
242
261
  file.write_all(&serialized)
243
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
262
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
244
263
 
245
264
  Ok(())
246
265
  }
247
266
 
248
- // Load a full model for transforming new data
249
267
  fn load_model(path: String) -> Result<Self, Error> {
268
+ let ruby = Ruby::get().unwrap();
269
+
250
270
  let mut file = File::open(&path)
251
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
271
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
252
272
 
253
273
  let mut buffer = Vec::new();
254
274
  file.read_to_end(&mut buffer)
255
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
275
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
256
276
 
257
277
  let saved_model: SavedUMAPModel = bincode::deserialize(&buffer)
258
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
278
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
259
279
 
260
280
  Ok(RustUMAP {
261
281
  n_components: saved_model.n_components,
@@ -268,43 +288,36 @@ impl RustUMAP {
268
288
  })
269
289
  }
270
290
 
271
- // Transform new data using k-NN approximation with the training data
272
291
  fn transform(&self, data: Value) -> Result<RArray, Error> {
273
- // Get training data
292
+ let ruby = Ruby::get().unwrap();
274
293
  let training_data = self.training_data.borrow();
275
294
  let training_embeddings = self.training_embeddings.borrow();
276
295
 
277
296
  let training_data_ref = training_data.as_ref()
278
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No model loaded. Load a model or run fit_transform first."))?;
297
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model loaded. Load a model or run fit_transform first."))?;
279
298
  let training_embeddings_ref = training_embeddings.as_ref()
280
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No embeddings available."))?;
299
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings available."))?;
281
300
 
282
- // Convert input data to Rust format using shared helper
283
301
  let new_data = ruby_array_to_vec_vec_f32(data)?;
284
302
 
285
- // For each new point, find k nearest neighbors in training data
286
- // and average their embeddings (weighted by distance)
287
303
  let k = self.n_neighbors.min(training_data_ref.len());
288
- let result = RArray::new();
304
+ let result = ruby.ary_new();
289
305
 
290
306
  for new_point in &new_data {
291
- // Calculate distances to all training points
292
307
  let mut distances: Vec<(f32, usize)> = Vec::new();
293
308
  for (idx, train_point) in training_data_ref.iter().enumerate() {
294
309
  let dist = euclidean_distance(new_point, train_point);
295
310
  distances.push((dist, idx));
296
311
  }
297
312
 
298
- // Sort by distance and take k nearest
299
313
  distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
300
314
  let k_nearest = &distances[..k];
301
315
 
302
- // Weighted average of k nearest embeddings
303
316
  let mut avg_embedding = vec![0.0; self.n_components];
304
317
  let mut total_weight = 0.0;
305
318
 
306
319
  for &(dist, idx) in k_nearest {
307
- let weight = 1.0 / (dist as f64 + 0.001); // Inverse distance weighting
320
+ let weight = 1.0 / (dist as f64 + 0.001);
308
321
  total_weight += weight;
309
322
 
310
323
  for (i, &val) in training_embeddings_ref[idx].iter().enumerate() {
@@ -312,13 +325,11 @@ impl RustUMAP {
312
325
  }
313
326
  }
314
327
 
315
- // Normalize
316
328
  for val in &mut avg_embedding {
317
329
  *val /= total_weight;
318
330
  }
319
331
 
320
- // Convert to Ruby array
321
- let row = RArray::new();
332
+ let row = ruby.ary_new();
322
333
  for val in avg_embedding {
323
334
  row.push(val)?;
324
335
  }
@@ -335,4 +346,4 @@ fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
335
346
  .map(|(x, y)| (x - y).powi(2))
336
347
  .sum::<f32>()
337
348
  .sqrt()
338
- }
349
+ }