clusterkit 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object};
1
+ use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object, Ruby};
2
2
  use magnus::value::ReprValue;
3
3
  use hnsw_rs::prelude::*;
4
4
  use annembed::prelude::*;
@@ -21,7 +21,8 @@ struct SavedUMAPModel {
21
21
  }
22
22
 
23
23
  pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
24
- let umap_class = parent.define_class("RustUMAP", magnus::class::object())?;
24
+ let ruby = Ruby::get().unwrap();
25
+ let umap_class = parent.define_class("RustUMAP", ruby.class_object())?;
25
26
 
26
27
  umap_class.define_singleton_method("new", magnus::function!(RustUMAP::new, 1))?;
27
28
  umap_class.define_singleton_method("load_model", magnus::function!(RustUMAP::load_model, 1))?;
@@ -40,15 +41,15 @@ struct RustUMAP {
40
41
  random_seed: Option<u64>,
41
42
  nb_grad_batch: usize,
42
43
  nb_sampling_by_edge: usize,
43
- // Store the training data and embeddings for transform approximation
44
- // Use RefCell for interior mutability
45
44
  training_data: RefCell<Option<Vec<Vec<f32>>>>,
46
45
  training_embeddings: RefCell<Option<Vec<Vec<f64>>>>,
47
46
  }
48
47
 
49
48
  impl RustUMAP {
50
49
  fn new(options: RHash) -> Result<Self, Error> {
51
- let n_components = match options.lookup::<_, Value>(magnus::Symbol::new("n_components")) {
50
+ let ruby = Ruby::get().unwrap();
51
+
52
+ let n_components = match options.lookup::<_, Value>(ruby.to_symbol("n_components")) {
52
53
  Ok(val) => {
53
54
  if val.is_nil() {
54
55
  2
@@ -61,7 +62,7 @@ impl RustUMAP {
61
62
  Err(_) => 2,
62
63
  };
63
64
 
64
- let n_neighbors = match options.lookup::<_, Value>(magnus::Symbol::new("n_neighbors")) {
65
+ let n_neighbors = match options.lookup::<_, Value>(ruby.to_symbol("n_neighbors")) {
65
66
  Ok(val) => {
66
67
  if val.is_nil() {
67
68
  15
@@ -74,7 +75,7 @@ impl RustUMAP {
74
75
  Err(_) => 15,
75
76
  };
76
77
 
77
- let random_seed = match options.lookup::<_, Value>(magnus::Symbol::new("random_seed")) {
78
+ let random_seed = match options.lookup::<_, Value>(ruby.to_symbol("random_seed")) {
78
79
  Ok(val) => {
79
80
  if val.is_nil() {
80
81
  None
@@ -87,10 +88,10 @@ impl RustUMAP {
87
88
  Err(_) => None,
88
89
  };
89
90
 
90
- let nb_grad_batch = match options.lookup::<_, Value>(magnus::Symbol::new("nb_grad_batch")) {
91
+ let nb_grad_batch = match options.lookup::<_, Value>(ruby.to_symbol("nb_grad_batch")) {
91
92
  Ok(val) => {
92
93
  if val.is_nil() {
93
- 10 // Default value
94
+ 10
94
95
  } else {
95
96
  Integer::try_convert(val)
96
97
  .map(|i| i.to_u32().unwrap_or(10) as usize)
@@ -99,11 +100,11 @@ impl RustUMAP {
99
100
  }
100
101
  Err(_) => 10,
101
102
  };
102
-
103
- let nb_sampling_by_edge = match options.lookup::<_, Value>(magnus::Symbol::new("nb_sampling_by_edge")) {
103
+
104
+ let nb_sampling_by_edge = match options.lookup::<_, Value>(ruby.to_symbol("nb_sampling_by_edge")) {
104
105
  Ok(val) => {
105
106
  if val.is_nil() {
106
- 8 // Default value
107
+ 8
107
108
  } else {
108
109
  Integer::try_convert(val)
109
110
  .map(|i| i.to_u32().unwrap_or(8) as usize)
@@ -125,6 +126,8 @@ impl RustUMAP {
125
126
  }
126
127
 
127
128
  fn fit_transform(&self, data: Value) -> Result<RArray, Error> {
129
+ let ruby = Ruby::get().unwrap();
130
+
128
131
  // Convert Ruby array to Rust Vec<Vec<f32>> using shared helper
129
132
  let data_f32 = ruby_array_to_vec_vec_f32(data)?;
130
133
 
@@ -149,9 +152,7 @@ impl RustUMAP {
149
152
  .enumerate()
150
153
  .map(|(i, v)| (v, i))
151
154
  .collect();
152
-
153
- // Use serial_insert for reproducibility when seed is provided,
154
- // parallel_insert for performance when no seed
155
+
155
156
  if self.random_seed.is_some() {
156
157
  hnsw.serial_insert(&data_with_id);
157
158
  } else {
@@ -160,36 +161,34 @@ impl RustUMAP {
160
161
 
161
162
  // Create KGraph from HNSW
162
163
  let kgraph: annembed::fromhnsw::kgraph::KGraph<f32> = annembed::fromhnsw::kgraph::kgraph_from_hnsw_all(&hnsw, self.n_neighbors)
163
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
164
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
164
165
 
165
166
  // Set up embedding parameters
166
167
  let mut embed_params = EmbedderParams::default();
167
168
  embed_params.asked_dim = self.n_components;
168
- embed_params.nb_grad_batch = self.nb_grad_batch; // Configurable from Ruby
169
+ embed_params.nb_grad_batch = self.nb_grad_batch;
169
170
  embed_params.scale_rho = 1.;
170
171
  embed_params.beta = 1.;
171
172
  embed_params.b = 1.;
172
173
  embed_params.grad_step = 1.;
173
- embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge; // Configurable from Ruby
174
- // Enable diffusion map initialization (annembed now has fallback to random if it fails)
174
+ embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge;
175
175
  embed_params.dmap_init = true;
176
- embed_params.random_seed = self.random_seed; // Pass seed through to annembed
176
+ embed_params.random_seed = self.random_seed;
177
177
 
178
178
  // Create embedder and perform embedding
179
179
  let mut embedder = Embedder::new(&kgraph, embed_params);
180
180
 
181
181
  let embed_result = embedder.embed()
182
- .map_err(|e| Error::new(magnus::exception::runtime_error(),
182
+ .map_err(|e| Error::new(ruby.exception_runtime_error(),
183
183
  format!("Embedding failed: {}", e)))?;
184
184
 
185
185
  if embed_result == 0 {
186
- return Err(Error::new(magnus::exception::runtime_error(), "No points were embedded"));
186
+ return Err(Error::new(ruby.exception_runtime_error(), "No points were embedded"));
187
187
  }
188
188
 
189
189
  // Get embedded data
190
190
  let embedded_array = embedder.get_embedded_reindexed();
191
191
 
192
- // Store results in a simpler format
193
192
  let mut embeddings = Vec::new();
194
193
  for i in 0..embedded_array.nrows() {
195
194
  let mut row = Vec::new();
@@ -198,13 +197,15 @@ impl RustUMAP {
198
197
  }
199
198
  embeddings.push(row);
200
199
  }
200
+
201
201
  // Store the training data and embeddings for future transforms
202
202
  *self.training_data.borrow_mut() = Some(data_f32.clone());
203
203
  *self.training_embeddings.borrow_mut() = Some(embeddings.clone());
204
+
204
205
  // Convert result back to Ruby array
205
- let result = RArray::new();
206
+ let result = ruby.ary_new();
206
207
  for embedding in &embeddings {
207
- let row = RArray::new();
208
+ let row = ruby.ary_new();
208
209
  for &val in embedding {
209
210
  row.push(val)?;
210
211
  }
@@ -213,16 +214,15 @@ impl RustUMAP {
213
214
  Ok(result)
214
215
  }
215
216
 
216
- // Save the full model (training data + embeddings + params) for future transforms
217
217
  fn save_model(&self, path: String) -> Result<(), Error> {
218
- // Check if we have training data
218
+ let ruby = Ruby::get().unwrap();
219
219
  let training_data = self.training_data.borrow();
220
220
  let training_embeddings = self.training_embeddings.borrow();
221
221
 
222
222
  let training_data_ref = training_data.as_ref()
223
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No model to save. Run fit_transform first."))?;
223
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model to save. Run fit_transform first."))?;
224
224
  let training_embeddings_ref = training_embeddings.as_ref()
225
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No embeddings to save."))?;
225
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings to save."))?;
226
226
 
227
227
  let saved_model = SavedUMAPModel {
228
228
  n_components: self.n_components,
@@ -234,28 +234,29 @@ impl RustUMAP {
234
234
  };
235
235
 
236
236
  let serialized = bincode::serialize(&saved_model)
237
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
237
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
238
238
 
239
239
  let mut file = File::create(&path)
240
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
240
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
241
241
 
242
242
  file.write_all(&serialized)
243
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
243
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
244
244
 
245
245
  Ok(())
246
246
  }
247
247
 
248
- // Load a full model for transforming new data
249
248
  fn load_model(path: String) -> Result<Self, Error> {
249
+ let ruby = Ruby::get().unwrap();
250
+
250
251
  let mut file = File::open(&path)
251
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
252
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
252
253
 
253
254
  let mut buffer = Vec::new();
254
255
  file.read_to_end(&mut buffer)
255
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
256
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
256
257
 
257
258
  let saved_model: SavedUMAPModel = bincode::deserialize(&buffer)
258
- .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
259
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
259
260
 
260
261
  Ok(RustUMAP {
261
262
  n_components: saved_model.n_components,
@@ -268,43 +269,36 @@ impl RustUMAP {
268
269
  })
269
270
  }
270
271
 
271
- // Transform new data using k-NN approximation with the training data
272
272
  fn transform(&self, data: Value) -> Result<RArray, Error> {
273
- // Get training data
273
+ let ruby = Ruby::get().unwrap();
274
274
  let training_data = self.training_data.borrow();
275
275
  let training_embeddings = self.training_embeddings.borrow();
276
276
 
277
277
  let training_data_ref = training_data.as_ref()
278
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No model loaded. Load a model or run fit_transform first."))?;
278
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model loaded. Load a model or run fit_transform first."))?;
279
279
  let training_embeddings_ref = training_embeddings.as_ref()
280
- .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No embeddings available."))?;
280
+ .ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings available."))?;
281
281
 
282
- // Convert input data to Rust format using shared helper
283
282
  let new_data = ruby_array_to_vec_vec_f32(data)?;
284
283
 
285
- // For each new point, find k nearest neighbors in training data
286
- // and average their embeddings (weighted by distance)
287
284
  let k = self.n_neighbors.min(training_data_ref.len());
288
- let result = RArray::new();
285
+ let result = ruby.ary_new();
289
286
 
290
287
  for new_point in &new_data {
291
- // Calculate distances to all training points
292
288
  let mut distances: Vec<(f32, usize)> = Vec::new();
293
289
  for (idx, train_point) in training_data_ref.iter().enumerate() {
294
290
  let dist = euclidean_distance(new_point, train_point);
295
291
  distances.push((dist, idx));
296
292
  }
297
293
 
298
- // Sort by distance and take k nearest
299
294
  distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
300
295
  let k_nearest = &distances[..k];
301
296
 
302
- // Weighted average of k nearest embeddings
303
297
  let mut avg_embedding = vec![0.0; self.n_components];
304
298
  let mut total_weight = 0.0;
305
299
 
306
300
  for &(dist, idx) in k_nearest {
307
- let weight = 1.0 / (dist as f64 + 0.001); // Inverse distance weighting
301
+ let weight = 1.0 / (dist as f64 + 0.001);
308
302
  total_weight += weight;
309
303
 
310
304
  for (i, &val) in training_embeddings_ref[idx].iter().enumerate() {
@@ -312,13 +306,11 @@ impl RustUMAP {
312
306
  }
313
307
  }
314
308
 
315
- // Normalize
316
309
  for val in &mut avg_embedding {
317
310
  *val /= total_weight;
318
311
  }
319
312
 
320
- // Convert to Ruby array
321
- let row = RArray::new();
313
+ let row = ruby.ary_new();
322
314
  for val in avg_embedding {
323
315
  row.push(val)?;
324
316
  }
@@ -335,4 +327,4 @@ fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
335
327
  .map(|(x, y)| (x - y).powi(2))
336
328
  .sum::<f32>()
337
329
  .sqrt()
338
- }
330
+ }