clusterkit 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.toml +8 -0
  7. data/Gemfile +17 -0
  8. data/IMPLEMENTATION_NOTES.md +143 -0
  9. data/LICENSE.txt +21 -0
  10. data/PYTHON_COMPARISON.md +183 -0
  11. data/README.md +499 -0
  12. data/Rakefile +245 -0
  13. data/clusterkit.gemspec +45 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/examples/hdbscan_example.rb +147 -0
  21. data/examples/optimal_kmeans_example.rb +96 -0
  22. data/examples/pca_example.rb +114 -0
  23. data/examples/reproducible_umap.rb +99 -0
  24. data/examples/verbose_control.rb +43 -0
  25. data/ext/clusterkit/Cargo.toml +25 -0
  26. data/ext/clusterkit/extconf.rb +4 -0
  27. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
  28. data/ext/clusterkit/src/clustering.rs +267 -0
  29. data/ext/clusterkit/src/embedder.rs +413 -0
  30. data/ext/clusterkit/src/lib.rs +22 -0
  31. data/ext/clusterkit/src/svd.rs +112 -0
  32. data/ext/clusterkit/src/tests.rs +16 -0
  33. data/ext/clusterkit/src/utils.rs +33 -0
  34. data/lib/clusterkit/clustering/hdbscan.rb +177 -0
  35. data/lib/clusterkit/clustering.rb +213 -0
  36. data/lib/clusterkit/clusterkit.rb +9 -0
  37. data/lib/clusterkit/configuration.rb +24 -0
  38. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  39. data/lib/clusterkit/dimensionality/svd.rb +144 -0
  40. data/lib/clusterkit/dimensionality/umap.rb +311 -0
  41. data/lib/clusterkit/dimensionality.rb +29 -0
  42. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  43. data/lib/clusterkit/preprocessing.rb +106 -0
  44. data/lib/clusterkit/silence.rb +42 -0
  45. data/lib/clusterkit/utils.rb +51 -0
  46. data/lib/clusterkit/version.rb +5 -0
  47. data/lib/clusterkit.rb +93 -0
  48. data/lib/tasks/visualize.rake +641 -0
  49. metadata +194 -0
@@ -0,0 +1,413 @@
1
+ use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Float, Module, Object};
2
+ use magnus::value::ReprValue;
3
+ use hnsw_rs::prelude::*;
4
+ use annembed::prelude::*;
5
+ use std::fs::File;
6
+ use std::io::{Write, Read};
7
+ use std::cell::RefCell;
8
+ use bincode;
9
+ use serde::{Serialize, Deserialize};
10
+
11
/// Serializable snapshot of a fitted UMAP model.
///
/// `save_model` fills this from a `RustUMAP` and writes it with bincode;
/// `load_model` reads it back. The random seed is not part of the snapshot,
/// so a loaded model always has `random_seed: None`.
///
/// NOTE(review): bincode presumably encodes fields in declaration order, so
/// reordering or adding fields likely breaks previously saved files — confirm
/// before changing this layout.
#[derive(Serialize, Deserialize)]
struct SavedUMAPModel {
    // Hyper-parameters copied verbatim from the `RustUMAP` instance.
    n_components: usize,
    n_neighbors: usize,
    nb_grad_batch: usize,
    nb_sampling_by_edge: usize,
    // Embedding computed for each training row (one inner Vec per row).
    embeddings: Vec<Vec<f64>>,
    // Training rows, stored as f32 exactly as they were fed to HNSW.
    original_data: Vec<Vec<f32>>,
}
21
+
22
+ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
23
+ let umap_class = parent.define_class("RustUMAP", magnus::class::object())?;
24
+
25
+ umap_class.define_singleton_method("new", magnus::function!(RustUMAP::new, 1))?;
26
+ umap_class.define_singleton_method("load_model", magnus::function!(RustUMAP::load_model, 1))?;
27
+ umap_class.define_method("fit_transform", magnus::method!(RustUMAP::fit_transform, 1))?;
28
+ umap_class.define_method("save_model", magnus::method!(RustUMAP::save_model, 1))?;
29
+ umap_class.define_method("transform", magnus::method!(RustUMAP::transform, 1))?;
30
+
31
+ Ok(())
32
+ }
33
+
34
/// UMAP embedder exposed to Ruby as `ClusterKit::RustUMAP`.
///
/// The hyper-parameters are fixed at construction time. The training data
/// and its embedding are filled in by `fit_transform` (or `load_model`) and
/// later read by `transform` and `save_model`.
#[magnus::wrap(class = "ClusterKit::RustUMAP")]
struct RustUMAP {
    // Dimension of the embedding space requested from the embedder.
    n_components: usize,
    // Neighbour count used for the k-graph and for the k-NN average in `transform`.
    n_neighbors: usize,
    // NOTE(review): `random_seed` is read by `fit_transform`, so this
    // `allow(dead_code)` looks unnecessary — confirm and remove.
    #[allow(dead_code)]
    random_seed: Option<u64>,
    // Forwarded to `EmbedderParams::nb_grad_batch`.
    nb_grad_batch: usize,
    // Forwarded to `EmbedderParams::nb_sampling_by_edge`.
    nb_sampling_by_edge: usize,
    // Store the training data and embeddings for transform approximation
    // Use RefCell for interior mutability (methods only receive `&self`).
    training_data: RefCell<Option<Vec<Vec<f32>>>>,
    training_embeddings: RefCell<Option<Vec<Vec<f64>>>>,
}
47
+
48
+ impl RustUMAP {
49
+ fn new(options: RHash) -> Result<Self, Error> {
50
+ let n_components = match options.lookup::<_, Value>(magnus::Symbol::new("n_components")) {
51
+ Ok(val) => {
52
+ if val.is_nil() {
53
+ 2
54
+ } else {
55
+ Integer::try_convert(val)
56
+ .map(|i| i.to_u32().unwrap_or(2) as usize)
57
+ .unwrap_or(2)
58
+ }
59
+ }
60
+ Err(_) => 2,
61
+ };
62
+
63
+ let n_neighbors = match options.lookup::<_, Value>(magnus::Symbol::new("n_neighbors")) {
64
+ Ok(val) => {
65
+ if val.is_nil() {
66
+ 15
67
+ } else {
68
+ Integer::try_convert(val)
69
+ .map(|i| i.to_u32().unwrap_or(15) as usize)
70
+ .unwrap_or(15)
71
+ }
72
+ }
73
+ Err(_) => 15,
74
+ };
75
+
76
+ let random_seed = match options.lookup::<_, Value>(magnus::Symbol::new("random_seed")) {
77
+ Ok(val) => {
78
+ if val.is_nil() {
79
+ None
80
+ } else {
81
+ Integer::try_convert(val)
82
+ .map(|i| Some(i.to_u64().unwrap_or(42)))
83
+ .unwrap_or(None)
84
+ }
85
+ }
86
+ Err(_) => None,
87
+ };
88
+
89
+ let nb_grad_batch = match options.lookup::<_, Value>(magnus::Symbol::new("nb_grad_batch")) {
90
+ Ok(val) => {
91
+ if val.is_nil() {
92
+ 10 // Default value
93
+ } else {
94
+ Integer::try_convert(val)
95
+ .map(|i| i.to_u32().unwrap_or(10) as usize)
96
+ .unwrap_or(10)
97
+ }
98
+ }
99
+ Err(_) => 10,
100
+ };
101
+
102
+ let nb_sampling_by_edge = match options.lookup::<_, Value>(magnus::Symbol::new("nb_sampling_by_edge")) {
103
+ Ok(val) => {
104
+ if val.is_nil() {
105
+ 8 // Default value
106
+ } else {
107
+ Integer::try_convert(val)
108
+ .map(|i| i.to_u32().unwrap_or(8) as usize)
109
+ .unwrap_or(8)
110
+ }
111
+ }
112
+ Err(_) => 8,
113
+ };
114
+
115
+ Ok(RustUMAP {
116
+ n_components,
117
+ n_neighbors,
118
+ random_seed,
119
+ nb_grad_batch,
120
+ nb_sampling_by_edge,
121
+ training_data: RefCell::new(None),
122
+ training_embeddings: RefCell::new(None),
123
+ })
124
+ }
125
+
126
+ fn fit_transform(&self, data: Value) -> Result<RArray, Error> {
127
+ // Convert Ruby array to Rust Vec<Vec<f64>>
128
+ let ruby_array = RArray::try_convert(data)?;
129
+ let mut rust_data: Vec<Vec<f64>> = Vec::new();
130
+
131
+ // Get array length
132
+ let array_len = ruby_array.len();
133
+
134
+ for i in 0..array_len {
135
+ let row = ruby_array.entry::<Value>(i as isize)?;
136
+ let row_array = RArray::try_convert(row).map_err(|_| {
137
+ Error::new(
138
+ magnus::exception::type_error(),
139
+ "Expected array of arrays (2D array)",
140
+ )
141
+ })?;
142
+
143
+ let mut rust_row: Vec<f64> = Vec::new();
144
+ let row_len = row_array.len();
145
+
146
+ for j in 0..row_len {
147
+ let val = row_array.entry::<Value>(j as isize)?;
148
+ let float_val = if let Ok(f) = Float::try_convert(val) {
149
+ f.to_f64()
150
+ } else if let Ok(i) = Integer::try_convert(val) {
151
+ i.to_i64()? as f64
152
+ } else {
153
+ return Err(Error::new(
154
+ magnus::exception::type_error(),
155
+ "All values must be numeric",
156
+ ));
157
+ };
158
+ rust_row.push(float_val);
159
+ }
160
+
161
+ if !rust_data.is_empty() && rust_row.len() != rust_data[0].len() {
162
+ return Err(Error::new(
163
+ magnus::exception::arg_error(),
164
+ "All rows must have the same length",
165
+ ));
166
+ }
167
+
168
+ rust_data.push(rust_row);
169
+ }
170
+
171
+ if rust_data.is_empty() {
172
+ return Err(Error::new(
173
+ magnus::exception::arg_error(),
174
+ "Input data cannot be empty",
175
+ ));
176
+ }
177
+
178
+ // Convert to Vec<Vec<f32>> for HNSW
179
+ let data_f32: Vec<Vec<f32>> = rust_data.iter()
180
+ .map(|row| row.iter().map(|&x| x as f32).collect())
181
+ .collect();
182
+
183
+ // Build HNSW graph
184
+ let ef_c = 50;
185
+ let max_nb_connection = 70;
186
+ let nb_points = data_f32.len();
187
+ let nb_layer = 16.min((nb_points as f32).ln().trunc() as usize);
188
+
189
+ // Create HNSW with or without seed
190
+ let hnsw = match self.random_seed {
191
+ Some(seed) => Hnsw::<f32, DistL2>::new_with_seed(
192
+ max_nb_connection, nb_points, nb_layer, ef_c, DistL2 {}, seed
193
+ ),
194
+ None => Hnsw::<f32, DistL2>::new(
195
+ max_nb_connection, nb_points, nb_layer, ef_c, DistL2 {}
196
+ ),
197
+ };
198
+
199
+ // Insert data into HNSW
200
+ let data_with_id: Vec<(&Vec<f32>, usize)> = data_f32.iter()
201
+ .enumerate()
202
+ .map(|(i, v)| (v, i))
203
+ .collect();
204
+
205
+ // Use serial_insert for reproducibility when seed is provided,
206
+ // parallel_insert for performance when no seed
207
+ if self.random_seed.is_some() {
208
+ hnsw.serial_insert(&data_with_id);
209
+ } else {
210
+ hnsw.parallel_insert(&data_with_id);
211
+ }
212
+
213
+ // Create KGraph from HNSW
214
+ let kgraph: annembed::fromhnsw::kgraph::KGraph<f32> = annembed::fromhnsw::kgraph::kgraph_from_hnsw_all(&hnsw, self.n_neighbors)
215
+ .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
216
+
217
+ // Set up embedding parameters
218
+ let mut embed_params = EmbedderParams::default();
219
+ embed_params.asked_dim = self.n_components;
220
+ embed_params.nb_grad_batch = self.nb_grad_batch; // Configurable from Ruby
221
+ embed_params.scale_rho = 1.;
222
+ embed_params.beta = 1.;
223
+ embed_params.b = 1.;
224
+ embed_params.grad_step = 1.;
225
+ embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge; // Configurable from Ruby
226
+ // Enable diffusion map initialization (annembed now has fallback to random if it fails)
227
+ embed_params.dmap_init = true;
228
+ embed_params.random_seed = self.random_seed; // Pass seed through to annembed
229
+
230
+ // Create embedder and perform embedding
231
+ let mut embedder = Embedder::new(&kgraph, embed_params);
232
+
233
+ let embed_result = embedder.embed()
234
+ .map_err(|e| Error::new(magnus::exception::runtime_error(),
235
+ format!("Embedding failed: {}", e)))?;
236
+
237
+ if embed_result == 0 {
238
+ return Err(Error::new(magnus::exception::runtime_error(), "No points were embedded"));
239
+ }
240
+
241
+ // Get embedded data
242
+ let embedded_array = embedder.get_embedded_reindexed();
243
+
244
+ // Store results in a simpler format
245
+ let mut embeddings = Vec::new();
246
+ for i in 0..embedded_array.nrows() {
247
+ let mut row = Vec::new();
248
+ for j in 0..embedded_array.ncols() {
249
+ row.push(embedded_array[[i, j]] as f64);
250
+ }
251
+ embeddings.push(row);
252
+ }
253
+ // Store the training data and embeddings for future transforms
254
+ *self.training_data.borrow_mut() = Some(data_f32.clone());
255
+ *self.training_embeddings.borrow_mut() = Some(embeddings.clone());
256
+ // Convert result back to Ruby array
257
+ let result = RArray::new();
258
+ for embedding in &embeddings {
259
+ let row = RArray::new();
260
+ for &val in embedding {
261
+ row.push(val)?;
262
+ }
263
+ result.push(row)?;
264
+ }
265
+ Ok(result)
266
+ }
267
+
268
+ // Save the full model (training data + embeddings + params) for future transforms
269
+ fn save_model(&self, path: String) -> Result<(), Error> {
270
+ // Check if we have training data
271
+ let training_data = self.training_data.borrow();
272
+ let training_embeddings = self.training_embeddings.borrow();
273
+
274
+ let training_data_ref = training_data.as_ref()
275
+ .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No model to save. Run fit_transform first."))?;
276
+ let training_embeddings_ref = training_embeddings.as_ref()
277
+ .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No embeddings to save."))?;
278
+
279
+ let saved_model = SavedUMAPModel {
280
+ n_components: self.n_components,
281
+ n_neighbors: self.n_neighbors,
282
+ nb_grad_batch: self.nb_grad_batch,
283
+ nb_sampling_by_edge: self.nb_sampling_by_edge,
284
+ embeddings: training_embeddings_ref.clone(),
285
+ original_data: training_data_ref.clone(),
286
+ };
287
+
288
+ let serialized = bincode::serialize(&saved_model)
289
+ .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
290
+
291
+ let mut file = File::create(&path)
292
+ .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
293
+
294
+ file.write_all(&serialized)
295
+ .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
296
+
297
+ Ok(())
298
+ }
299
+
300
+ // Load a full model for transforming new data
301
+ fn load_model(path: String) -> Result<Self, Error> {
302
+ let mut file = File::open(&path)
303
+ .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
304
+
305
+ let mut buffer = Vec::new();
306
+ file.read_to_end(&mut buffer)
307
+ .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
308
+
309
+ let saved_model: SavedUMAPModel = bincode::deserialize(&buffer)
310
+ .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
311
+
312
+ Ok(RustUMAP {
313
+ n_components: saved_model.n_components,
314
+ n_neighbors: saved_model.n_neighbors,
315
+ random_seed: None,
316
+ nb_grad_batch: saved_model.nb_grad_batch,
317
+ nb_sampling_by_edge: saved_model.nb_sampling_by_edge,
318
+ training_data: RefCell::new(Some(saved_model.original_data)),
319
+ training_embeddings: RefCell::new(Some(saved_model.embeddings)),
320
+ })
321
+ }
322
+
323
+ // Transform new data using k-NN approximation with the training data
324
+ fn transform(&self, data: Value) -> Result<RArray, Error> {
325
+ // Get training data
326
+ let training_data = self.training_data.borrow();
327
+ let training_embeddings = self.training_embeddings.borrow();
328
+
329
+ let training_data_ref = training_data.as_ref()
330
+ .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No model loaded. Load a model or run fit_transform first."))?;
331
+ let training_embeddings_ref = training_embeddings.as_ref()
332
+ .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No embeddings available."))?;
333
+
334
+ // Convert input data to Rust format
335
+ let ruby_array = RArray::try_convert(data)?;
336
+ let mut new_data: Vec<Vec<f32>> = Vec::new();
337
+
338
+ for i in 0..ruby_array.len() {
339
+ let row = ruby_array.entry::<Value>(i as isize)?;
340
+ let row_array = RArray::try_convert(row)?;
341
+ let mut rust_row: Vec<f32> = Vec::new();
342
+
343
+ for j in 0..row_array.len() {
344
+ let val = row_array.entry::<Value>(j as isize)?;
345
+ let float_val = if let Ok(f) = Float::try_convert(val) {
346
+ f.to_f64() as f32
347
+ } else if let Ok(i) = Integer::try_convert(val) {
348
+ i.to_i64()? as f32
349
+ } else {
350
+ return Err(Error::new(
351
+ magnus::exception::type_error(),
352
+ "All values must be numeric",
353
+ ));
354
+ };
355
+ rust_row.push(float_val);
356
+ }
357
+ new_data.push(rust_row);
358
+ }
359
+
360
+ // For each new point, find k nearest neighbors in training data
361
+ // and average their embeddings (weighted by distance)
362
+ let k = self.n_neighbors.min(training_data_ref.len());
363
+ let result = RArray::new();
364
+
365
+ for new_point in &new_data {
366
+ // Calculate distances to all training points
367
+ let mut distances: Vec<(f32, usize)> = Vec::new();
368
+ for (idx, train_point) in training_data_ref.iter().enumerate() {
369
+ let dist = euclidean_distance(new_point, train_point);
370
+ distances.push((dist, idx));
371
+ }
372
+
373
+ // Sort by distance and take k nearest
374
+ distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
375
+ let k_nearest = &distances[..k];
376
+
377
+ // Weighted average of k nearest embeddings
378
+ let mut avg_embedding = vec![0.0; self.n_components];
379
+ let mut total_weight = 0.0;
380
+
381
+ for &(dist, idx) in k_nearest {
382
+ let weight = 1.0 / (dist as f64 + 0.001); // Inverse distance weighting
383
+ total_weight += weight;
384
+
385
+ for (i, &val) in training_embeddings_ref[idx].iter().enumerate() {
386
+ avg_embedding[i] += val * weight;
387
+ }
388
+ }
389
+
390
+ // Normalize
391
+ for val in &mut avg_embedding {
392
+ *val /= total_weight;
393
+ }
394
+
395
+ // Convert to Ruby array
396
+ let row = RArray::new();
397
+ for val in avg_embedding {
398
+ row.push(val)?;
399
+ }
400
+ result.push(row)?;
401
+ }
402
+
403
+ Ok(result)
404
+ }
405
+ }
406
+
407
/// Euclidean (L2) distance between two `f32` slices.
///
/// Coordinates are paired positionally; if the slices differ in length the
/// surplus elements of the longer one are ignored.
fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
    let sum_of_squares: f32 = a
        .iter()
        .zip(b)
        .map(|(&lhs, &rhs)| {
            let delta = lhs - rhs;
            delta.powi(2)
        })
        .sum();

    sum_of_squares.sqrt()
}
@@ -0,0 +1,22 @@
1
+ use magnus::{define_module, Error};
2
+
3
+ mod embedder;
4
+ mod svd;
5
+ mod utils;
6
+ mod clustering;
7
+
8
+ #[cfg(test)]
9
+ mod tests;
10
+
11
+ #[magnus::init]
12
+ fn init() -> Result<(), Error> {
13
+ let module = define_module("ClusterKit")?;
14
+
15
+ // Initialize submodules
16
+ embedder::init(&module)?;
17
+ svd::init(&module)?;
18
+ utils::init(&module)?;
19
+ clustering::init(&module)?;
20
+
21
+ Ok(())
22
+ }
@@ -0,0 +1,112 @@
1
+ use magnus::{function, prelude::*, Error, Value, RArray, TryConvert};
2
+ use annembed::tools::svdapprox::{SvdApprox, RangeApproxMode, RangeRank, MatRepr};
3
+ use ndarray::Array2;
4
+
5
+ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
6
+ let svd_module = parent.define_module("SVD")?;
7
+
8
+ svd_module.define_singleton_method(
9
+ "randomized_svd_rust",
10
+ function!(randomized_svd, 3),
11
+ )?;
12
+
13
+ Ok(())
14
+ }
15
+
16
+ fn randomized_svd(matrix: Value, k: usize, n_iter: usize) -> Result<RArray, Error> {
17
+ // Convert Ruby array to ndarray
18
+ let rarray: RArray = TryConvert::try_convert(matrix)?;
19
+
20
+ // Check if it's a 2D array
21
+ let first_row: RArray = rarray.entry::<RArray>(0)?;
22
+ let n_rows = rarray.len();
23
+ let n_cols = first_row.len();
24
+
25
+ if n_rows == 0 || n_cols == 0 {
26
+ return Err(Error::new(
27
+ magnus::exception::arg_error(),
28
+ "Matrix cannot be empty",
29
+ ));
30
+ }
31
+
32
+ if k > n_rows.min(n_cols) {
33
+ return Err(Error::new(
34
+ magnus::exception::arg_error(),
35
+ format!("k ({}) cannot be larger than min(rows, cols) = {}", k, n_rows.min(n_cols)),
36
+ ));
37
+ }
38
+
39
+ // Convert to ndarray Array2
40
+ let mut matrix_data = Array2::<f64>::zeros((n_rows, n_cols));
41
+ for i in 0..n_rows {
42
+ let row: RArray = rarray.entry(i as isize)?;
43
+ for j in 0..n_cols {
44
+ let val: f64 = row.entry(j as isize)?;
45
+ matrix_data[[i, j]] = val;
46
+ }
47
+ }
48
+
49
+ // Create MatRepr for the full matrix
50
+ let mat_repr = MatRepr::from_array2(matrix_data.clone());
51
+
52
+ // Create SvdApprox instance
53
+ let mut svd_approx = SvdApprox::new(&mat_repr);
54
+
55
+ // Set up parameters for randomized SVD
56
+ // Use RANK mode to specify the desired rank
57
+ let params = RangeApproxMode::RANK(RangeRank::new(k, n_iter));
58
+
59
+ // Perform SVD
60
+ let svd_result = svd_approx.direct_svd(params)
61
+ .map_err(|e| Error::new(magnus::exception::runtime_error(), e))?;
62
+
63
+ // Extract U, S, V from the result - they are optional fields
64
+ let u_matrix = svd_result.u.ok_or_else(|| {
65
+ Error::new(magnus::exception::runtime_error(), "No U matrix in SVD result")
66
+ })?;
67
+
68
+ let s_values = svd_result.s.ok_or_else(|| {
69
+ Error::new(magnus::exception::runtime_error(), "No S values in SVD result")
70
+ })?;
71
+
72
+ let vt_matrix = svd_result.vt.ok_or_else(|| {
73
+ Error::new(magnus::exception::runtime_error(), "No V^T matrix in SVD result")
74
+ })?;
75
+
76
+ // Convert results to Ruby arrays
77
+ // U matrix - convert ndarray to Ruby nested array
78
+ let u_ruby = RArray::new();
79
+ let u_shape = u_matrix.shape();
80
+ for i in 0..u_shape[0] {
81
+ let row = RArray::new();
82
+ for j in 0..u_shape[1] {
83
+ row.push(u_matrix[[i, j]])?;
84
+ }
85
+ u_ruby.push(row)?;
86
+ }
87
+
88
+ // S values - convert to Ruby array
89
+ let s_ruby = RArray::new();
90
+ for val in s_values.iter() {
91
+ s_ruby.push(*val)?;
92
+ }
93
+
94
+ // V matrix (note: we have V^T, so we need to transpose)
95
+ let v_ruby = RArray::new();
96
+ let vt_shape = vt_matrix.shape();
97
+ for i in 0..vt_shape[0] {
98
+ let row = RArray::new();
99
+ for j in 0..vt_shape[1] {
100
+ row.push(vt_matrix[[i, j]])?;
101
+ }
102
+ v_ruby.push(row)?;
103
+ }
104
+
105
+ // Return [U, S, V^T] as a Ruby array
106
+ let result = RArray::new();
107
+ result.push(u_ruby)?;
108
+ result.push(s_ruby)?;
109
+ result.push(v_ruby)?;
110
+
111
+ Ok(result)
112
+ }
@@ -0,0 +1,16 @@
1
#[cfg(test)]
mod tests {
    /// Smoke test: passes whenever the crate builds and the harness runs.
    #[test]
    fn test_rust_compilation() {
        let sum = 1 + 1;
        assert_eq!(sum, 2);
    }

    /// Checks the shape of a small nested vector (2 rows, 2 columns).
    #[test]
    fn test_vector_conversion() {
        let rows = vec![vec![1.0, 2.0], vec![3.0, 4.0]];
        let row_count = rows.len();
        let first_row_width = rows[0].len();

        assert_eq!(row_count, 2);
        assert_eq!(first_row_width, 2);
    }
}
@@ -0,0 +1,33 @@
1
+ use magnus::{function, prelude::*, Error, Value};
2
+
3
+ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
4
+ let utils_module = parent.define_module("Utils")?;
5
+
6
+ utils_module.define_singleton_method(
7
+ "estimate_intrinsic_dimension_rust",
8
+ function!(estimate_intrinsic_dimension, 2),
9
+ )?;
10
+
11
+ utils_module.define_singleton_method(
12
+ "estimate_hubness_rust",
13
+ function!(estimate_hubness, 1),
14
+ )?;
15
+
16
+ Ok(())
17
+ }
18
+
19
+ fn estimate_intrinsic_dimension(_data: Value, _k_neighbors: usize) -> Result<f64, Error> {
20
+ // TODO: Implement using annembed
21
+ Err(Error::new(
22
+ magnus::exception::not_imp_error(),
23
+ "Dimension estimation not implemented yet",
24
+ ))
25
+ }
26
+
27
+ fn estimate_hubness(_data: Value) -> Result<Value, Error> {
28
+ // TODO: Implement using annembed
29
+ Err(Error::new(
30
+ magnus::exception::not_imp_error(),
31
+ "Hubness estimation not implemented yet",
32
+ ))
33
+ }