RubyGems - clusterkit - Versions diffs - 0.1.0 → 0.1.1 - Mend

clusterkit 0.1.0 → 0.1.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (28)

checksums.yaml +4 -4
data/Cargo.lock +3236 -0
data/README.md +227 -7
data/docs/KNOWN_ISSUES.md +5 -5
data/docs/RUST_ERROR_HANDLING.md +6 -6
data/docs/assets/clusterkit-wide.png +0 -0
data/docs/assets/clusterkit.png +0 -0
data/docs/assets/visualization.png +0 -0
data/ext/clusterkit/Cargo.toml +5 -4
data/ext/clusterkit/extconf.rb +9 -1
data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +27 -62
data/ext/clusterkit/src/clustering.rs +68 -114
data/ext/clusterkit/src/embedder.rs +48 -131
data/ext/clusterkit/src/hnsw.rs +579 -0
data/ext/clusterkit/src/lib.rs +7 -5
data/ext/clusterkit/src/svd.rs +35 -58
data/ext/clusterkit/src/utils.rs +159 -9
data/lib/clusterkit/clustering/hdbscan.rb +4 -17
data/lib/clusterkit/clustering.rb +4 -23
data/lib/clusterkit/data_validator.rb +132 -0
data/lib/clusterkit/dimensionality/pca.rb +12 -12
data/lib/clusterkit/dimensionality/svd.rb +47 -16
data/lib/clusterkit/dimensionality/umap.rb +7 -40
data/lib/clusterkit/hnsw.rb +251 -0
data/lib/clusterkit/version.rb +1 -1
data/lib/clusterkit.rb +2 -1
metadata +40 -20
data/clusterkit.gemspec +0 -45

data/ext/clusterkit/src/clustering.rs CHANGED Viewed

@@ -1,68 +1,52 @@
-use magnus::{function, prelude::*, Error, Value, RArray, Integer, TryConvert};
+use magnus::{function, prelude::*, Error, Value, RArray, Ruby};
 use ndarray::{Array1, Array2, ArrayView1, Axis};
 use rand::prelude::*;
+use rand::rngs::StdRng;
+use rand::SeedableRng;
+use crate::utils::ruby_array_to_ndarray2;
 mod hdbscan_wrapper;
 pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
     let clustering_module = parent.define_module("Clustering")?;
     clustering_module.define_singleton_method(
         "kmeans_rust",
-        function!(kmeans, 3),
+        function!(kmeans, 4),
     )?;
     clustering_module.define_singleton_method(
         "kmeans_predict_rust",
         function!(kmeans_predict, 2),
     )?;
     // Initialize HDBSCAN functions
     hdbscan_wrapper::init(&clustering_module)?;
     Ok(())
 }
 /// Perform K-means clustering
 /// Returns (labels, centroids, inertia)
-fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64), Error> {
-    // Convert Ruby array to ndarray
-    let rarray: RArray = TryConvert::try_convert(data)?;
-    let n_samples = rarray.len();
-    if n_samples == 0 {
-        return Err(Error::new(
-            magnus::exception::arg_error(),
-            "Data cannot be empty",
-        ));
-    }
-    // Get dimensions
-    let first_row: RArray = rarray.entry::<RArray>(0)?;
-    let n_features = first_row.len();
+fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> Result<(RArray, RArray, f64), Error> {
+    let ruby = Ruby::get().unwrap();
+    // Convert Ruby array to ndarray using shared helper
+    let data_array = ruby_array_to_ndarray2(data)?;
+    let (n_samples, n_features) = data_array.dim();
     if k > n_samples {
         return Err(Error::new(
-            magnus::exception::arg_error(),
+            ruby.exception_arg_error(),
             format!("k ({}) cannot be larger than number of samples ({})", k, n_samples),
         ));
     }
-    // Convert to ndarray
-    let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
-    for i in 0..n_samples {
-        let row: RArray = rarray.entry(i as isize)?;
-        for j in 0..n_features {
-            let val: f64 = row.entry(j as isize)?;
-            data_array[[i, j]] = val;
-        }
-    }
     // Initialize centroids using K-means++
-    let mut centroids = kmeans_plusplus(&data_array, k)?;
+    let mut centroids = kmeans_plusplus(&data_array, k, random_seed)?;
     let mut labels = vec![0usize; n_samples];
     let mut prev_labels = vec![0usize; n_samples];
     // K-means iterations
     for iteration in 0..max_iter {
         // Assign points to nearest centroid
@@ -71,7 +55,7 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
             let point = data_array.row(i);
             let mut min_dist = f64::INFINITY;
             let mut best_cluster = 0;
             for (j, centroid) in centroids.axis_iter(Axis(0)).enumerate() {
                 let dist = euclidean_distance(&point, &centroid);
                 if dist < min_dist {
@@ -79,38 +63,38 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
                     best_cluster = j;
                 }
             }
             if labels[i] != best_cluster {
                 changed = true;
             }
             labels[i] = best_cluster;
         }
         // Check for convergence
         if !changed && iteration > 0 {
             break;
         }
         // Update centroids
         for j in 0..k {
             let mut sum = Array1::<f64>::zeros(n_features);
             let mut count = 0;
             for i in 0..n_samples {
                 if labels[i] == j {
                     sum += &data_array.row(i);
                     count += 1;
                 }
             }
             if count > 0 {
                 centroids.row_mut(j).assign(&(sum / count as f64));
             }
         }
         prev_labels.clone_from(&labels);
     }
     // Calculate inertia (sum of squared distances to nearest centroid)
     let mut inertia = 0.0;
     for i in 0..n_samples {
@@ -118,75 +102,43 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
         let centroid = centroids.row(labels[i]);
         inertia += euclidean_distance(&point, &centroid).powi(2);
     }
     // Convert results to Ruby arrays
-    let ruby = magnus::Ruby::get().unwrap();
-    let labels_array = RArray::new();
+    let labels_array = ruby.ary_new();
     for label in labels {
-        labels_array.push(Integer::from_value(ruby.eval(&format!("{}", label)).unwrap()).unwrap())?;
+        labels_array.push(ruby.integer_from_i64(label as i64))?;
     }
-    let centroids_array = RArray::new();
+    let centroids_array = ruby.ary_new();
     for i in 0..k {
-        let row_array = RArray::new();
+        let row_array = ruby.ary_new();
         for j in 0..n_features {
             row_array.push(centroids[[i, j]])?;
         }
         centroids_array.push(row_array)?;
     }
     Ok((labels_array, centroids_array, inertia))
 }
 /// Predict cluster labels for new data given centroids
 fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
-    // Convert inputs
-    let data_array: RArray = TryConvert::try_convert(data)?;
-    let centroids_array: RArray = TryConvert::try_convert(centroids)?;
-    let n_samples = data_array.len();
-    let k = centroids_array.len();
-    if n_samples == 0 {
-        return Err(Error::new(
-            magnus::exception::arg_error(),
-            "Data cannot be empty",
-        ));
-    }
-    // Get dimensions
-    let first_row: RArray = data_array.entry::<RArray>(0)?;
-    let n_features = first_row.len();
-    // Convert data to ndarray
-    let mut data_matrix = Array2::<f64>::zeros((n_samples, n_features));
-    for i in 0..n_samples {
-        let row: RArray = data_array.entry(i as isize)?;
-        for j in 0..n_features {
-            let val: f64 = row.entry(j as isize)?;
-            data_matrix[[i, j]] = val;
-        }
-    }
-    // Convert centroids to ndarray
-    let mut centroids_matrix = Array2::<f64>::zeros((k, n_features));
-    for i in 0..k {
-        let row: RArray = centroids_array.entry(i as isize)?;
-        for j in 0..n_features {
-            let val: f64 = row.entry(j as isize)?;
-            centroids_matrix[[i, j]] = val;
-        }
-    }
+    let ruby = Ruby::get().unwrap();
+    // Convert inputs using shared helpers
+    let data_matrix = ruby_array_to_ndarray2(data)?;
+    let centroids_matrix = ruby_array_to_ndarray2(centroids)?;
+    let (n_samples, _) = data_matrix.dim();
     // Predict labels
-    let ruby = magnus::Ruby::get().unwrap();
-    let labels_array = RArray::new();
+    let labels_array = ruby.ary_new();
     for i in 0..n_samples {
         let point = data_matrix.row(i);
         let mut min_dist = f64::INFINITY;
         let mut best_cluster = 0;
         for (j, centroid) in centroids_matrix.axis_iter(Axis(0)).enumerate() {
             let dist = euclidean_distance(&point, &centroid);
             if dist < min_dist {
@@ -194,30 +146,37 @@ fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
                 best_cluster = j;
             }
         }
-        labels_array.push(Integer::from_value(ruby.eval(&format!("{}", best_cluster)).unwrap()).unwrap())?;
+        labels_array.push(ruby.integer_from_i64(best_cluster as i64))?;
     }
     Ok(labels_array)
 }
 /// K-means++ initialization
-fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
+fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Result<Array2<f64>, Error> {
     let n_samples = data.nrows();
     let n_features = data.ncols();
-    let mut rng = thread_rng();
+    // Use seeded RNG if seed is provided, otherwise use thread_rng
+    let mut rng: Box<dyn RngCore> = match random_seed {
+        Some(seed) => {
+            let seed_u64 = seed as u64;
+            Box::new(StdRng::seed_from_u64(seed_u64))
+        },
+        None => Box::new(thread_rng()),
+    };
     let mut centroids = Array2::<f64>::zeros((k, n_features));
     // Choose first centroid randomly
     let first_idx = rng.gen_range(0..n_samples);
     centroids.row_mut(0).assign(&data.row(first_idx));
     // Choose remaining centroids
     for i in 1..k {
         let mut distances = vec![f64::INFINITY; n_samples];
-        // Calculate distance to nearest centroid for each point
         for j in 0..n_samples {
             for c in 0..i {
                 let dist = euclidean_distance(&data.row(j), &centroids.row(c));
@@ -226,25 +185,20 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
                 }
             }
         }
-        // Convert distances to probabilities
         let total: f64 = distances.iter().map(|d| d * d).sum();
         if total == 0.0 {
-            // All points are identical or we've selected duplicates
-            // Just use sequential points as centroids
             if i < n_samples {
                 centroids.row_mut(i).assign(&data.row(i));
             } else {
-                // Reuse first point if we run out
                 centroids.row_mut(i).assign(&data.row(0));
             }
             continue;
         }
-        // Choose next centroid with probability proportional to squared distance
         let mut cumsum = 0.0;
         let rand_val: f64 = rng.gen::<f64>() * total;
         for j in 0..n_samples {
             cumsum += distances[j] * distances[j];
             if cumsum >= rand_val {
@@ -253,7 +207,7 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
             }
         }
     }
     Ok(centroids)
 }
@@ -264,4 +218,4 @@ fn euclidean_distance(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
         .map(|(x, y)| (x - y).powi(2))
         .sum::<f64>()
         .sqrt()
-}
+}