RubyGems - clusterkit - Versions diffs - 0.1.0 → 0.1.1 - Mend

clusterkit 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

checksums.yaml +4 -4
data/Cargo.lock +3236 -0
data/README.md +227 -7
data/docs/KNOWN_ISSUES.md +5 -5
data/docs/RUST_ERROR_HANDLING.md +6 -6
data/docs/assets/clusterkit-wide.png +0 -0
data/docs/assets/clusterkit.png +0 -0
data/docs/assets/visualization.png +0 -0
data/ext/clusterkit/Cargo.toml +5 -4
data/ext/clusterkit/extconf.rb +9 -1
data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +27 -62
data/ext/clusterkit/src/clustering.rs +68 -114
data/ext/clusterkit/src/embedder.rs +48 -131
data/ext/clusterkit/src/hnsw.rs +579 -0
data/ext/clusterkit/src/lib.rs +7 -5
data/ext/clusterkit/src/svd.rs +35 -58
data/ext/clusterkit/src/utils.rs +159 -9
data/lib/clusterkit/clustering/hdbscan.rb +4 -17
data/lib/clusterkit/clustering.rb +4 -23
data/lib/clusterkit/data_validator.rb +132 -0
data/lib/clusterkit/dimensionality/pca.rb +12 -12
data/lib/clusterkit/dimensionality/svd.rb +47 -16
data/lib/clusterkit/dimensionality/umap.rb +7 -40
data/lib/clusterkit/hnsw.rb +251 -0
data/lib/clusterkit/version.rb +1 -1
data/lib/clusterkit.rb +2 -1
metadata +40 -20
data/clusterkit.gemspec +0 -45

data/ext/clusterkit/src/svd.rs CHANGED Viewed

@@ -1,112 +1,89 @@
-use magnus::{function, prelude::*, Error, Value, RArray, TryConvert};
+use magnus::{function, prelude::*, Error, Value, RArray, Ruby};
 use annembed::tools::svdapprox::{SvdApprox, RangeApproxMode, RangeRank, MatRepr};
-use ndarray::Array2;
+use crate::utils::ruby_array_to_ndarray2;
 pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
     let svd_module = parent.define_module("SVD")?;
     svd_module.define_singleton_method(
         "randomized_svd_rust",
         function!(randomized_svd, 3),
     )?;
     Ok(())
 }
 fn randomized_svd(matrix: Value, k: usize, n_iter: usize) -> Result<RArray, Error> {
-    // Convert Ruby array to ndarray
-    let rarray: RArray = TryConvert::try_convert(matrix)?;
-    // Check if it's a 2D array
-    let first_row: RArray = rarray.entry::<RArray>(0)?;
-    let n_rows = rarray.len();
-    let n_cols = first_row.len();
-    if n_rows == 0 || n_cols == 0 {
-        return Err(Error::new(
-            magnus::exception::arg_error(),
-            "Matrix cannot be empty",
-        ));
-    }
+    let ruby = Ruby::get().unwrap();
+    // Convert Ruby array to ndarray using shared helper
+    let matrix_data = ruby_array_to_ndarray2(matrix)?;
+    let (n_rows, n_cols) = matrix_data.dim();
     if k > n_rows.min(n_cols) {
         return Err(Error::new(
-            magnus::exception::arg_error(),
+            ruby.exception_arg_error(),
             format!("k ({}) cannot be larger than min(rows, cols) = {}", k, n_rows.min(n_cols)),
         ));
     }
-    // Convert to ndarray Array2
-    let mut matrix_data = Array2::<f64>::zeros((n_rows, n_cols));
-    for i in 0..n_rows {
-        let row: RArray = rarray.entry(i as isize)?;
-        for j in 0..n_cols {
-            let val: f64 = row.entry(j as isize)?;
-            matrix_data[[i, j]] = val;
-        }
-    }
     // Create MatRepr for the full matrix
     let mat_repr = MatRepr::from_array2(matrix_data.clone());
     // Create SvdApprox instance
     let mut svd_approx = SvdApprox::new(&mat_repr);
     // Set up parameters for randomized SVD
-    // Use RANK mode to specify the desired rank
     let params = RangeApproxMode::RANK(RangeRank::new(k, n_iter));
     // Perform SVD
     let svd_result = svd_approx.direct_svd(params)
-        .map_err(|e| Error::new(magnus::exception::runtime_error(), e))?;
-    // Extract U, S, V from the result - they are optional fields
+        .map_err(|e| Error::new(ruby.exception_runtime_error(), e))?;
+    // Extract U, S, V from the result
     let u_matrix = svd_result.u.ok_or_else(|| {
-        Error::new(magnus::exception::runtime_error(), "No U matrix in SVD result")
+        Error::new(ruby.exception_runtime_error(), "No U matrix in SVD result")
     })?;
     let s_values = svd_result.s.ok_or_else(|| {
-        Error::new(magnus::exception::runtime_error(), "No S values in SVD result")
+        Error::new(ruby.exception_runtime_error(), "No S values in SVD result")
     })?;
     let vt_matrix = svd_result.vt.ok_or_else(|| {
-        Error::new(magnus::exception::runtime_error(), "No V^T matrix in SVD result")
+        Error::new(ruby.exception_runtime_error(), "No V^T matrix in SVD result")
     })?;
     // Convert results to Ruby arrays
-    // U matrix - convert ndarray to Ruby nested array
-    let u_ruby = RArray::new();
+    let u_ruby = ruby.ary_new();
     let u_shape = u_matrix.shape();
     for i in 0..u_shape[0] {
-        let row = RArray::new();
+        let row = ruby.ary_new();
         for j in 0..u_shape[1] {
             row.push(u_matrix[[i, j]])?;
         }
         u_ruby.push(row)?;
     }
-    // S values - convert to Ruby array
-    let s_ruby = RArray::new();
+    let s_ruby = ruby.ary_new();
     for val in s_values.iter() {
         s_ruby.push(*val)?;
     }
-    // V matrix (note: we have V^T, so we need to transpose)
-    let v_ruby = RArray::new();
+    let v_ruby = ruby.ary_new();
     let vt_shape = vt_matrix.shape();
     for i in 0..vt_shape[0] {
-        let row = RArray::new();
+        let row = ruby.ary_new();
         for j in 0..vt_shape[1] {
             row.push(vt_matrix[[i, j]])?;
         }
         v_ruby.push(row)?;
     }
     // Return [U, S, V^T] as a Ruby array
-    let result = RArray::new();
+    let result = ruby.ary_new();
     result.push(u_ruby)?;
     result.push(s_ruby)?;
     result.push(v_ruby)?;
     Ok(result)
-}
+}

data/ext/clusterkit/src/utils.rs CHANGED Viewed

@@ -1,33 +1,183 @@
-use magnus::{function, prelude::*, Error, Value};
+use magnus::{function, prelude::*, Error, Value, RArray, TryConvert, Float, Integer, Ruby};
+use ndarray::Array2;
 pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
     let utils_module = parent.define_module("Utils")?;
     utils_module.define_singleton_method(
         "estimate_intrinsic_dimension_rust",
         function!(estimate_intrinsic_dimension, 2),
     )?;
     utils_module.define_singleton_method(
         "estimate_hubness_rust",
         function!(estimate_hubness, 1),
     )?;
     Ok(())
 }
 fn estimate_intrinsic_dimension(_data: Value, _k_neighbors: usize) -> Result<f64, Error> {
-    // TODO: Implement using annembed
+    let ruby = Ruby::get().unwrap();
     Err(Error::new(
-        magnus::exception::not_imp_error(),
+        ruby.exception_not_imp_error(),
         "Dimension estimation not implemented yet",
     ))
 }
 fn estimate_hubness(_data: Value) -> Result<Value, Error> {
-    // TODO: Implement using annembed
+    let ruby = Ruby::get().unwrap();
     Err(Error::new(
-        magnus::exception::not_imp_error(),
+        ruby.exception_not_imp_error(),
         "Hubness estimation not implemented yet",
     ))
-}
+}
+/// Convert Ruby 2D array to ndarray Array2<f64>
+/// Handles validation and provides consistent error messages
+pub fn ruby_array_to_ndarray2(data: Value) -> Result<Array2<f64>, Error> {
+    let ruby = Ruby::get().unwrap();
+    let rarray: RArray = TryConvert::try_convert(data)?;
+    let n_samples = rarray.len();
+    if n_samples == 0 {
+        return Err(Error::new(
+            ruby.exception_arg_error(),
+            "Data cannot be empty",
+        ));
+    }
+    // Get dimensions from first row
+    let first_row: RArray = rarray.entry::<RArray>(0)?;
+    let n_features = first_row.len();
+    if n_features == 0 {
+        return Err(Error::new(
+            ruby.exception_arg_error(),
+            "Data rows cannot be empty",
+        ));
+    }
+    // Create ndarray and populate
+    let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
+    for i in 0..n_samples {
+        let row: RArray = rarray.entry(i as isize)?;
+        // Validate row length consistency
+        if row.len() != n_features {
+            return Err(Error::new(
+                ruby.exception_arg_error(),
+                format!("Row {} has {} elements, expected {}", i, row.len(), n_features),
+            ));
+        }
+        for j in 0..n_features {
+            let val: f64 = row.entry(j as isize)?;
+            data_array[[i, j]] = val;
+        }
+    }
+    Ok(data_array)
+}
+/// Convert Ruby 2D array to Vec<Vec<f64>>
+/// Handles validation and provides consistent error messages
+pub fn ruby_array_to_vec_vec_f64(data: Value) -> Result<Vec<Vec<f64>>, Error> {
+    let ruby = Ruby::get().unwrap();
+    let rarray: RArray = TryConvert::try_convert(data)?;
+    let n_samples = rarray.len();
+    if n_samples == 0 {
+        return Err(Error::new(
+            ruby.exception_arg_error(),
+            "Data cannot be empty",
+        ));
+    }
+    let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
+    let mut expected_features: Option<usize> = None;
+    for i in 0..n_samples {
+        let row: RArray = rarray.entry(i as isize)?;
+        let n_features = row.len();
+        // Check row length consistency
+        match expected_features {
+            Some(expected) => {
+                if n_features != expected {
+                    return Err(Error::new(
+                        ruby.exception_arg_error(),
+                        format!("Row {} has {} elements, expected {}", i, n_features, expected),
+                    ));
+                }
+            }
+            None => expected_features = Some(n_features),
+        }
+        let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
+        for j in 0..n_features {
+            let val: f64 = row.entry(j as isize)?;
+            row_vec.push(val);
+        }
+        data_vec.push(row_vec);
+    }
+    Ok(data_vec)
+}
+/// Convert Ruby 2D array to Vec<Vec<f32>>
+/// For algorithms that require f32 precision (like UMAP)
+pub fn ruby_array_to_vec_vec_f32(data: Value) -> Result<Vec<Vec<f32>>, Error> {
+    let ruby = Ruby::get().unwrap();
+    let rarray: RArray = TryConvert::try_convert(data)?;
+    let array_len = rarray.len();
+    if array_len == 0 {
+        return Err(Error::new(
+            ruby.exception_arg_error(),
+            "Input data cannot be empty",
+        ));
+    }
+    let mut rust_data: Vec<Vec<f32>> = Vec::with_capacity(array_len);
+    for i in 0..array_len {
+        let row = rarray.entry::<Value>(i as isize)?;
+        let row_array = RArray::try_convert(row).map_err(|_| {
+            Error::new(
+                ruby.exception_type_error(),
+                "Expected array of arrays (2D array)",
+            )
+        })?;
+        let mut rust_row: Vec<f32> = Vec::new();
+        let row_len = row_array.len();
+        for j in 0..row_len {
+            let val = row_array.entry::<Value>(j as isize)?;
+            let float_val = if let Ok(f) = Float::try_convert(val) {
+                f.to_f64() as f32
+            } else if let Ok(i) = Integer::try_convert(val) {
+                i.to_i64()? as f32
+            } else {
+                return Err(Error::new(
+                    ruby.exception_type_error(),
+                    "All values must be numeric",
+                ));
+            };
+            rust_row.push(float_val);
+        }
+        // Validate row length consistency
+        if !rust_data.is_empty() && rust_row.len() != rust_data[0].len() {
+            return Err(Error::new(
+                ruby.exception_arg_error(),
+                "All rows must have the same length",
+            ));
+        }
+        rust_data.push(rust_row);
+    }
+    Ok(rust_data)
+}

data/lib/clusterkit/clustering/hdbscan.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # frozen_string_literal: true
+require_relative '../data_validator'
 module ClusterKit
   module Clustering
     # HDBSCAN clustering algorithm - matching KMeans API pattern
@@ -128,23 +130,8 @@ module ClusterKit
       private
       def validate_data(data)
-        # Exact same validation as KMeans for consistency
-        raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
-        raise ArgumentError, "Data cannot be empty" if data.empty?
-        raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
-        row_length = data.first.length
-        unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
-          raise ArgumentError, "All rows must have the same length"
-        end
-        data.each_with_index do |row, i|
-          row.each_with_index do |val, j|
-            unless val.is_a?(Numeric)
-              raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
-            end
-          end
-        end
+        # Use same validation as KMeans for consistency
+        DataValidator.validate_clustering(data, check_finite: false)
       end
     end

data/lib/clusterkit/clustering.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require_relative 'clusterkit'
 require_relative 'clustering/hdbscan'
+require_relative 'data_validator'
 module ClusterKit
   # Module for clustering algorithms
@@ -28,11 +29,8 @@ module ClusterKit
       def fit(data)
         validate_data(data)
-        # Set random seed if provided
-        srand(@random_seed) if @random_seed
-        # Call Rust implementation
-        @labels, @centroids, @inertia = Clustering.kmeans_rust(data, @k, @max_iter)
+        # Call Rust implementation with optional seed
+        @labels, @centroids, @inertia = Clustering.kmeans_rust(data, @k, @max_iter, @random_seed)
         @fitted = true
         self
@@ -132,24 +130,7 @@ module ClusterKit
       private
       def validate_data(data)
-        raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
-        raise ArgumentError, "Data cannot be empty" if data.empty?
-        raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
-        # Check all rows have same length
-        row_length = data.first.length
-        unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
-          raise ArgumentError, "All rows must have the same length"
-        end
-        # Check all values are numeric
-        data.each_with_index do |row, i|
-          row.each_with_index do |val, j|
-            unless val.is_a?(Numeric)
-              raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
-            end
-          end
-        end
+        DataValidator.validate_clustering(data, check_finite: false)
       end
     end

data/lib/clusterkit/data_validator.rb ADDED Viewed

@@ -0,0 +1,132 @@
+# frozen_string_literal: true
+module ClusterKit
+  # Shared data validation methods for all algorithms
+  module DataValidator
+    class << self
+      # Validate basic data structure and types
+      # @param data [Array] Data to validate
+      # @raise [ArgumentError] If data structure is invalid
+      def validate_basic_structure(data)
+        raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
+        raise ArgumentError, "Input cannot be empty" if data.empty?
+        first_row = data.first
+        raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
+      end
+      # Validate row consistency (all rows have same length)
+      # @param data [Array] 2D array to validate
+      # @raise [ArgumentError] If rows have different lengths
+      def validate_row_consistency(data)
+        row_length = data.first.length
+        data.each_with_index do |row, i|
+          unless row.is_a?(Array)
+            raise ArgumentError, "Row #{i} is not an array"
+          end
+          if row.length != row_length
+            raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
+          end
+        end
+      end
+      # Validate that all elements are numeric
+      # @param data [Array] 2D array to validate
+      # @raise [ArgumentError] If any element is not numeric
+      def validate_numeric_types(data)
+        data.each_with_index do |row, i|
+          row.each_with_index do |val, j|
+            unless val.is_a?(Numeric)
+              raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
+            end
+          end
+        end
+      end
+      # Validate finite values (no NaN or Infinite)
+      # @param data [Array] 2D array to validate
+      # @raise [ArgumentError] If any float is NaN or Infinite
+      def validate_finite_values(data)
+        data.each_with_index do |row, i|
+          row.each_with_index do |val, j|
+            # Only check for NaN/Infinite on floats
+            if val.is_a?(Float) && (val.nan? || val.infinite?)
+              raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
+            end
+          end
+        end
+      end
+      # Standard validation for most algorithms
+      # @param data [Array] 2D array to validate
+      # @param check_finite [Boolean] Whether to check for NaN/Infinite values
+      # @raise [ArgumentError] If data is invalid
+      def validate_standard(data, check_finite: true)
+        validate_basic_structure(data)
+        validate_row_consistency(data)
+        validate_numeric_types(data)
+        validate_finite_values(data) if check_finite
+      end
+      # Validation for clustering algorithms (KMeans, HDBSCAN) with specific error messages
+      # @param data [Array] 2D array to validate
+      # @param check_finite [Boolean] Whether to check for NaN/Infinite values
+      # @raise [ArgumentError] If data is invalid
+      def validate_clustering(data, check_finite: false)
+        raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
+        raise ArgumentError, "Data cannot be empty" if data.empty?
+        raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
+        validate_row_consistency(data)
+        validate_numeric_types(data)
+        validate_finite_values(data) if check_finite
+      end
+      # Validation for PCA with specific error messages (same as clustering but without finite checks)
+      # @param data [Array] 2D array to validate
+      # @raise [ArgumentError] If data is invalid
+      def validate_pca(data)
+        raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
+        raise ArgumentError, "Data cannot be empty" if data.empty?
+        raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
+        validate_row_consistency(data)
+        validate_numeric_types(data)
+      end
+      # Get data statistics for warnings/error context
+      # @param data [Array] 2D array
+      # @return [Hash] Statistics about the data
+      def data_statistics(data)
+        return { n_samples: 0, n_features: 0, data_range: 0.0 } if data.empty?
+        n_samples = data.size
+        n_features = data.first&.size || 0
+        # Calculate data range for warnings
+        min_val = Float::INFINITY
+        max_val = -Float::INFINITY
+        data.each do |row|
+          row.each do |val|
+            val_f = val.to_f
+            min_val = val_f if val_f < min_val
+            max_val = val_f if val_f > max_val
+          end
+        end
+        data_range = max_val - min_val
+        {
+          n_samples: n_samples,
+          n_features: n_features,
+          data_range: data_range,
+          min_value: min_val,
+          max_value: max_val
+        }
+      end
+    end
+  end
+end

data/lib/clusterkit/dimensionality/pca.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require_relative '../clusterkit'
 require_relative 'svd'
+require_relative '../data_validator'
 module ClusterKit
   module Dimensionality
@@ -30,7 +31,7 @@ module ClusterKit
       # Perform SVD on centered data
       # U contains the transformed data, S contains singular values, VT contains components
-      u, s, vt = ClusterKit.svd(centered_data, @n_components, n_iter: 5)
+      u, s, vt = perform_svd(centered_data)
       # Store the principal components (eigenvectors)
       @components = vt  # Shape: (n_components, n_features)
@@ -76,7 +77,7 @@ module ClusterKit
       centered_data = center_data(data, @mean)
       # Perform SVD on centered data
-      u, s, vt = SVD.randomized_svd(centered_data, @n_components, n_iter: 5)
+      u, s, vt = perform_svd(centered_data)
       # Store the principal components (eigenvectors)
       @components = vt
@@ -166,17 +167,10 @@ module ClusterKit
     private
     def validate_data(data)
-      raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
-      raise ArgumentError, "Data cannot be empty" if data.empty?
-      raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
-      # Check all rows have same length
-      row_length = data.first.length
-      unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
-        raise ArgumentError, "All rows must have the same length"
-      end
+      # Use shared validation for common checks
+      DataValidator.validate_pca(data)
-      # Check we have enough samples for n_components
+      # PCA-specific validations
       if data.size < @n_components
         raise ArgumentError, "n_components (#{@n_components}) cannot be larger than n_samples (#{data.size})"
       end
@@ -237,6 +231,12 @@ module ClusterKit
       transformed
     end
+    # Shared SVD computation for both fit and fit_transform
+    # Ensures both methods use identical SVD invocation and parameters
+    def perform_svd(centered_data)
+      SVD.randomized_svd(centered_data, @n_components, n_iter: 5)
+    end
   end
   # Module-level convenience method