RubyGems - clusterkit - Versions diffs - 0.3.0-arm64-darwin - Mend

clusterkit 0.3.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.simplecov +47 -0
data/CHANGELOG.md +35 -0
data/CLAUDE.md +226 -0
data/Cargo.lock +3228 -0
data/Cargo.toml +8 -0
data/Gemfile +17 -0
data/IMPLEMENTATION_NOTES.md +143 -0
data/LICENSE.txt +21 -0
data/PYTHON_COMPARISON.md +183 -0
data/README.md +744 -0
data/Rakefile +259 -0
data/docs/KNOWN_ISSUES.md +130 -0
data/docs/RUST_ERROR_HANDLING.md +164 -0
data/docs/TEST_FIXTURES.md +170 -0
data/docs/UMAP_EXPLAINED.md +362 -0
data/docs/UMAP_TROUBLESHOOTING.md +284 -0
data/docs/VERBOSE_OUTPUT.md +84 -0
data/docs/assets/clusterkit-wide.png +0 -0
data/docs/assets/clusterkit.png +0 -0
data/docs/assets/visualization.png +0 -0
data/examples/hdbscan_example.rb +147 -0
data/examples/optimal_kmeans_example.rb +96 -0
data/examples/pca_example.rb +114 -0
data/examples/reproducible_umap.rb +99 -0
data/examples/verbose_control.rb +43 -0
data/ext/clusterkit/Cargo.toml +26 -0
data/ext/clusterkit/extconf.rb +23 -0
data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +80 -0
data/ext/clusterkit/src/clustering.rs +221 -0
data/ext/clusterkit/src/embedder.rs +349 -0
data/ext/clusterkit/src/hnsw.rs +579 -0
data/ext/clusterkit/src/lib.rs +24 -0
data/ext/clusterkit/src/svd.rs +89 -0
data/ext/clusterkit/src/tests.rs +16 -0
data/ext/clusterkit/src/utils.rs +183 -0
data/lib/clusterkit/3.1/clusterkit.bundle +0 -0
data/lib/clusterkit/3.2/clusterkit.bundle +0 -0
data/lib/clusterkit/3.3/clusterkit.bundle +0 -0
data/lib/clusterkit/3.4/clusterkit.bundle +0 -0
data/lib/clusterkit/clustering/hdbscan.rb +164 -0
data/lib/clusterkit/clustering.rb +194 -0
data/lib/clusterkit/clusterkit.rb +14 -0
data/lib/clusterkit/configuration.rb +24 -0
data/lib/clusterkit/data_validator.rb +132 -0
data/lib/clusterkit/dimensionality/pca.rb +251 -0
data/lib/clusterkit/dimensionality/svd.rb +175 -0
data/lib/clusterkit/dimensionality/umap.rb +282 -0
data/lib/clusterkit/dimensionality.rb +29 -0
data/lib/clusterkit/hdbscan_api_design.rb +142 -0
data/lib/clusterkit/hnsw.rb +251 -0
data/lib/clusterkit/preprocessing.rb +106 -0
data/lib/clusterkit/silence.rb +42 -0
data/lib/clusterkit/utils.rb +51 -0
data/lib/clusterkit/version.rb +5 -0
data/lib/clusterkit.rb +105 -0
data/lib/tasks/visualize.rake +641 -0
metadata +214 -0

data/lib/clusterkit/data_validator.rb ADDED Viewed

@@ -0,0 +1,132 @@
+# frozen_string_literal: true
+module ClusterKit
+  # Shared data validation methods for all algorithms
+  module DataValidator
+    class << self
+      # Validate basic data structure and types
+      # @param data [Array] Data to validate
+      # @raise [ArgumentError] If data structure is invalid
+      def validate_basic_structure(data)
+        raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
+        raise ArgumentError, "Input cannot be empty" if data.empty?
+        first_row = data.first
+        raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
+      end
+      # Validate row consistency (all rows have same length)
+      # @param data [Array] 2D array to validate
+      # @raise [ArgumentError] If rows have different lengths
+      def validate_row_consistency(data)
+        row_length = data.first.length
+        data.each_with_index do |row, i|
+          unless row.is_a?(Array)
+            raise ArgumentError, "Row #{i} is not an array"
+          end
+          if row.length != row_length
+            raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
+          end
+        end
+      end
+      # Validate that all elements are numeric
+      # @param data [Array] 2D array to validate
+      # @raise [ArgumentError] If any element is not numeric
+      def validate_numeric_types(data)
+        data.each_with_index do |row, i|
+          row.each_with_index do |val, j|
+            unless val.is_a?(Numeric)
+              raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
+            end
+          end
+        end
+      end
+      # Validate finite values (no NaN or Infinite)
+      # @param data [Array] 2D array to validate
+      # @raise [ArgumentError] If any float is NaN or Infinite
+      def validate_finite_values(data)
+        data.each_with_index do |row, i|
+          row.each_with_index do |val, j|
+            # Only check for NaN/Infinite on floats
+            if val.is_a?(Float) && (val.nan? || val.infinite?)
+              raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
+            end
+          end
+        end
+      end
+      # Standard validation for most algorithms
+      # @param data [Array] 2D array to validate
+      # @param check_finite [Boolean] Whether to check for NaN/Infinite values
+      # @raise [ArgumentError] If data is invalid
+      def validate_standard(data, check_finite: true)
+        validate_basic_structure(data)
+        validate_row_consistency(data)
+        validate_numeric_types(data)
+        validate_finite_values(data) if check_finite
+      end
+      # Validation for clustering algorithms (KMeans, HDBSCAN) with specific error messages
+      # @param data [Array] 2D array to validate
+      # @param check_finite [Boolean] Whether to check for NaN/Infinite values
+      # @raise [ArgumentError] If data is invalid
+      def validate_clustering(data, check_finite: false)
+        raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
+        raise ArgumentError, "Data cannot be empty" if data.empty?
+        raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
+        validate_row_consistency(data)
+        validate_numeric_types(data)
+        validate_finite_values(data) if check_finite
+      end
+      # Validation for PCA with specific error messages (same as clustering but without finite checks)
+      # @param data [Array] 2D array to validate
+      # @raise [ArgumentError] If data is invalid
+      def validate_pca(data)
+        raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
+        raise ArgumentError, "Data cannot be empty" if data.empty?
+        raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
+        validate_row_consistency(data)
+        validate_numeric_types(data)
+      end
+      # Get data statistics for warnings/error context
+      # @param data [Array] 2D array
+      # @return [Hash] Statistics about the data
+      def data_statistics(data)
+        return { n_samples: 0, n_features: 0, data_range: 0.0 } if data.empty?
+        n_samples = data.size
+        n_features = data.first&.size || 0
+        # Calculate data range for warnings
+        min_val = Float::INFINITY
+        max_val = -Float::INFINITY
+        data.each do |row|
+          row.each do |val|
+            val_f = val.to_f
+            min_val = val_f if val_f < min_val
+            max_val = val_f if val_f > max_val
+          end
+        end
+        data_range = max_val - min_val
+        {
+          n_samples: n_samples,
+          n_features: n_features,
+          data_range: data_range,
+          min_value: min_val,
+          max_value: max_val
+        }
+      end
+    end
+  end
+end

data/lib/clusterkit/dimensionality/pca.rb ADDED Viewed

@@ -0,0 +1,251 @@
+# frozen_string_literal: true
+require_relative '../clusterkit'
+require_relative 'svd'
+require_relative '../data_validator'
+module ClusterKit
+  module Dimensionality
+    # Principal Component Analysis using SVD
+    # PCA is a linear dimensionality reduction technique that finds
+    # the directions of maximum variance in the data
+    class PCA
+    attr_reader :n_components, :components, :explained_variance, :explained_variance_ratio, :mean
+    # Initialize PCA
+    # @param n_components [Integer] Number of principal components to keep
+    def initialize(n_components: 2)
+      @n_components = n_components
+      @fitted = false
+    end
+    # Fit the PCA model
+    # @param data [Array] 2D array of data points (n_samples × n_features)
+    # @return [self] Returns self for method chaining
+    def fit(data)
+      validate_data(data)
+      # Center the data (subtract mean from each feature)
+      @mean = calculate_mean(data)
+      centered_data = center_data(data, @mean)
+      # Perform SVD on centered data
+      # U contains the transformed data, S contains singular values, VT contains components
+      u, s, vt = perform_svd(centered_data)
+      # Store the principal components (eigenvectors)
+      @components = vt  # Shape: (n_components, n_features)
+      # Store singular values for consistency
+      @singular_values = s
+      # Calculate explained variance (eigenvalues)
+      n_samples = data.size.to_f
+      @explained_variance = s.map { |val| (val ** 2) / (n_samples - 1) }
+      # Calculate explained variance ratio
+      total_variance = calculate_total_variance(centered_data, n_samples)
+      @explained_variance_ratio = @explained_variance.map { |var| var / total_variance }
+      @fitted = true
+      self
+    end
+    # Transform data using the fitted PCA model
+    # @param data [Array] 2D array of data points
+    # @return [Array] Transformed data in principal component space
+    def transform(data)
+      raise RuntimeError, "Model must be fitted before transform" unless fitted?
+      validate_data(data)
+      # Center the data using the stored mean
+      centered_data = center_data(data, @mean)
+      # Project onto principal components
+      # Result = centered_data × components.T
+      project_data(centered_data, @components)
+    end
+    # Fit the model and transform the data in one step
+    # @param data [Array] 2D array of data points
+    # @return [Array] Transformed data
+    def fit_transform(data)
+      validate_data(data)
+      # Center the data (subtract mean from each feature)
+      @mean = calculate_mean(data)
+      centered_data = center_data(data, @mean)
+      # Perform SVD on centered data
+      u, s, vt = perform_svd(centered_data)
+      # Store the principal components (eigenvectors)
+      @components = vt
+      # Store singular values for later use
+      @singular_values = s
+      # Calculate explained variance (eigenvalues)
+      n_samples = data.size.to_f
+      @explained_variance = s.map { |val| (val ** 2) / (n_samples - 1) }
+      # Calculate explained variance ratio
+      total_variance = calculate_total_variance(centered_data, n_samples)
+      @explained_variance_ratio = @explained_variance.map { |var| var / total_variance }
+      @fitted = true
+      # For PCA, the transformed data is U * S
+      # Scale U by singular values
+      transformed = []
+      u.each do |row|
+        scaled_row = row.each_with_index.map { |val, i| val * s[i] }
+        transformed << scaled_row
+      end
+      transformed
+    end
+    # Inverse transform - reconstruct data from principal components
+    # @param data [Array] Transformed data in PC space
+    # @return [Array] Reconstructed data in original space
+    def inverse_transform(data)
+      raise RuntimeError, "Model must be fitted before inverse_transform" unless fitted?
+      # Reconstruct: data × components + mean
+      reconstructed = []
+      data.each do |sample|
+        reconstructed_sample = Array.new(@mean.size, 0.0)
+        sample.each_with_index do |value, i|
+          @components[i].each_with_index do |comp_val, j|
+            reconstructed_sample[j] += value * comp_val
+          end
+        end
+        # Add back the mean
+        reconstructed_sample = reconstructed_sample.zip(@mean).map { |r, m| r + m }
+        reconstructed << reconstructed_sample
+      end
+      reconstructed
+    end
+    # Get the amount of variance explained by each component
+    # @return [Array] Explained variance for each component
+    def explained_variance
+      raise RuntimeError, "Model must be fitted first" unless fitted?
+      @explained_variance
+    end
+    # Get the percentage of variance explained by each component
+    # @return [Array] Explained variance ratio for each component
+    def explained_variance_ratio
+      raise RuntimeError, "Model must be fitted first" unless fitted?
+      @explained_variance_ratio
+    end
+    # Get cumulative explained variance ratio
+    # @return [Array] Cumulative sum of explained variance ratios
+    def cumulative_explained_variance_ratio
+      raise RuntimeError, "Model must be fitted first" unless fitted?
+      cumsum = []
+      sum = 0.0
+      @explained_variance_ratio.each do |ratio|
+        sum += ratio
+        cumsum << sum
+      end
+      cumsum
+    end
+    # Check if model has been fitted
+    # @return [Boolean] True if fitted
+    def fitted?
+      @fitted
+    end
+    private
+    def validate_data(data)
+      # Use shared validation for common checks
+      DataValidator.validate_pca(data)
+      # PCA-specific validations
+      if data.size < @n_components
+        raise ArgumentError, "n_components (#{@n_components}) cannot be larger than n_samples (#{data.size})"
+      end
+      if data.first.size < @n_components
+        raise ArgumentError, "n_components (#{@n_components}) cannot be larger than n_features (#{data.first.size})"
+      end
+    end
+    def calculate_mean(data)
+      n_features = data.first.size
+      mean = Array.new(n_features, 0.0)
+      data.each do |row|
+        row.each_with_index do |val, i|
+          mean[i] += val
+        end
+      end
+      mean.map { |sum| sum / data.size.to_f }
+    end
+    def center_data(data, mean)
+      data.map do |row|
+        row.zip(mean).map { |val, m| val - m }
+      end
+    end
+    def calculate_total_variance(centered_data, n_samples)
+      total_var = 0.0
+      centered_data.each do |row|
+        row.each do |val|
+          total_var += val ** 2
+        end
+      end
+      total_var / (n_samples - 1)
+    end
+    def project_data(centered_data, components)
+      # Matrix multiplication: centered_data × components.T
+      transformed = []
+      centered_data.each do |sample|
+        projected = Array.new(@n_components, 0.0)
+        components.each_with_index do |component, i|
+          dot_product = 0.0
+          sample.each_with_index do |val, j|
+            dot_product += val * component[j]
+          end
+          projected[i] = dot_product
+        end
+        transformed << projected
+      end
+      transformed
+    end
+    # Shared SVD computation for both fit and fit_transform
+    # Ensures both methods use identical SVD invocation and parameters
+    def perform_svd(centered_data)
+      SVD.randomized_svd(centered_data, @n_components, n_iter: 5)
+    end
+  end
+  # Module-level convenience method
+  # @param data [Array] 2D array of data points
+  # @param n_components [Integer] Number of components
+  # @return [Array] Transformed data
+  def self.pca(data, n_components: 2)
+    pca = PCA.new(n_components: n_components)
+    pca.fit_transform(data)
+    end
+  end
+end

data/lib/clusterkit/dimensionality/svd.rb ADDED Viewed

@@ -0,0 +1,175 @@
+# frozen_string_literal: true
+require_relative '../clusterkit'
+require_relative '../data_validator'
+module ClusterKit
+  module Dimensionality
+    # Singular Value Decomposition
+    # Decomposes a matrix into U, S, V^T components
+    class SVD
+      attr_reader :n_components, :n_iter, :random_seed
+      attr_reader :u, :s, :vt, :n_features
+      # Initialize a new SVD instance
+      # @param n_components [Integer] Number of components to compute
+      # @param n_iter [Integer] Number of iterations for randomized algorithm (default: 2)
+      # @param random_seed [Integer, nil] Random seed for reproducibility
+      def initialize(n_components: nil, n_iter: 2, random_seed: nil)
+        @n_components = n_components
+        @n_iter = n_iter
+        @random_seed = random_seed
+        @fitted = false
+      end
+      # Fit the model and transform data in one step
+      # @param data [Array<Array<Numeric>>] Input data
+      # @return [Array] Returns [U, S, Vt] matrices
+      def fit_transform(data)
+        validate_input(data)
+        # Store data characteristics for later transform operations
+        @n_features = data.first.size
+        @original_data_id = data.object_id
+        # Determine n_components if not set
+        n_comp = @n_components || [data.size, data.first.size].min
+        # Call the Rust implementation
+        @u, @s, @vt = self.class.randomized_svd(data, n_comp, n_iter: @n_iter)
+        @fitted = true
+        [@u, @s, @vt]
+      end
+      # Fit the model to data
+      # @param data [Array<Array<Numeric>>] Input data
+      # @return [self]
+      def fit(data)
+        fit_transform(data)
+        self
+      end
+      # Get the U matrix (left singular vectors)
+      # @return [Array<Array<Float>>] U matrix
+      def components_u
+        raise RuntimeError, "Model must be fitted first" unless fitted?
+        @u
+      end
+      # Get the singular values
+      # @return [Array<Float>] Singular values
+      def singular_values
+        raise RuntimeError, "Model must be fitted first" unless fitted?
+        @s
+      end
+      # Get the V^T matrix (right singular vectors, transposed)
+      # @return [Array<Array<Float>>] V^T matrix
+      def components_vt
+        raise RuntimeError, "Model must be fitted first" unless fitted?
+        @vt
+      end
+      # Check if the model has been fitted
+      # @return [Boolean]
+      def fitted?
+        @fitted
+      end
+      # Transform data using fitted SVD (project onto components)
+      # @param data [Array<Array<Numeric>>] Data to transform
+      # @return [Array<Array<Float>>] Transformed data projected onto SVD components
+      def transform(data)
+        raise RuntimeError, "Model must be fitted first" unless fitted?
+        validate_transform_input(data)
+        if data.object_id == @original_data_id
+          # Same data that was fitted - return U * S
+          @u.map.with_index do |row, i|
+            row.map.with_index { |val, j| val * @s[j] }
+          end
+        else
+          # New data - project onto V components: data × V
+          # Since we have V^T, we need to transpose it back to V
+          # V = V^T^T, so we project: data × V^T^T
+          transform_new_data(data)
+        end
+      end
+      # Inverse transform (reconstruct from components)
+      # @param transformed_data [Array<Array<Float>>] Transformed data
+      # @return [Array<Array<Float>>] Reconstructed data
+      def inverse_transform(transformed_data)
+        raise RuntimeError, "Model must be fitted first" unless fitted?
+        # Reconstruction: (U * S) * V^T
+        # transformed_data should be U * S
+        # We multiply by V^T to reconstruct
+        result = []
+        transformed_data.each do |row|
+          reconstructed = Array.new(@vt.first.size, 0.0)
+          row.each_with_index do |val, i|
+            @vt[i].each_with_index do |v, j|
+              reconstructed[j] += val * v
+            end
+          end
+          result << reconstructed
+        end
+        result
+      end
+      # Class method for randomized SVD (kept for compatibility)
+      # @param matrix [Array<Array<Numeric>>] Input matrix
+      # @param k [Integer] Number of components
+      # @param n_iter [Integer] Number of iterations
+      # @return [Array] Returns [U, S, Vt]
+      def self.randomized_svd(matrix, k, n_iter: 2)
+        ::ClusterKit::SVD.randomized_svd_rust(matrix, k, n_iter)
+      end
+      private
+      def validate_input(data)
+        DataValidator.validate_standard(data, check_finite: false)
+      end
+      def validate_transform_input(data)
+        DataValidator.validate_standard(data, check_finite: false)
+        # Check feature count matches training data
+        if data.first.size != @n_features
+          raise ArgumentError, "New data has #{data.first.size} features, but model was fitted with #{@n_features} features"
+        end
+      end
+      # Transform new data by projecting onto V components
+      # Mathematical operation: new_data × V, where V = V^T^T
+      def transform_new_data(data)
+        # V^T is stored as @vt (shape: n_components × n_features)
+        # We need V (shape: n_features × n_components)
+        # V = V^T^T, so we transpose @vt
+        result = []
+        data.each do |sample|
+          # Project sample onto each component (column of V = row of V^T)
+          projected = Array.new(@vt.size, 0.0)
+          @vt.each_with_index do |vt_row, comp_idx|
+            # Dot product: sample · vt_row (this is sample · V[:, comp_idx])
+            dot_product = 0.0
+            sample.each_with_index do |val, feat_idx|
+              dot_product += val * vt_row[feat_idx]
+            end
+            projected[comp_idx] = dot_product
+          end
+          result << projected
+        end
+        result
+      end
+    end
+  end
+end