RubyGems - clusterkit - Versions diffs - 0.1.0.pre.1 → 0.1.0.pre.2 - Mend

clusterkit 0.1.0.pre.1 → 0.1.0.pre.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml +4 -4
data/README.md +25 -0
data/lib/clusterkit/dimensionality/umap.rb +68 -64
data/lib/clusterkit/version.rb +1 -1
data/lib/clusterkit.rb +6 -8
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fd025da9b7f5c97e370d05fb1062484cb99b0aaaa4a7c310eb27df78336c91b4
-  data.tar.gz: 7665cf847930bc47cb04adc1f466ba09ea33c14da5f242434b08493047945f91
+  metadata.gz: 502f5f9d65ce255e4f075ca506ec0a20fddc27f14cce2df67947055e3b92801e
+  data.tar.gz: e1f26d070737d8fb4e15eb5ed9231eda7bd7e30473e883837ddfd7c780d54645
 SHA512:
-  metadata.gz: a8db6d4738ad99a20887aef90398d4163bdd8bc47bbdb8dda74496adc105602051999e50d2bc6003b4981d16b8121151d71cf55618691262b4a092f3c46a2545
-  data.tar.gz: 1a6e43a00d19d7fdaf35be6deffa5215e994a1f013b140aa23ac9bbb9d982facde94d710a894cba39e1fbd39bccd7c020b86a99e9d32d2aa256f717844623e5e
+  metadata.gz: bccf375389b9c0b98a1426470830b69675b3f7be15ae691366433eaa313270980e71d6232812fb1a2b823a5f4a1370fd9d62e1d54b0b150c491dda19d0c41d63
+  data.tar.gz: 5478910b6c42c50aed1f9e8184638a4e0a3903c34136c724f25c2a531385d89f24d599c4b649b6e569a5e043dd2d9a36dfbce5d796d8e42369613831e38dbe0e

data/README.md CHANGED Viewed

@@ -38,6 +38,22 @@ This gem would not be possible without these foundational libraries. Please cons
   - Comparison of different algorithms
   - Built-in rake tasks for quick experimentation
+## API Structure
+ClusterKit organizes its functionality into clear modules:
+- **`ClusterKit::Dimensionality`** - All dimensionality reduction algorithms
+  - `ClusterKit::Dimensionality::UMAP` - UMAP implementation
+  - `ClusterKit::Dimensionality::PCA` - PCA implementation
+  - `ClusterKit::Dimensionality::SVD` - SVD implementation
+- **`ClusterKit::Clustering`** - All clustering algorithms
+  - `ClusterKit::Clustering::KMeans` - K-means clustering
+  - `ClusterKit::Clustering::HDBSCAN` - HDBSCAN clustering
+- **`ClusterKit::Utils`** - Utility functions
+- **`ClusterKit::Preprocessing`** - Data preprocessing tools
+All user-facing classes are in these modules. Implementation details are kept private.
 ## Installation
 Add this line to your application's Gemfile:
@@ -222,6 +238,15 @@ test_data = all_data[150..-1]       # Last 50 samples for testing
 umap.fit(training_data)
 test_embedded = umap.transform(test_data)
+# Save and load fitted models
+umap.save_model("umap_model.bin")                  # Save the fitted model
+loaded_umap = ClusterKit::Dimensionality::UMAP.load_model("umap_model.bin")  # Load it later
+new_data_embedded = loaded_umap.transform(new_data)  # Use loaded model for new data
+# Save and load transformed data (useful for caching results)
+ClusterKit::Dimensionality::UMAP.save_data(embedded, "embeddings.json")
+cached_embeddings = ClusterKit::Dimensionality::UMAP.load_data("embeddings.json")
 # Note: The library automatically adjusts n_neighbors if it's too large for your dataset
 ```

data/lib/clusterkit/dimensionality/umap.rb CHANGED Viewed

@@ -9,7 +9,7 @@ module ClusterKit
   module Dimensionality
     class UMAP
     attr_reader :n_components, :n_neighbors, :random_seed, :nb_grad_batch, :nb_sampling_by_edge
     # Initialize a new UMAP instance
     # @param n_components [Integer] Target number of dimensions (default: 2)
     # @param n_neighbors [Integer] Number of neighbors for manifold approximation (default: 15)
@@ -18,7 +18,7 @@ module ClusterKit
     #                                Controls training iterations - lower = faster but less accurate
     # @param nb_sampling_by_edge [Integer] Number of negative samples per edge (default: 8)
     #                                      Controls sampling quality - lower = faster but less accurate
-    def initialize(n_components: 2, n_neighbors: 15, random_seed: nil,
+    def initialize(n_components: 2, n_neighbors: 15, random_seed: nil,
                    nb_grad_batch: 10, nb_sampling_by_edge: 8)
       @n_components = n_components
       @n_neighbors = n_neighbors
@@ -29,21 +29,21 @@ module ClusterKit
       # Don't create RustUMAP yet - will be created in fit/fit_transform with adjusted parameters
       @rust_umap = nil
     end
     # Fit the model to the data (training)
     # @param data [Array<Array<Numeric>>] Training data as 2D array
     # @return [self] Returns self for method chaining
     # @note UMAP's training process inherently produces embeddings. Since the
-    #       underlying Rust implementation doesn't separate training from
+    #       underlying Rust implementation doesn't separate training from
     #       transformation, we call fit_transform but discard the embeddings.
     #       Use fit_transform if you need both training and the transformed data.
     def fit(data)
       validate_input(data)
       # Always recreate RustUMAP for fit to ensure fresh fit
       @rust_umap = nil
       create_rust_umap_with_adjusted_params(data)
       # UMAP doesn't separate training from transformation internally,
       # so we call fit_transform but discard the result
       begin
@@ -59,7 +59,7 @@ module ClusterKit
         handle_umap_error(RuntimeError.new(e.message), data)
       end
     end
     # Transform data using the fitted model
     # @param data [Array<Array<Numeric>>] Data to transform
     # @return [Array<Array<Float>>] Transformed data in reduced dimensions
@@ -71,17 +71,17 @@ module ClusterKit
         @rust_umap.transform(data)
       end
     end
     # Fit the model and transform the data in one step
     # @param data [Array<Array<Numeric>>] Training data as 2D array
     # @return [Array<Array<Float>>] Transformed data in reduced dimensions
     def fit_transform(data)
       validate_input(data)
       # Always recreate RustUMAP for fit_transform to ensure fresh fit
       @rust_umap = nil
       create_rust_umap_with_adjusted_params(data)
       begin
         result = Silence.maybe_silence do
           @rust_umap.fit_transform(data)
@@ -95,36 +95,36 @@ module ClusterKit
         handle_umap_error(RuntimeError.new(e.message), data)
       end
     end
     # Check if the model has been fitted
     # @return [Boolean] true if model is fitted, false otherwise
     def fitted?
       # Coerce to a strict boolean: an instance variable that was never
       # assigned reads as nil, which would contradict the documented return type.
       !!@fitted
     end
     # Save the fitted model to a file
     # @param path [String] Path where to save the model
     # @raise [RuntimeError] If model hasn't been fitted yet
-    def save(path)
+    def save_model(path)
       raise RuntimeError, "No model to save. Call fit or fit_transform first." unless fitted?
       # Ensure directory exists
       dir = File.dirname(path)
       FileUtils.mkdir_p(dir) unless dir == '.' || dir == '/'
       @rust_umap.save_model(path)
     end
     # Load a fitted model from a file
     # @param path [String] Path to the saved model
     # @return [UMAP] A new UMAP instance with the loaded model
     # @raise [ArgumentError] If file doesn't exist
-    def self.load(path)
+    def self.load_model(path)
       raise ArgumentError, "File not found: #{path}" unless File.exist?(path)
-      # Load the Rust model
-      rust_umap = ::ClusterKit::RustUMAP.load_model(path)
+      # Load the Rust model (access private constant)
+      rust_umap = ::ClusterKit.const_get(:RustUMAP).load_model(path)
       # Create a new UMAP instance with the loaded model
       instance = allocate
       instance.instance_variable_set(:@rust_umap, rust_umap)
@@ -133,172 +133,176 @@ module ClusterKit
       instance.instance_variable_set(:@n_components, nil)
       instance.instance_variable_set(:@n_neighbors, nil)
       instance.instance_variable_set(:@random_seed, nil)
       instance
     end
-    # Export transformed data to JSON (utility method for caching)
-    # @param data [Array<Array<Float>>] Transformed data to export
+    # Save transformed data to JSON file
+    # @param data [Array<Array<Float>>] Transformed data to save
     # @param path [String] Path where to save the data
-    def self.export_data(data, path)
+    def self.save_data(data, path)
+      FileUtils.mkdir_p(File.dirname(path)) unless File.dirname(path) == '.'
       File.write(path, JSON.pretty_generate(data))
     end
-    # Import transformed data from JSON (utility method for caching)
+    # Load transformed data from JSON file
     # @param path [String] Path to the saved data
     # @return [Array<Array<Float>>] The loaded data
-    def self.import_data(path)
+    # @raise [ArgumentError] If file doesn't exist
+    def self.load_data(path)
+      raise ArgumentError, "File not found: #{path}" unless File.exist?(path)
       JSON.parse(File.read(path))
     end
     private
     def handle_umap_error(error, data)
       error_msg = error.message
       n_samples = data.size
       case error_msg
       when /isolated point/i, /graph will not be connected/i
         raise ::ClusterKit::IsolatedPointError, <<~MSG
           UMAP found isolated points in your data that are too far from other points.
           This typically happens when:
           • Your data contains outliers that are very different from other points
           • You're using random data without inherent structure
           • The n_neighbors parameter (#{@n_neighbors}) is too high for your data distribution
           Solutions:
           1. Reduce n_neighbors (try 5 or even 3): UMAP.new(n_neighbors: 5)
           2. Remove outliers from your data before applying UMAP
           3. Ensure your data has some structure (not purely random)
           4. For small datasets (< 50 points), consider using PCA instead
           Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
         MSG
       when /assertion failed.*box_size/i
         raise ::ClusterKit::ConvergenceError, <<~MSG
           UMAP failed to converge due to numerical instability in your data.
           This typically happens when:
           • Data points are too spread out or have extreme values
           • The scale of different features varies wildly
           • There are duplicate or nearly-duplicate points
           Solutions:
           1. Normalize your data first: ClusterKit::Preprocessing.normalize(data)
-          2. Use a smaller n_neighbors value: UMAP.new(n_neighbors: 5)
+          2. Use a smaller n_neighbors value: UMAP.new(n_neighbors: 5)
           3. Check for and remove duplicate points
           4. Scale your data to a reasonable range (e.g., 0-1 or -1 to 1)
           Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
         MSG
       when /n_neighbors.*larger than/i, /too many neighbors/i
         raise ::ClusterKit::InvalidParameterError, <<~MSG
           The n_neighbors parameter (#{@n_neighbors}) is too large for your dataset size (#{n_samples}).
           UMAP needs n_neighbors to be less than the number of samples.
           Suggested value: #{[5, (n_samples * 0.1).to_i].max}
           This should have been auto-adjusted. If you're seeing this error, please report it.
         MSG
       else
         # For unknown errors, still provide some guidance
         raise ::ClusterKit::Error, <<~MSG
           UMAP encountered an error: #{error_msg}
           Common solutions:
           1. Try reducing n_neighbors (current: #{@n_neighbors})
           2. Normalize your data first
           3. Check for NaN or infinite values in your data
           4. Ensure you have at least 10 data points
           If this persists, consider using PCA for dimensionality reduction instead.
         MSG
       end
     end
     # Validate that +data+ is a non-empty, rectangular 2D array of finite numbers.
     # Also emits a warning (via Kernel#warn) when the spread between the smallest
     # and largest value exceeds 1000.
     # @param data [Array<Array<Numeric>>] Candidate input rows
     # @param check_min_samples [Boolean] When true (default), additionally require
     #                                    at least 10 rows
     # @return [void]
     # @raise [ArgumentError] If data is not an array, is empty, is not 2D, has
     #                        zero-length or ragged rows, or contains non-numeric,
     #                        NaN or infinite values
     # @raise [ClusterKit::InsufficientDataError] If check_min_samples is true and
     #                                            fewer than 10 rows are given
     def validate_input(data, check_min_samples: true)
       raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
       raise ArgumentError, "Input cannot be empty" if data.empty?
       first_row = data.first
       raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
       row_length = first_row.length
       # Rows without any values would skip every per-element check below and
       # leave min_val/max_val at their infinite sentinels, so reject them here.
       raise ArgumentError, "Input rows must contain at least one value" if row_length.zero?
       min_val = Float::INFINITY
       max_val = -Float::INFINITY
       # First validate data structure and types
       data.each_with_index do |row, i|
         raise ArgumentError, "Row #{i} is not an array" unless row.is_a?(Array)
         if row.length != row_length
           raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
         end
         row.each_with_index do |val, j|
           raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric" unless val.is_a?(Numeric)
           # Only check for NaN/Infinite on floats
           if val.is_a?(Float) && (val.nan? || val.infinite?)
             raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
           end
           # Track data range
           val_f = val.to_f
           min_val = val_f if val_f < min_val
           max_val = val_f if val_f > max_val
         end
       end
       # Check for sufficient data points after validating structure (only for fit operations)
       if check_min_samples && data.size < 10
         raise ::ClusterKit::InsufficientDataError, <<~MSG
           UMAP requires at least 10 data points, but only #{data.size} provided.
           For small datasets, consider:
           1. Using PCA instead: ClusterKit::Dimensionality::PCA.new(n_components: 2)
           2. Collecting more data points
           3. Using simpler visualization methods
         MSG
       end
       # Check for extreme data ranges that might cause numerical issues
       data_range = max_val - min_val
       if data_range > 1000
         warn "WARNING: Large data range detected (#{data_range.round(2)}). Consider normalizing your data to prevent numerical instability."
       end
     end
     def create_rust_umap_with_adjusted_params(data)
       # Only create if not already created
       return if @rust_umap
       n_samples = data.size
       # Automatically adjust n_neighbors if it's too high for the dataset
       # n_neighbors should be less than n_samples
       # Use a reasonable default: min(15, n_samples / 4) but at least 2
       max_neighbors = [n_samples - 1, 2].max  # At least 2, but less than n_samples
       suggested_neighbors = [[15, n_samples / 4].min.to_i, 2].max
       adjusted_n_neighbors = @n_neighbors
       if @n_neighbors > max_neighbors
         adjusted_n_neighbors = [suggested_neighbors, max_neighbors].min
         if ::ClusterKit.configuration.verbose
           warn "UMAP: Adjusted n_neighbors from #{@n_neighbors} to #{adjusted_n_neighbors} for dataset with #{n_samples} samples"
         end
       end
-      @rust_umap = ::ClusterKit::RustUMAP.new({
+      # Access the private constant from inside the module
+      @rust_umap = ::ClusterKit.const_get(:RustUMAP).new({
         n_components: @n_components,
         n_neighbors: adjusted_n_neighbors,
         random_seed: @random_seed,

data/lib/clusterkit/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module ClusterKit
-  VERSION = "0.1.0.pre.1"
+  VERSION = "0.1.0.pre.2"
 end

data/lib/clusterkit.rb CHANGED Viewed

@@ -21,20 +21,18 @@ module ClusterKit
   class DisconnectedGraphError < DataError; end
   class InsufficientDataError < DataError; end
-  # Load modules - can't use autoload with require_relative path issues
-  require_relative "clusterkit/dimensionality"
-  require_relative "clusterkit/clustering"
   # Autoload utilities
   autoload :Utils, "clusterkit/utils"
   autoload :Preprocessing, "clusterkit/preprocessing"
   autoload :Silence, "clusterkit/silence"
-  # Load the extension first
-  require_relative "clusterkit/clusterkit"
-  # Now load the modules that depend on the extension
+  # Load modules that depend on the extension
+  require_relative "clusterkit/dimensionality"
   require_relative "clusterkit/clustering"
+  # Make RustUMAP private - it's an implementation detail
+  # Users should use Dimensionality::UMAP instead
+  private_constant :RustUMAP if const_defined?(:RustUMAP)
   class << self
     # Quick UMAP embedding

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: clusterkit
 version: !ruby/object:Gem::Version
-  version: 0.1.0.pre.1
+  version: 0.1.0.pre.2
 platform: ruby
 authors:
 - Chris Petersen