RubyGems - clusterkit - Versions diffs - 0.1.1 → 0.2.0.pre.1 - Mend

clusterkit 0.1.1 → 0.2.0.pre.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (24)

checksums.yaml +4 -4
data/README.md +2 -2
data/clusterkit.gemspec +45 -0
data/docs/KNOWN_ISSUES.md +5 -5
data/docs/RUST_ERROR_HANDLING.md +6 -6
data/ext/clusterkit/Cargo.toml +4 -5
data/ext/clusterkit/extconf.rb +1 -9
data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +62 -27
data/ext/clusterkit/src/clustering.rs +114 -68
data/ext/clusterkit/src/embedder.rs +131 -48
data/ext/clusterkit/src/hnsw.rs +215 -181
data/ext/clusterkit/src/lib.rs +5 -5
data/ext/clusterkit/src/svd.rs +58 -35
data/ext/clusterkit/src/utils.rs +9 -159
data/lib/clusterkit/clustering/hdbscan.rb +17 -4
data/lib/clusterkit/clustering.rb +23 -4
data/lib/clusterkit/dimensionality/pca.rb +12 -12
data/lib/clusterkit/dimensionality/svd.rb +16 -47
data/lib/clusterkit/dimensionality/umap.rb +40 -7
data/lib/clusterkit/version.rb +1 -1
data/lib/clusterkit.rb +1 -1
metadata +20 -35
data/Cargo.lock +0 -3236
data/lib/clusterkit/data_validator.rb +0 -132

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: bb4abb036977432c8c0c57385916ce292c77cf05b904a79cf584ace3cc761295
-  data.tar.gz: af2be6cd56b642df956e919f97fd8df6d56ed729bdda41d2c1b79a7066b3ab76
+  metadata.gz: 868c555f318a974371aeae9b892641e4e69ea3bb7b38b8e7b30c16e24bba37e5
+  data.tar.gz: e9fc6e35b6065d074e5f9e2b074298b83686aa95fcb78f2cf7f98373015414db
 SHA512:
-  metadata.gz: e8c97017b1842e7eb17a4dda614e66ce2c7764200a78df37c5db28e65d9151d7582c131a47acd384a69faa24bb254075bc5e6482a9b9ddec1edbd6d9a850eb40
-  data.tar.gz: 4ac33fc9ca202390ba180c849b0e786c236368c9b877cd4bbfca71df886c36fa50a62c89274da4e44c355be5fde98cedf817874a3bc6f8db496a3ee1ed43bb29
+  metadata.gz: e25abeea0f43f2f9b1cd2171a92408d458f95c09aca86d10df3cd01602b89df4725ce2fdd99b8d0cefe3c5b21659edd2f9e7acc60cd2f49cc7aa1ee7ab815911
+  data.tar.gz: b6dd04908293679a1c16919a1416f09ce3cd6099903724ee1a98be0f1fd6c8200609ec9f792c733442847ad92a2ab4e4a3da47380ee9e5a3517d855d5a02286a

data/README.md CHANGED Viewed

@@ -720,7 +720,7 @@ COVERAGE=true bundle exec rspec
 ## Contributing
-Bug reports and pull requests are welcome on GitHub at https://github.com/scientist-labs/clusterkit.
+Bug reports and pull requests are welcome on GitHub at https://github.com/cpetersen/clusterkit.
 ## License
@@ -735,7 +735,7 @@ If you use ClusterKit in your research, please cite:
   author = {Chris Petersen},
   title = {ClusterKit: High-Performance Clustering and Dimensionality Reduction for Ruby},
   year = {2024},
-  url = {https://github.com/scientist-labs/clusterkit}
+  url = {https://github.com/cpetersen/clusterkit}
 }
 ```

data/clusterkit.gemspec ADDED Viewed

@@ -0,0 +1,45 @@
+require_relative "lib/clusterkit/version"
+Gem::Specification.new do |spec|
+  spec.name = "clusterkit"
+  spec.version = ClusterKit::VERSION
+  spec.authors = ["Chris Petersen"]
+  spec.email = ["chris@petersen.io"]
+  spec.summary = "High-performance clustering and dimensionality reduction for Ruby"
+  spec.description = "A comprehensive clustering toolkit for Ruby, providing UMAP, PCA, K-means, HDBSCAN and more. Built on top of annembed and hdbscan Rust crates for blazing-fast performance."
+  spec.homepage = "https://github.com/cpetersen/clusterkit"
+  spec.license = "MIT"
+  spec.required_ruby_version = ">= 2.7.0"
+  spec.metadata["homepage_uri"] = spec.homepage
+  spec.metadata["source_code_uri"] = spec.homepage
+  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
+  # Specify which files should be added to the gem when it is released.
+  spec.files = Dir.chdir(__dir__) do
+    `git ls-files -z`.split("\x0").reject do |f|
+      (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
+    end + Dir["ext/**/*.rs", "ext/**/*.toml"]
+  end
+  spec.bindir = "exe"
+  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.extensions = ["ext/clusterkit/extconf.rb"]
+  # Runtime dependencies
+  # Numo is optional but recommended for better performance
+  # spec.add_dependency "numo-narray", "~> 0.9"
+  # Development dependencies
+  spec.add_development_dependency "csv"
+  spec.add_development_dependency "rake", "~> 13.0"
+  spec.add_development_dependency "rake-compiler", "~> 1.2"
+  spec.add_development_dependency "rb_sys", "~> 0.9"
+  spec.add_development_dependency "rspec", "~> 3.0"
+  spec.add_development_dependency "simplecov", "~> 0.22"
+  spec.add_development_dependency "yard", "~> 0.9"
+  # For more information and examples about making a new gem, check out our
+  # guide at: https://bundler.io/guides/creating_gem.html
+end

data/docs/KNOWN_ISSUES.md CHANGED Viewed

@@ -14,7 +14,7 @@ This gem has three main categories of limitations:
 **Reason**: UMAP needs sufficient data to construct a meaningful manifold approximation. With fewer than 10 points, the algorithm cannot create a reliable graph structure.
-**Workaround**:
+**Workaround**:
 - Use PCA for datasets with fewer than 10 points
 - The `transform` method can handle smaller datasets once the model is fitted on adequate training data
@@ -30,12 +30,12 @@ This gem has three main categories of limitations:
 **Previous Issue**: The box_size assertion would panic and crash the Ruby process.
-**Current Status**: **FIXED** in `scientist-labs/annembed:fix-box-size-panic` branch
+**Current Status**: **FIXED** in `cpetersen/annembed:fix-box-size-panic` branch
 - The `"assertion failed: (*f).abs() <= box_size"` panic has been converted to a catchable error
 - Extreme value ranges are now handled gracefully through normalization
 - NaN/Infinite values are detected and reported with clear error messages
-**Remaining Uncatchable Errors**:
+**Remaining Uncatchable Errors**:
 - Array bounds violations (accessing out-of-bounds indices)
 - Some `.unwrap()` calls on `None` or `Err` values
 - These are much less common in normal usage
@@ -98,7 +98,7 @@ def safe_umap_transform(data, options = {})
   # Save data to temporary file before processing
   temp_file = "temp_umap_data_#{Time.now.to_i}.json"
   File.write(temp_file, JSON.dump(data))
   begin
     umap = ClusterKit::Dimensionality::UMAP.new(**options)
     result = umap.fit_transform(data)
@@ -127,4 +127,4 @@ def reduce_dimensions(data, n_components: 2)
     pca.fit_transform(data)
   end
 end
-```
+```

data/docs/RUST_ERROR_HANDLING.md CHANGED Viewed

@@ -37,11 +37,11 @@ These use Rust's `assert!` or `panic!` macros and CANNOT be caught. They will cr
 | Error | Source | Location | Trigger Condition |
 |-------|--------|----------|-------------------|
-| ~~Box size assertion~~ | ~~annembed~~ | ~~`set_data_box`~~ | **FIXED in scientist-labs/annembed:fix-box-size-panic** |
+| ~~Box size assertion~~ | ~~annembed~~ | ~~`set_data_box`~~ | **FIXED in cpetersen/annembed:fix-box-size-panic** |
 | Array bounds | Various | Index operations | Accessing out-of-bounds indices |
 | Unwrap failures | Various | `.unwrap()` calls | Unwrapping `None` or `Err` |
-**Update (2025-08-19):** The box size assertion has been fixed in the `fix-box-size-panic` branch of scientist-labs/annembed. It now returns a proper `Result<(), anyhow::Error>` that can be caught and handled gracefully:
+**Update (2025-08-19):** The box size assertion has been fixed in the `fix-box-size-panic` branch of cpetersen/annembed. It now returns a proper `Result<(), anyhow::Error>` that can be caught and handled gracefully:
 ```rust
 // Previously (would panic):
@@ -96,13 +96,13 @@ when /isolated point/i
 **Previous Issue:** Would panic and crash the Ruby process
-**Current Status:** Fixed in `scientist-labs/annembed:fix-box-size-panic` branch
-- Now returns a catchable `anyhow::Error`
+**Current Status:** Fixed in `cpetersen/annembed:fix-box-size-panic` branch
+- Now returns a catchable `anyhow::Error`
 - Detects NaN/Infinite values during normalization
 - Handles constant data (max_max = 0) gracefully
 - Extreme value ranges are normalized successfully
-**User-visible behavior:**
+**User-visible behavior:**
 - Previously: Ruby process would crash with assertion failure
 - Now: Raises a catchable Ruby exception with helpful error message
@@ -161,4 +161,4 @@ when /isolated point/i
 The test suite mocks Rust errors to verify our error handling logic works correctly. However, actual panic conditions cannot be tested without crashing the test process.
-See `spec/clusterkit/error_handling_spec.rb` for error handling tests.
+See `spec/clusterkit/error_handling_spec.rb` for error handling tests.

data/ext/clusterkit/Cargo.toml CHANGED Viewed

@@ -7,9 +7,9 @@ edition = "2021"
 crate-type = ["cdylib"]
 [dependencies]
-magnus = { version = "0.8", features = ["embed"] }
-annembed = { git = "https://github.com/scientist-labs/annembed", tag = "clusterkit-0.1.1" }
-hnsw_rs = { git = "https://github.com/scientist-labs/hnswlib-rs", tag = "clusterkit-0.1.0" }
+magnus = { version = "0.6", features = ["embed"] }
+annembed = { git = "https://github.com/cpetersen/annembed", tag = "clusterkit-0.1.0" }
+hnsw_rs = { git = "https://github.com/cpetersen/hnswlib-rs", tag = "clusterkit-0.1.0" }
 hdbscan = "0.11"
 ndarray = "0.16"
 num-traits = "0.2"
@@ -22,5 +22,4 @@ rand = "0.8"
 default = ["openblas-static"]
 openblas-static = ["annembed/openblas-static"]
 openblas-system = ["annembed/openblas-system"]
-intel-mkl-static = ["annembed/intel-mkl-static"]
-macos-accelerate = ["annembed/macos-accelerate"]
+intel-mkl-static = ["annembed/intel-mkl-static"]

data/ext/clusterkit/extconf.rb CHANGED Viewed

@@ -1,12 +1,4 @@
 require "mkmf"
 require "rb_sys/mkmf"
-create_rust_makefile("clusterkit/clusterkit") do |r|
-  if ENV["CLUSTERKIT_FEATURES"]
-    r.extra_cargo_args += ["--no-default-features"]
-    r.features = ENV["CLUSTERKIT_FEATURES"].split(",")
-  elsif RUBY_PLATFORM =~ /darwin/
-    r.extra_cargo_args += ["--no-default-features"]
-    r.features = ["macos-accelerate"]
-  end
-end
+create_rust_makefile("clusterkit/clusterkit")

data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs CHANGED Viewed

@@ -1,6 +1,5 @@
-use magnus::{function, prelude::*, Error, Value, RHash, Ruby};
+use magnus::{function, prelude::*, Error, Value, RArray, RHash, Integer, TryConvert};
 use hdbscan::{Hdbscan, HdbscanHyperParams};
-use crate::utils::ruby_array_to_vec_vec_f64;
 /// Perform HDBSCAN clustering
 /// Returns a hash with labels and basic statistics
@@ -10,62 +9,98 @@ pub fn hdbscan_fit(
     min_cluster_size: usize,
     metric: String,
 ) -> Result<RHash, Error> {
-    let ruby = Ruby::get().unwrap();
-    // Convert Ruby array to Vec<Vec<f64>> using shared helper
-    let data_vec = ruby_array_to_vec_vec_f64(data)?;
-    let n_samples = data_vec.len();
+    // Convert Ruby array to ndarray
+    let rarray: RArray = TryConvert::try_convert(data)?;
+    let n_samples = rarray.len();
+    if n_samples == 0 {
+        return Err(Error::new(
+            magnus::exception::arg_error(),
+            "Data cannot be empty",
+        ));
+    }
+    // Get dimensions
+    let first_row: RArray = rarray.entry::<RArray>(0)?;
+    let n_features = first_row.len();
+    // Convert to Vec<Vec<f64>> format expected by hdbscan crate
+    let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
+    for i in 0..n_samples {
+        let row: RArray = rarray.entry(i as isize)?;
+        let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
+        for j in 0..n_features {
+            let val: f64 = row.entry(j as isize)?;
+            row_vec.push(val);
+        }
+        data_vec.push(row_vec);
+    }
+    // Note: hdbscan crate doesn't support custom metrics directly
+    // We'll use the default Euclidean distance for now
     if metric != "euclidean" && metric != "l2" {
         eprintln!("Warning: Current hdbscan version only supports Euclidean distance. Using Euclidean.");
     }
     // Adjust parameters to avoid index out of bounds errors
+    // The hdbscan crate has issues when min_samples >= n_samples
     let adjusted_min_samples = min_samples.min(n_samples.saturating_sub(1)).max(1);
     let adjusted_min_cluster_size = min_cluster_size.min(n_samples).max(2);
     // Create hyperparameters
     let hyper_params = HdbscanHyperParams::builder()
         .min_cluster_size(adjusted_min_cluster_size)
         .min_samples(adjusted_min_samples)
         .build();
     // Create HDBSCAN instance and run clustering
     let clusterer = Hdbscan::new(&data_vec, hyper_params);
+    // Run the clustering algorithm - cluster() returns Result<Vec<i32>, HdbscanError>
     let labels = clusterer.cluster().map_err(|e| {
         Error::new(
-            ruby.exception_runtime_error(),
+            magnus::exception::runtime_error(),
             format!("HDBSCAN clustering failed: {:?}", e)
         )
     })?;
     // Convert results to Ruby types
-    let result = ruby.hash_new();
-    let labels_array = ruby.ary_new();
+    let ruby = magnus::Ruby::get().unwrap();
+    let result = RHash::new();
+    // Convert labels (i32 to Ruby Integer, -1 for noise)
+    let labels_array = RArray::new();
     for &label in labels.iter() {
-        labels_array.push(ruby.integer_from_i64(label as i64))?;
+        labels_array.push(Integer::from_value(
+            ruby.eval(&format!("{}", label)).unwrap()
+        ).unwrap())?;
     }
     result.aset("labels", labels_array)?;
-    let probs_array = ruby.ary_new();
+    // For now, we'll create dummy probabilities and outlier scores
+    // since the basic hdbscan crate doesn't provide these
+    // In the future, we could calculate these ourselves or use a more advanced implementation
+    // Create probabilities array (all 1.0 for clustered points, 0.0 for noise)
+    let probs_array = RArray::new();
     for &label in labels.iter() {
         let prob = if label == -1 { 0.0 } else { 1.0 };
         probs_array.push(prob)?;
     }
     result.aset("probabilities", probs_array)?;
-    let outlier_array = ruby.ary_new();
+    // Create outlier scores array (0.0 for clustered points, 1.0 for noise)
+    let outlier_array = RArray::new();
     for &label in labels.iter() {
         let score = if label == -1 { 1.0 } else { 0.0 };
         outlier_array.push(score)?;
     }
     result.aset("outlier_scores", outlier_array)?;
-    let persistence_hash = ruby.hash_new();
+    // Create empty cluster persistence hash for now
+    let persistence_hash = RHash::new();
     result.aset("cluster_persistence", persistence_hash)?;
     Ok(result)
 }
@@ -75,6 +110,6 @@ pub fn init(clustering_module: &magnus::RModule) -> Result<(), Error> {
         "hdbscan_rust",
         function!(hdbscan_fit, 4),
     )?;
     Ok(())
-}
+}

data/ext/clusterkit/src/clustering.rs CHANGED Viewed

@@ -1,52 +1,68 @@
-use magnus::{function, prelude::*, Error, Value, RArray, Ruby};
+use magnus::{function, prelude::*, Error, Value, RArray, Integer, TryConvert};
 use ndarray::{Array1, Array2, ArrayView1, Axis};
 use rand::prelude::*;
-use rand::rngs::StdRng;
-use rand::SeedableRng;
-use crate::utils::ruby_array_to_ndarray2;
 mod hdbscan_wrapper;
 pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
     let clustering_module = parent.define_module("Clustering")?;
     clustering_module.define_singleton_method(
         "kmeans_rust",
-        function!(kmeans, 4),
+        function!(kmeans, 3),
     )?;
     clustering_module.define_singleton_method(
         "kmeans_predict_rust",
         function!(kmeans_predict, 2),
     )?;
     // Initialize HDBSCAN functions
     hdbscan_wrapper::init(&clustering_module)?;
     Ok(())
 }
 /// Perform K-means clustering
 /// Returns (labels, centroids, inertia)
-fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> Result<(RArray, RArray, f64), Error> {
-    let ruby = Ruby::get().unwrap();
-    // Convert Ruby array to ndarray using shared helper
-    let data_array = ruby_array_to_ndarray2(data)?;
-    let (n_samples, n_features) = data_array.dim();
+fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64), Error> {
+    // Convert Ruby array to ndarray
+    let rarray: RArray = TryConvert::try_convert(data)?;
+    let n_samples = rarray.len();
+    if n_samples == 0 {
+        return Err(Error::new(
+            magnus::exception::arg_error(),
+            "Data cannot be empty",
+        ));
+    }
+    // Get dimensions
+    let first_row: RArray = rarray.entry::<RArray>(0)?;
+    let n_features = first_row.len();
     if k > n_samples {
         return Err(Error::new(
-            ruby.exception_arg_error(),
+            magnus::exception::arg_error(),
             format!("k ({}) cannot be larger than number of samples ({})", k, n_samples),
         ));
     }
+    // Convert to ndarray
+    let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
+    for i in 0..n_samples {
+        let row: RArray = rarray.entry(i as isize)?;
+        for j in 0..n_features {
+            let val: f64 = row.entry(j as isize)?;
+            data_array[[i, j]] = val;
+        }
+    }
     // Initialize centroids using K-means++
-    let mut centroids = kmeans_plusplus(&data_array, k, random_seed)?;
+    let mut centroids = kmeans_plusplus(&data_array, k)?;
     let mut labels = vec![0usize; n_samples];
     let mut prev_labels = vec![0usize; n_samples];
     // K-means iterations
     for iteration in 0..max_iter {
         // Assign points to nearest centroid
@@ -55,7 +71,7 @@ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> R
             let point = data_array.row(i);
             let mut min_dist = f64::INFINITY;
             let mut best_cluster = 0;
             for (j, centroid) in centroids.axis_iter(Axis(0)).enumerate() {
                 let dist = euclidean_distance(&point, &centroid);
                 if dist < min_dist {
@@ -63,38 +79,38 @@ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> R
                     best_cluster = j;
                 }
             }
             if labels[i] != best_cluster {
                 changed = true;
             }
             labels[i] = best_cluster;
         }
         // Check for convergence
         if !changed && iteration > 0 {
             break;
         }
         // Update centroids
         for j in 0..k {
             let mut sum = Array1::<f64>::zeros(n_features);
             let mut count = 0;
             for i in 0..n_samples {
                 if labels[i] == j {
                     sum += &data_array.row(i);
                     count += 1;
                 }
             }
             if count > 0 {
                 centroids.row_mut(j).assign(&(sum / count as f64));
             }
         }
         prev_labels.clone_from(&labels);
     }
     // Calculate inertia (sum of squared distances to nearest centroid)
     let mut inertia = 0.0;
     for i in 0..n_samples {
@@ -102,43 +118,75 @@ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> R
         let centroid = centroids.row(labels[i]);
         inertia += euclidean_distance(&point, &centroid).powi(2);
     }
     // Convert results to Ruby arrays
-    let labels_array = ruby.ary_new();
+    let ruby = magnus::Ruby::get().unwrap();
+    let labels_array = RArray::new();
     for label in labels {
-        labels_array.push(ruby.integer_from_i64(label as i64))?;
+        labels_array.push(Integer::from_value(ruby.eval(&format!("{}", label)).unwrap()).unwrap())?;
     }
-    let centroids_array = ruby.ary_new();
+    let centroids_array = RArray::new();
     for i in 0..k {
-        let row_array = ruby.ary_new();
+        let row_array = RArray::new();
         for j in 0..n_features {
             row_array.push(centroids[[i, j]])?;
         }
         centroids_array.push(row_array)?;
     }
     Ok((labels_array, centroids_array, inertia))
 }
 /// Predict cluster labels for new data given centroids
 fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
-    let ruby = Ruby::get().unwrap();
-    // Convert inputs using shared helpers
-    let data_matrix = ruby_array_to_ndarray2(data)?;
-    let centroids_matrix = ruby_array_to_ndarray2(centroids)?;
-    let (n_samples, _) = data_matrix.dim();
+    // Convert inputs
+    let data_array: RArray = TryConvert::try_convert(data)?;
+    let centroids_array: RArray = TryConvert::try_convert(centroids)?;
+    let n_samples = data_array.len();
+    let k = centroids_array.len();
+    if n_samples == 0 {
+        return Err(Error::new(
+            magnus::exception::arg_error(),
+            "Data cannot be empty",
+        ));
+    }
+    // Get dimensions
+    let first_row: RArray = data_array.entry::<RArray>(0)?;
+    let n_features = first_row.len();
+    // Convert data to ndarray
+    let mut data_matrix = Array2::<f64>::zeros((n_samples, n_features));
+    for i in 0..n_samples {
+        let row: RArray = data_array.entry(i as isize)?;
+        for j in 0..n_features {
+            let val: f64 = row.entry(j as isize)?;
+            data_matrix[[i, j]] = val;
+        }
+    }
+    // Convert centroids to ndarray
+    let mut centroids_matrix = Array2::<f64>::zeros((k, n_features));
+    for i in 0..k {
+        let row: RArray = centroids_array.entry(i as isize)?;
+        for j in 0..n_features {
+            let val: f64 = row.entry(j as isize)?;
+            centroids_matrix[[i, j]] = val;
+        }
+    }
     // Predict labels
-    let labels_array = ruby.ary_new();
+    let ruby = magnus::Ruby::get().unwrap();
+    let labels_array = RArray::new();
     for i in 0..n_samples {
         let point = data_matrix.row(i);
         let mut min_dist = f64::INFINITY;
         let mut best_cluster = 0;
         for (j, centroid) in centroids_matrix.axis_iter(Axis(0)).enumerate() {
             let dist = euclidean_distance(&point, &centroid);
             if dist < min_dist {
@@ -146,37 +194,30 @@ fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
                 best_cluster = j;
             }
         }
-        labels_array.push(ruby.integer_from_i64(best_cluster as i64))?;
+        labels_array.push(Integer::from_value(ruby.eval(&format!("{}", best_cluster)).unwrap()).unwrap())?;
     }
     Ok(labels_array)
 }
 /// K-means++ initialization
-fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Result<Array2<f64>, Error> {
+fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
     let n_samples = data.nrows();
     let n_features = data.ncols();
-    // Use seeded RNG if seed is provided, otherwise use thread_rng
-    let mut rng: Box<dyn RngCore> = match random_seed {
-        Some(seed) => {
-            let seed_u64 = seed as u64;
-            Box::new(StdRng::seed_from_u64(seed_u64))
-        },
-        None => Box::new(thread_rng()),
-    };
+    let mut rng = thread_rng();
     let mut centroids = Array2::<f64>::zeros((k, n_features));
     // Choose first centroid randomly
     let first_idx = rng.gen_range(0..n_samples);
     centroids.row_mut(0).assign(&data.row(first_idx));
     // Choose remaining centroids
     for i in 1..k {
         let mut distances = vec![f64::INFINITY; n_samples];
+        // Calculate distance to nearest centroid for each point
         for j in 0..n_samples {
             for c in 0..i {
                 let dist = euclidean_distance(&data.row(j), &centroids.row(c));
@@ -185,20 +226,25 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Re
                 }
             }
         }
+        // Convert distances to probabilities
         let total: f64 = distances.iter().map(|d| d * d).sum();
         if total == 0.0 {
+            // All points are identical or we've selected duplicates
+            // Just use sequential points as centroids
             if i < n_samples {
                 centroids.row_mut(i).assign(&data.row(i));
             } else {
+                // Reuse first point if we run out
                 centroids.row_mut(i).assign(&data.row(0));
             }
             continue;
         }
+        // Choose next centroid with probability proportional to squared distance
         let mut cumsum = 0.0;
         let rand_val: f64 = rng.gen::<f64>() * total;
         for j in 0..n_samples {
             cumsum += distances[j] * distances[j];
             if cumsum >= rand_val {
@@ -207,7 +253,7 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Re
             }
         }
     }
     Ok(centroids)
 }
@@ -218,4 +264,4 @@ fn euclidean_distance(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
         .map(|(x, y)| (x - y).powi(2))
         .sum::<f64>()
         .sqrt()
-}
+}