clusterkit 0.1.1 → 0.2.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bb4abb036977432c8c0c57385916ce292c77cf05b904a79cf584ace3cc761295
4
- data.tar.gz: af2be6cd56b642df956e919f97fd8df6d56ed729bdda41d2c1b79a7066b3ab76
3
+ metadata.gz: 868c555f318a974371aeae9b892641e4e69ea3bb7b38b8e7b30c16e24bba37e5
4
+ data.tar.gz: e9fc6e35b6065d074e5f9e2b074298b83686aa95fcb78f2cf7f98373015414db
5
5
  SHA512:
6
- metadata.gz: e8c97017b1842e7eb17a4dda614e66ce2c7764200a78df37c5db28e65d9151d7582c131a47acd384a69faa24bb254075bc5e6482a9b9ddec1edbd6d9a850eb40
7
- data.tar.gz: 4ac33fc9ca202390ba180c849b0e786c236368c9b877cd4bbfca71df886c36fa50a62c89274da4e44c355be5fde98cedf817874a3bc6f8db496a3ee1ed43bb29
6
+ metadata.gz: e25abeea0f43f2f9b1cd2171a92408d458f95c09aca86d10df3cd01602b89df4725ce2fdd99b8d0cefe3c5b21659edd2f9e7acc60cd2f49cc7aa1ee7ab815911
7
+ data.tar.gz: b6dd04908293679a1c16919a1416f09ce3cd6099903724ee1a98be0f1fd6c8200609ec9f792c733442847ad92a2ab4e4a3da47380ee9e5a3517d855d5a02286a
data/README.md CHANGED
@@ -720,7 +720,7 @@ COVERAGE=true bundle exec rspec
720
720
 
721
721
  ## Contributing
722
722
 
723
- Bug reports and pull requests are welcome on GitHub at https://github.com/scientist-labs/clusterkit.
723
+ Bug reports and pull requests are welcome on GitHub at https://github.com/cpetersen/clusterkit.
724
724
 
725
725
  ## License
726
726
 
@@ -735,7 +735,7 @@ If you use ClusterKit in your research, please cite:
735
735
  author = {Chris Petersen},
736
736
  title = {ClusterKit: High-Performance Clustering and Dimensionality Reduction for Ruby},
737
737
  year = {2024},
738
- url = {https://github.com/scientist-labs/clusterkit}
738
+ url = {https://github.com/cpetersen/clusterkit}
739
739
  }
740
740
  ```
741
741
 
@@ -0,0 +1,45 @@
1
+ require_relative "lib/clusterkit/version"
2
+
3
+ Gem::Specification.new do |spec|
4
+ spec.name = "clusterkit"
5
+ spec.version = ClusterKit::VERSION
6
+ spec.authors = ["Chris Petersen"]
7
+ spec.email = ["chris@petersen.io"]
8
+
9
+ spec.summary = "High-performance clustering and dimensionality reduction for Ruby"
10
+ spec.description = "A comprehensive clustering toolkit for Ruby, providing UMAP, PCA, K-means, HDBSCAN and more. Built on top of annembed and hdbscan Rust crates for blazing-fast performance."
11
+ spec.homepage = "https://github.com/cpetersen/clusterkit"
12
+ spec.license = "MIT"
13
+ spec.required_ruby_version = ">= 2.7.0"
14
+
15
+ spec.metadata["homepage_uri"] = spec.homepage
16
+ spec.metadata["source_code_uri"] = spec.homepage
17
+ spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
18
+
19
+ # Specify which files should be added to the gem when it is released.
20
+ spec.files = Dir.chdir(__dir__) do
21
+ `git ls-files -z`.split("\x0").reject do |f|
22
+ (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
23
+ end + Dir["ext/**/*.rs", "ext/**/*.toml"]
24
+ end
25
+ spec.bindir = "exe"
26
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
27
+ spec.require_paths = ["lib"]
28
+ spec.extensions = ["ext/clusterkit/extconf.rb"]
29
+
30
+ # Runtime dependencies
31
+ # Numo is optional but recommended for better performance
32
+ # spec.add_dependency "numo-narray", "~> 0.9"
33
+
34
+ # Development dependencies
35
+ spec.add_development_dependency "csv"
36
+ spec.add_development_dependency "rake", "~> 13.0"
37
+ spec.add_development_dependency "rake-compiler", "~> 1.2"
38
+ spec.add_development_dependency "rb_sys", "~> 0.9"
39
+ spec.add_development_dependency "rspec", "~> 3.0"
40
+ spec.add_development_dependency "simplecov", "~> 0.22"
41
+ spec.add_development_dependency "yard", "~> 0.9"
42
+
43
+ # For more information and examples about making a new gem, check out our
44
+ # guide at: https://bundler.io/guides/creating_gem.html
45
+ end
data/docs/KNOWN_ISSUES.md CHANGED
@@ -14,7 +14,7 @@ This gem has three main categories of limitations:
14
14
 
15
15
  **Reason**: UMAP needs sufficient data to construct a meaningful manifold approximation. With fewer than 10 points, the algorithm cannot create a reliable graph structure.
16
16
 
17
- **Workaround**:
17
+ **Workaround**:
18
18
  - Use PCA for datasets with fewer than 10 points
19
19
  - The `transform` method can handle smaller datasets once the model is fitted on adequate training data
20
20
 
@@ -30,12 +30,12 @@ This gem has three main categories of limitations:
30
30
 
31
31
  **Previous Issue**: The box_size assertion would panic and crash the Ruby process.
32
32
 
33
- **Current Status**: **FIXED** in `scientist-labs/annembed:fix-box-size-panic` branch
33
+ **Current Status**: **FIXED** in `cpetersen/annembed:fix-box-size-panic` branch
34
34
  - The `"assertion failed: (*f).abs() <= box_size"` panic has been converted to a catchable error
35
35
  - Extreme value ranges are now handled gracefully through normalization
36
36
  - NaN/Infinite values are detected and reported with clear error messages
37
37
 
38
- **Remaining Uncatchable Errors**:
38
+ **Remaining Uncatchable Errors**:
39
39
  - Array bounds violations (accessing out-of-bounds indices)
40
40
  - Some `.unwrap()` calls on `None` or `Err` values
41
41
  - These are much less common in normal usage
@@ -98,7 +98,7 @@ def safe_umap_transform(data, options = {})
98
98
  # Save data to temporary file before processing
99
99
  temp_file = "temp_umap_data_#{Time.now.to_i}.json"
100
100
  File.write(temp_file, JSON.dump(data))
101
-
101
+
102
102
  begin
103
103
  umap = ClusterKit::Dimensionality::UMAP.new(**options)
104
104
  result = umap.fit_transform(data)
@@ -127,4 +127,4 @@ def reduce_dimensions(data, n_components: 2)
127
127
  pca.fit_transform(data)
128
128
  end
129
129
  end
130
- ```
130
+ ```
@@ -37,11 +37,11 @@ These use Rust's `assert!` or `panic!` macros and CANNOT be caught. They will cr
37
37
 
38
38
  | Error | Source | Location | Trigger Condition |
39
39
  |-------|--------|----------|-------------------|
40
- | ~~Box size assertion~~ | ~~annembed~~ | ~~`set_data_box`~~ | **FIXED in scientist-labs/annembed:fix-box-size-panic** |
40
+ | ~~Box size assertion~~ | ~~annembed~~ | ~~`set_data_box`~~ | **FIXED in cpetersen/annembed:fix-box-size-panic** |
41
41
  | Array bounds | Various | Index operations | Accessing out-of-bounds indices |
42
42
  | Unwrap failures | Various | `.unwrap()` calls | Unwrapping `None` or `Err` |
43
43
 
44
- **Update (2025-08-19):** The box size assertion has been fixed in the `fix-box-size-panic` branch of scientist-labs/annembed. It now returns a proper `Result<(), anyhow::Error>` that can be caught and handled gracefully:
44
+ **Update (2025-08-19):** The box size assertion has been fixed in the `fix-box-size-panic` branch of cpetersen/annembed. It now returns a proper `Result<(), anyhow::Error>` that can be caught and handled gracefully:
45
45
 
46
46
  ```rust
47
47
  // Previously (would panic):
@@ -96,13 +96,13 @@ when /isolated point/i
96
96
 
97
97
  **Previous Issue:** Would panic and crash the Ruby process
98
98
 
99
- **Current Status:** Fixed in `scientist-labs/annembed:fix-box-size-panic` branch
100
- - Now returns a catchable `anyhow::Error`
99
+ **Current Status:** Fixed in `cpetersen/annembed:fix-box-size-panic` branch
100
+ - Now returns a catchable `anyhow::Error`
101
101
  - Detects NaN/Infinite values during normalization
102
102
  - Handles constant data (max_max = 0) gracefully
103
103
  - Extreme value ranges are normalized successfully
104
104
 
105
- **User-visible behavior:**
105
+ **User-visible behavior:**
106
106
  - Previously: Ruby process would crash with assertion failure
107
107
  - Now: Raises a catchable Ruby exception with helpful error message
108
108
 
@@ -161,4 +161,4 @@ when /isolated point/i
161
161
 
162
162
  The test suite mocks Rust errors to verify our error handling logic works correctly. However, actual panic conditions cannot be tested without crashing the test process.
163
163
 
164
- See `spec/clusterkit/error_handling_spec.rb` for error handling tests.
164
+ See `spec/clusterkit/error_handling_spec.rb` for error handling tests.
@@ -7,9 +7,9 @@ edition = "2021"
7
7
  crate-type = ["cdylib"]
8
8
 
9
9
  [dependencies]
10
- magnus = { version = "0.8", features = ["embed"] }
11
- annembed = { git = "https://github.com/scientist-labs/annembed", tag = "clusterkit-0.1.1" }
12
- hnsw_rs = { git = "https://github.com/scientist-labs/hnswlib-rs", tag = "clusterkit-0.1.0" }
10
+ magnus = { version = "0.6", features = ["embed"] }
11
+ annembed = { git = "https://github.com/cpetersen/annembed", tag = "clusterkit-0.1.0" }
12
+ hnsw_rs = { git = "https://github.com/cpetersen/hnswlib-rs", tag = "clusterkit-0.1.0" }
13
13
  hdbscan = "0.11"
14
14
  ndarray = "0.16"
15
15
  num-traits = "0.2"
@@ -22,5 +22,4 @@ rand = "0.8"
22
22
  default = ["openblas-static"]
23
23
  openblas-static = ["annembed/openblas-static"]
24
24
  openblas-system = ["annembed/openblas-system"]
25
- intel-mkl-static = ["annembed/intel-mkl-static"]
26
- macos-accelerate = ["annembed/macos-accelerate"]
25
+ intel-mkl-static = ["annembed/intel-mkl-static"]
@@ -1,12 +1,4 @@
1
1
  require "mkmf"
2
2
  require "rb_sys/mkmf"
3
3
 
4
- create_rust_makefile("clusterkit/clusterkit") do |r|
5
- if ENV["CLUSTERKIT_FEATURES"]
6
- r.extra_cargo_args += ["--no-default-features"]
7
- r.features = ENV["CLUSTERKIT_FEATURES"].split(",")
8
- elsif RUBY_PLATFORM =~ /darwin/
9
- r.extra_cargo_args += ["--no-default-features"]
10
- r.features = ["macos-accelerate"]
11
- end
12
- end
4
+ create_rust_makefile("clusterkit/clusterkit")
@@ -1,6 +1,5 @@
1
- use magnus::{function, prelude::*, Error, Value, RHash, Ruby};
1
+ use magnus::{function, prelude::*, Error, Value, RArray, RHash, Integer, TryConvert};
2
2
  use hdbscan::{Hdbscan, HdbscanHyperParams};
3
- use crate::utils::ruby_array_to_vec_vec_f64;
4
3
 
5
4
  /// Perform HDBSCAN clustering
6
5
  /// Returns a hash with labels and basic statistics
@@ -10,62 +9,98 @@ pub fn hdbscan_fit(
10
9
  min_cluster_size: usize,
11
10
  metric: String,
12
11
  ) -> Result<RHash, Error> {
13
- let ruby = Ruby::get().unwrap();
14
-
15
- // Convert Ruby array to Vec<Vec<f64>> using shared helper
16
- let data_vec = ruby_array_to_vec_vec_f64(data)?;
17
- let n_samples = data_vec.len();
18
-
12
+ // Convert Ruby array to ndarray
13
+ let rarray: RArray = TryConvert::try_convert(data)?;
14
+ let n_samples = rarray.len();
15
+
16
+ if n_samples == 0 {
17
+ return Err(Error::new(
18
+ magnus::exception::arg_error(),
19
+ "Data cannot be empty",
20
+ ));
21
+ }
22
+
23
+ // Get dimensions
24
+ let first_row: RArray = rarray.entry::<RArray>(0)?;
25
+ let n_features = first_row.len();
26
+
27
+ // Convert to Vec<Vec<f64>> format expected by hdbscan crate
28
+ let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
29
+ for i in 0..n_samples {
30
+ let row: RArray = rarray.entry(i as isize)?;
31
+ let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
32
+ for j in 0..n_features {
33
+ let val: f64 = row.entry(j as isize)?;
34
+ row_vec.push(val);
35
+ }
36
+ data_vec.push(row_vec);
37
+ }
38
+
39
+ // Note: hdbscan crate doesn't support custom metrics directly
40
+ // We'll use the default Euclidean distance for now
19
41
  if metric != "euclidean" && metric != "l2" {
20
42
  eprintln!("Warning: Current hdbscan version only supports Euclidean distance. Using Euclidean.");
21
43
  }
22
-
44
+
23
45
  // Adjust parameters to avoid index out of bounds errors
46
+ // The hdbscan crate has issues when min_samples >= n_samples
24
47
  let adjusted_min_samples = min_samples.min(n_samples.saturating_sub(1)).max(1);
25
48
  let adjusted_min_cluster_size = min_cluster_size.min(n_samples).max(2);
26
-
49
+
27
50
  // Create hyperparameters
28
51
  let hyper_params = HdbscanHyperParams::builder()
29
52
  .min_cluster_size(adjusted_min_cluster_size)
30
53
  .min_samples(adjusted_min_samples)
31
54
  .build();
32
-
55
+
33
56
  // Create HDBSCAN instance and run clustering
34
57
  let clusterer = Hdbscan::new(&data_vec, hyper_params);
35
-
58
+
59
+ // Run the clustering algorithm - cluster() returns Result<Vec<i32>, HdbscanError>
36
60
  let labels = clusterer.cluster().map_err(|e| {
37
61
  Error::new(
38
- ruby.exception_runtime_error(),
62
+ magnus::exception::runtime_error(),
39
63
  format!("HDBSCAN clustering failed: {:?}", e)
40
64
  )
41
65
  })?;
42
-
66
+
43
67
  // Convert results to Ruby types
44
- let result = ruby.hash_new();
45
-
46
- let labels_array = ruby.ary_new();
68
+ let ruby = magnus::Ruby::get().unwrap();
69
+ let result = RHash::new();
70
+
71
+ // Convert labels (i32 to Ruby Integer, -1 for noise)
72
+ let labels_array = RArray::new();
47
73
  for &label in labels.iter() {
48
- labels_array.push(ruby.integer_from_i64(label as i64))?;
74
+ labels_array.push(Integer::from_value(
75
+ ruby.eval(&format!("{}", label)).unwrap()
76
+ ).unwrap())?;
49
77
  }
50
78
  result.aset("labels", labels_array)?;
51
-
52
- let probs_array = ruby.ary_new();
79
+
80
+ // For now, we'll create dummy probabilities and outlier scores
81
+ // since the basic hdbscan crate doesn't provide these
82
+ // In the future, we could calculate these ourselves or use a more advanced implementation
83
+
84
+ // Create probabilities array (all 1.0 for clustered points, 0.0 for noise)
85
+ let probs_array = RArray::new();
53
86
  for &label in labels.iter() {
54
87
  let prob = if label == -1 { 0.0 } else { 1.0 };
55
88
  probs_array.push(prob)?;
56
89
  }
57
90
  result.aset("probabilities", probs_array)?;
58
-
59
- let outlier_array = ruby.ary_new();
91
+
92
+ // Create outlier scores array (0.0 for clustered points, 1.0 for noise)
93
+ let outlier_array = RArray::new();
60
94
  for &label in labels.iter() {
61
95
  let score = if label == -1 { 1.0 } else { 0.0 };
62
96
  outlier_array.push(score)?;
63
97
  }
64
98
  result.aset("outlier_scores", outlier_array)?;
65
-
66
- let persistence_hash = ruby.hash_new();
99
+
100
+ // Create empty cluster persistence hash for now
101
+ let persistence_hash = RHash::new();
67
102
  result.aset("cluster_persistence", persistence_hash)?;
68
-
103
+
69
104
  Ok(result)
70
105
  }
71
106
 
@@ -75,6 +110,6 @@ pub fn init(clustering_module: &magnus::RModule) -> Result<(), Error> {
75
110
  "hdbscan_rust",
76
111
  function!(hdbscan_fit, 4),
77
112
  )?;
78
-
113
+
79
114
  Ok(())
80
- }
115
+ }
@@ -1,52 +1,68 @@
1
- use magnus::{function, prelude::*, Error, Value, RArray, Ruby};
1
+ use magnus::{function, prelude::*, Error, Value, RArray, Integer, TryConvert};
2
2
  use ndarray::{Array1, Array2, ArrayView1, Axis};
3
3
  use rand::prelude::*;
4
- use rand::rngs::StdRng;
5
- use rand::SeedableRng;
6
- use crate::utils::ruby_array_to_ndarray2;
7
4
 
8
5
  mod hdbscan_wrapper;
9
6
 
10
7
  pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
11
8
  let clustering_module = parent.define_module("Clustering")?;
12
-
9
+
13
10
  clustering_module.define_singleton_method(
14
11
  "kmeans_rust",
15
- function!(kmeans, 4),
12
+ function!(kmeans, 3),
16
13
  )?;
17
-
14
+
18
15
  clustering_module.define_singleton_method(
19
16
  "kmeans_predict_rust",
20
17
  function!(kmeans_predict, 2),
21
18
  )?;
22
-
19
+
23
20
  // Initialize HDBSCAN functions
24
21
  hdbscan_wrapper::init(&clustering_module)?;
25
-
22
+
26
23
  Ok(())
27
24
  }
28
25
 
29
26
  /// Perform K-means clustering
30
27
  /// Returns (labels, centroids, inertia)
31
- fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> Result<(RArray, RArray, f64), Error> {
32
- let ruby = Ruby::get().unwrap();
33
-
34
- // Convert Ruby array to ndarray using shared helper
35
- let data_array = ruby_array_to_ndarray2(data)?;
36
- let (n_samples, n_features) = data_array.dim();
37
-
28
+ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64), Error> {
29
+ // Convert Ruby array to ndarray
30
+ let rarray: RArray = TryConvert::try_convert(data)?;
31
+ let n_samples = rarray.len();
32
+
33
+ if n_samples == 0 {
34
+ return Err(Error::new(
35
+ magnus::exception::arg_error(),
36
+ "Data cannot be empty",
37
+ ));
38
+ }
39
+
40
+ // Get dimensions
41
+ let first_row: RArray = rarray.entry::<RArray>(0)?;
42
+ let n_features = first_row.len();
43
+
38
44
  if k > n_samples {
39
45
  return Err(Error::new(
40
- ruby.exception_arg_error(),
46
+ magnus::exception::arg_error(),
41
47
  format!("k ({}) cannot be larger than number of samples ({})", k, n_samples),
42
48
  ));
43
49
  }
44
-
50
+
51
+ // Convert to ndarray
52
+ let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
53
+ for i in 0..n_samples {
54
+ let row: RArray = rarray.entry(i as isize)?;
55
+ for j in 0..n_features {
56
+ let val: f64 = row.entry(j as isize)?;
57
+ data_array[[i, j]] = val;
58
+ }
59
+ }
60
+
45
61
  // Initialize centroids using K-means++
46
- let mut centroids = kmeans_plusplus(&data_array, k, random_seed)?;
62
+ let mut centroids = kmeans_plusplus(&data_array, k)?;
47
63
  let mut labels = vec![0usize; n_samples];
48
64
  let mut prev_labels = vec![0usize; n_samples];
49
-
65
+
50
66
  // K-means iterations
51
67
  for iteration in 0..max_iter {
52
68
  // Assign points to nearest centroid
@@ -55,7 +71,7 @@ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> R
55
71
  let point = data_array.row(i);
56
72
  let mut min_dist = f64::INFINITY;
57
73
  let mut best_cluster = 0;
58
-
74
+
59
75
  for (j, centroid) in centroids.axis_iter(Axis(0)).enumerate() {
60
76
  let dist = euclidean_distance(&point, &centroid);
61
77
  if dist < min_dist {
@@ -63,38 +79,38 @@ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> R
63
79
  best_cluster = j;
64
80
  }
65
81
  }
66
-
82
+
67
83
  if labels[i] != best_cluster {
68
84
  changed = true;
69
85
  }
70
86
  labels[i] = best_cluster;
71
87
  }
72
-
88
+
73
89
  // Check for convergence
74
90
  if !changed && iteration > 0 {
75
91
  break;
76
92
  }
77
-
93
+
78
94
  // Update centroids
79
95
  for j in 0..k {
80
96
  let mut sum = Array1::<f64>::zeros(n_features);
81
97
  let mut count = 0;
82
-
98
+
83
99
  for i in 0..n_samples {
84
100
  if labels[i] == j {
85
101
  sum += &data_array.row(i);
86
102
  count += 1;
87
103
  }
88
104
  }
89
-
105
+
90
106
  if count > 0 {
91
107
  centroids.row_mut(j).assign(&(sum / count as f64));
92
108
  }
93
109
  }
94
-
110
+
95
111
  prev_labels.clone_from(&labels);
96
112
  }
97
-
113
+
98
114
  // Calculate inertia (sum of squared distances to nearest centroid)
99
115
  let mut inertia = 0.0;
100
116
  for i in 0..n_samples {
@@ -102,43 +118,75 @@ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> R
102
118
  let centroid = centroids.row(labels[i]);
103
119
  inertia += euclidean_distance(&point, &centroid).powi(2);
104
120
  }
105
-
121
+
106
122
  // Convert results to Ruby arrays
107
- let labels_array = ruby.ary_new();
123
+ let ruby = magnus::Ruby::get().unwrap();
124
+ let labels_array = RArray::new();
108
125
  for label in labels {
109
- labels_array.push(ruby.integer_from_i64(label as i64))?;
126
+ labels_array.push(Integer::from_value(ruby.eval(&format!("{}", label)).unwrap()).unwrap())?;
110
127
  }
111
-
112
- let centroids_array = ruby.ary_new();
128
+
129
+ let centroids_array = RArray::new();
113
130
  for i in 0..k {
114
- let row_array = ruby.ary_new();
131
+ let row_array = RArray::new();
115
132
  for j in 0..n_features {
116
133
  row_array.push(centroids[[i, j]])?;
117
134
  }
118
135
  centroids_array.push(row_array)?;
119
136
  }
120
-
137
+
121
138
  Ok((labels_array, centroids_array, inertia))
122
139
  }
123
140
 
124
141
  /// Predict cluster labels for new data given centroids
125
142
  fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
126
- let ruby = Ruby::get().unwrap();
127
-
128
- // Convert inputs using shared helpers
129
- let data_matrix = ruby_array_to_ndarray2(data)?;
130
- let centroids_matrix = ruby_array_to_ndarray2(centroids)?;
131
-
132
- let (n_samples, _) = data_matrix.dim();
133
-
143
+ // Convert inputs
144
+ let data_array: RArray = TryConvert::try_convert(data)?;
145
+ let centroids_array: RArray = TryConvert::try_convert(centroids)?;
146
+
147
+ let n_samples = data_array.len();
148
+ let k = centroids_array.len();
149
+
150
+ if n_samples == 0 {
151
+ return Err(Error::new(
152
+ magnus::exception::arg_error(),
153
+ "Data cannot be empty",
154
+ ));
155
+ }
156
+
157
+ // Get dimensions
158
+ let first_row: RArray = data_array.entry::<RArray>(0)?;
159
+ let n_features = first_row.len();
160
+
161
+ // Convert data to ndarray
162
+ let mut data_matrix = Array2::<f64>::zeros((n_samples, n_features));
163
+ for i in 0..n_samples {
164
+ let row: RArray = data_array.entry(i as isize)?;
165
+ for j in 0..n_features {
166
+ let val: f64 = row.entry(j as isize)?;
167
+ data_matrix[[i, j]] = val;
168
+ }
169
+ }
170
+
171
+ // Convert centroids to ndarray
172
+ let mut centroids_matrix = Array2::<f64>::zeros((k, n_features));
173
+ for i in 0..k {
174
+ let row: RArray = centroids_array.entry(i as isize)?;
175
+ for j in 0..n_features {
176
+ let val: f64 = row.entry(j as isize)?;
177
+ centroids_matrix[[i, j]] = val;
178
+ }
179
+ }
180
+
134
181
  // Predict labels
135
- let labels_array = ruby.ary_new();
136
-
182
+ let ruby = magnus::Ruby::get().unwrap();
183
+ let labels_array = RArray::new();
184
+
137
185
  for i in 0..n_samples {
138
186
  let point = data_matrix.row(i);
139
187
  let mut min_dist = f64::INFINITY;
140
188
  let mut best_cluster = 0;
141
-
189
+
142
190
  for (j, centroid) in centroids_matrix.axis_iter(Axis(0)).enumerate() {
143
191
  let dist = euclidean_distance(&point, &centroid);
144
192
  if dist < min_dist {
@@ -146,37 +194,30 @@ fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
146
194
  best_cluster = j;
147
195
  }
148
196
  }
149
-
150
- labels_array.push(ruby.integer_from_i64(best_cluster as i64))?;
197
+
198
+ labels_array.push(Integer::from_value(ruby.eval(&format!("{}", best_cluster)).unwrap()).unwrap())?;
151
199
  }
152
-
200
+
153
201
  Ok(labels_array)
154
202
  }
155
203
 
156
204
  /// K-means++ initialization
157
- fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Result<Array2<f64>, Error> {
205
+ fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
158
206
  let n_samples = data.nrows();
159
207
  let n_features = data.ncols();
160
-
161
- // Use seeded RNG if seed is provided, otherwise use thread_rng
162
- let mut rng: Box<dyn RngCore> = match random_seed {
163
- Some(seed) => {
164
- let seed_u64 = seed as u64;
165
- Box::new(StdRng::seed_from_u64(seed_u64))
166
- },
167
- None => Box::new(thread_rng()),
168
- };
169
-
208
+ let mut rng = thread_rng();
209
+
170
210
  let mut centroids = Array2::<f64>::zeros((k, n_features));
171
-
211
+
172
212
  // Choose first centroid randomly
173
213
  let first_idx = rng.gen_range(0..n_samples);
174
214
  centroids.row_mut(0).assign(&data.row(first_idx));
175
-
215
+
176
216
  // Choose remaining centroids
177
217
  for i in 1..k {
178
218
  let mut distances = vec![f64::INFINITY; n_samples];
179
-
219
+
220
+ // Calculate distance to nearest centroid for each point
180
221
  for j in 0..n_samples {
181
222
  for c in 0..i {
182
223
  let dist = euclidean_distance(&data.row(j), &centroids.row(c));
@@ -185,20 +226,25 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Re
185
226
  }
186
227
  }
187
228
  }
188
-
229
+
230
+ // Convert distances to probabilities
189
231
  let total: f64 = distances.iter().map(|d| d * d).sum();
190
232
  if total == 0.0 {
233
+ // All points are identical or we've selected duplicates
234
+ // Just use sequential points as centroids
191
235
  if i < n_samples {
192
236
  centroids.row_mut(i).assign(&data.row(i));
193
237
  } else {
238
+ // Reuse first point if we run out
194
239
  centroids.row_mut(i).assign(&data.row(0));
195
240
  }
196
241
  continue;
197
242
  }
198
-
243
+
244
+ // Choose next centroid with probability proportional to squared distance
199
245
  let mut cumsum = 0.0;
200
246
  let rand_val: f64 = rng.gen::<f64>() * total;
201
-
247
+
202
248
  for j in 0..n_samples {
203
249
  cumsum += distances[j] * distances[j];
204
250
  if cumsum >= rand_val {
@@ -207,7 +253,7 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Re
207
253
  }
208
254
  }
209
255
  }
210
-
256
+
211
257
  Ok(centroids)
212
258
  }
213
259
 
@@ -218,4 +264,4 @@ fn euclidean_distance(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
218
264
  .map(|(x, y)| (x - y).powi(2))
219
265
  .sum::<f64>()
220
266
  .sqrt()
221
- }
267
+ }