clusterkit 0.3.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.lock +3228 -0
  7. data/Cargo.toml +8 -0
  8. data/Gemfile +17 -0
  9. data/IMPLEMENTATION_NOTES.md +143 -0
  10. data/LICENSE.txt +21 -0
  11. data/PYTHON_COMPARISON.md +183 -0
  12. data/README.md +744 -0
  13. data/Rakefile +259 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/docs/assets/clusterkit-wide.png +0 -0
  21. data/docs/assets/clusterkit.png +0 -0
  22. data/docs/assets/visualization.png +0 -0
  23. data/examples/hdbscan_example.rb +147 -0
  24. data/examples/optimal_kmeans_example.rb +96 -0
  25. data/examples/pca_example.rb +114 -0
  26. data/examples/reproducible_umap.rb +99 -0
  27. data/examples/verbose_control.rb +43 -0
  28. data/ext/clusterkit/Cargo.toml +26 -0
  29. data/ext/clusterkit/extconf.rb +23 -0
  30. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +80 -0
  31. data/ext/clusterkit/src/clustering.rs +221 -0
  32. data/ext/clusterkit/src/embedder.rs +349 -0
  33. data/ext/clusterkit/src/hnsw.rs +579 -0
  34. data/ext/clusterkit/src/lib.rs +24 -0
  35. data/ext/clusterkit/src/svd.rs +89 -0
  36. data/ext/clusterkit/src/tests.rs +16 -0
  37. data/ext/clusterkit/src/utils.rs +183 -0
  38. data/lib/clusterkit/3.1/clusterkit.bundle +0 -0
  39. data/lib/clusterkit/3.2/clusterkit.bundle +0 -0
  40. data/lib/clusterkit/3.3/clusterkit.bundle +0 -0
  41. data/lib/clusterkit/3.4/clusterkit.bundle +0 -0
  42. data/lib/clusterkit/clustering/hdbscan.rb +164 -0
  43. data/lib/clusterkit/clustering.rb +194 -0
  44. data/lib/clusterkit/clusterkit.rb +14 -0
  45. data/lib/clusterkit/configuration.rb +24 -0
  46. data/lib/clusterkit/data_validator.rb +132 -0
  47. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  48. data/lib/clusterkit/dimensionality/svd.rb +175 -0
  49. data/lib/clusterkit/dimensionality/umap.rb +282 -0
  50. data/lib/clusterkit/dimensionality.rb +29 -0
  51. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  52. data/lib/clusterkit/hnsw.rb +251 -0
  53. data/lib/clusterkit/preprocessing.rb +106 -0
  54. data/lib/clusterkit/silence.rb +42 -0
  55. data/lib/clusterkit/utils.rb +51 -0
  56. data/lib/clusterkit/version.rb +5 -0
  57. data/lib/clusterkit.rb +105 -0
  58. data/lib/tasks/visualize.rake +641 -0
  59. metadata +214 -0
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'clusterkit'
5
+
6
+ puts "PCA Example - Dimensionality Reduction and Variance Analysis"
7
+ puts "=" * 60
8
+
9
# Build a synthetic dataset whose spread shrinks column by column.
# The first three columns are uniform draws scaled to ranges of roughly
# 10, 5 and 2; every remaining column is low-amplitude noise in [0, 0.1).
#
# n_samples  - number of rows to generate
# n_features - number of columns per row (three leading columns are always emitted)
#
# Returns an Array of rows, each row an Array of Floats.
def generate_structured_data(n_samples: 100, n_features: 20)
  leading_ranges = [10, 5, 2]

  n_samples.times.map do
    # Leading columns first, in order, so the random draws happen in a fixed sequence.
    row = leading_ranges.map { |range| rand * range }

    # Pad the rest of the row with near-constant noise.
    (n_features - 3).times { row << rand * 0.1 }

    row
  end
end
36
+
37
+ # Generate data
38
+ data = generate_structured_data(n_samples: 100, n_features: 20)
39
+ puts "\nGenerated #{data.size} samples with #{data.first.size} features"
40
+
41
+ # Perform PCA with different numbers of components
42
+ [2, 3, 5, 10].each do |n_components|
43
+ puts "\n" + "-" * 40
44
+ puts "PCA with #{n_components} components:"
45
+
46
+ pca = ClusterKit::PCA.new(n_components: n_components)
47
+ transformed = pca.fit_transform(data)
48
+
49
+ puts " Transformed shape: #{transformed.size} x #{transformed.first.size}"
50
+
51
+ # Show explained variance for each component
52
+ puts " Explained variance ratio:"
53
+ pca.explained_variance_ratio.each_with_index do |ratio, i|
54
+ puts " PC#{i+1}: #{(ratio * 100).round(2)}%"
55
+ end
56
+
57
+ # Show cumulative explained variance
58
+ cumulative = pca.cumulative_explained_variance_ratio[-1]
59
+ puts " Total variance explained: #{(cumulative * 100).round(2)}%"
60
+ end
61
+
62
+ # Demonstrate reconstruction
63
+ puts "\n" + "=" * 60
64
+ puts "Reconstruction Example:"
65
+ puts "-" * 40
66
+
67
+ # Use 2 components (should capture most variance)
68
+ pca_2 = ClusterKit::PCA.new(n_components: 2)
69
+ compressed = pca_2.fit_transform(data)
70
+ reconstructed = pca_2.inverse_transform(compressed)
71
+
72
+ # Calculate reconstruction error
73
+ sample_idx = 0
74
+ original = data[sample_idx]
75
+ recon = reconstructed[sample_idx]
76
+
77
+ puts "\nOriginal data point (first 5 features):"
78
+ puts " #{original[0..4].map { |v| v.round(3) }.join(', ')}"
79
+
80
+ puts "\nReconstructed from 2 components (first 5 features):"
81
+ puts " #{recon[0..4].map { |v| v.round(3) }.join(', ')}"
82
+
83
+ # Calculate mean squared error
84
+ mse = original.zip(recon).map { |o, r| (o - r) ** 2 }.sum / original.size
85
+ puts "\nReconstruction MSE: #{mse.round(4)}"
86
+
87
+ # Demonstrate data compression ratio
88
+ original_size = data.size * data.first.size
89
+ compressed_size = compressed.size * compressed.first.size
90
+ compression_ratio = (1 - compressed_size.to_f / original_size) * 100
91
+
92
+ puts "\nData Compression:"
93
+ puts " Original size: #{original_size} values"
94
+ puts " Compressed size: #{compressed_size} values"
95
+ puts " Compression ratio: #{compression_ratio.round(1)}%"
96
+ puts " Variance retained: #{(pca_2.cumulative_explained_variance_ratio[-1] * 100).round(1)}%"
97
+
98
+ # Compare with SVD
99
+ puts "\n" + "=" * 60
100
+ puts "PCA vs SVD Comparison:"
101
+ puts "-" * 40
102
+
103
+ # PCA (with mean centering)
104
+ pca = ClusterKit::PCA.new(n_components: 2)
105
+ pca_result = pca.fit_transform(data)
106
+
107
+ # SVD (without mean centering)
108
+ u, s, vt = ClusterKit.svd(data, 2)
109
+ svd_result = u
110
+
111
+ puts "PCA result (first point): #{pca_result[0].map { |v| v.round(3) }}"
112
+ puts "SVD result (first point): #{svd_result[0].map { |v| v.round(3) }}"
113
+ puts "\nNote: PCA centers the data (subtracts mean), SVD does not."
114
+ puts "This makes PCA better for finding principal components of variation."
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env ruby
2
+ # Example: Achieving reproducibility with UMAP despite random seed issues
3
+
4
+ require_relative '../lib/clusterkit'
5
+ require 'json'
6
+
7
+ # Due to upstream limitations, UMAP doesn't give perfectly reproducible results
8
+ # even with a fixed random_seed. Here are workarounds:
9
+
10
+ # Generate sample data
11
+ srand(42)
12
+ data = []
13
+ 3.times do |cluster|
14
+ center = Array.new(50) { rand * 0.1 + cluster * 2.0 }
15
+ 30.times do
16
+ point = center.map { |c| c + (rand - 0.5) * 0.3 }
17
+ data << point
18
+ end
19
+ end
20
+
21
+ puts "Workaround 1: Cache transformed results"
22
+ puts "=" * 60
23
+
24
+ # First run: transform and save results
25
+ cache_file = "umap_results_cache.json"
26
+ if File.exist?(cache_file)
27
+ puts "Loading cached results from #{cache_file}"
28
+ embedded = JSON.parse(File.read(cache_file))
29
+ else
30
+ puts "No cache found, running UMAP..."
31
+ umap = ClusterKit::Dimensionality::UMAP.new(
32
+ n_components: 2,
33
+ n_neighbors: 5,
34
+ random_seed: 42 # Still use for *some* consistency
35
+ )
36
+ embedded = umap.fit_transform(data)
37
+
38
+ # Save results for reproducibility
39
+ File.write(cache_file, JSON.pretty_generate(embedded))
40
+ puts "Results cached to #{cache_file}"
41
+ end
42
+
43
+ puts "First 3 points:"
44
+ embedded[0..2].each_with_index do |point, i|
45
+ puts " Point #{i}: [#{point[0].round(3)}, #{point[1].round(3)}]"
46
+ end
47
+
48
+ puts "\nWorkaround 2: Save and load fitted models"
49
+ puts "=" * 60
50
+
51
+ model_file = "umap_model.bin"
52
+
53
+ # Train and save model once
54
+ if File.exist?(model_file)
55
+ puts "Loading existing model from #{model_file}"
56
+ umap = ClusterKit::Dimensionality::UMAP.load(model_file)
57
+ else
58
+ puts "Training new model..."
59
+ umap = ClusterKit::Dimensionality::UMAP.new(
60
+ n_components: 2,
61
+ n_neighbors: 5,
62
+ random_seed: 42
63
+ )
64
+ umap.fit(data)
65
+ umap.save(model_file)
66
+ puts "Model saved to #{model_file}"
67
+ end
68
+
69
+ # Now transform new data with the same model
70
+ new_data = data[0..9] # Take first 10 points as "new" data
71
+ transformed = umap.transform(new_data)
72
+ puts "Transformed 10 new points with saved model"
73
+ puts "First 3 transformed points:"
74
+ transformed[0..2].each_with_index do |point, i|
75
+ puts " Point #{i}: [#{point[0].round(3)}, #{point[1].round(3)}]"
76
+ end
77
+
78
+ puts "\nWorkaround 3: Use PCA for deterministic reduction"
79
+ puts "=" * 60
80
+
81
+ # PCA is deterministic - same input always gives same output
82
+ pca = ClusterKit::Dimensionality::PCA.new(n_components: 2)
83
+ pca_result1 = pca.fit_transform(data)
84
+ pca_result2 = pca.fit_transform(data) # Do it again
85
+
86
+ puts "PCA results are identical: #{pca_result1[0] == pca_result2[0]}"
87
+ puts "First point from run 1: [#{pca_result1[0][0].round(3)}, #{pca_result1[0][1].round(3)}]"
88
+ puts "First point from run 2: [#{pca_result2[0][0].round(3)}, #{pca_result2[0][1].round(3)}]"
89
+
90
+ puts "\nRecommendations:"
91
+ puts "-" * 40
92
+ puts "1. For production pipelines, cache UMAP results"
93
+ puts "2. For model deployment, save fitted models and reuse them"
94
+ puts "3. For testing/CI, use PCA or cached test data"
95
+ puts "4. Accept small variations in UMAP results as normal"
96
+
97
+ # Clean up example files (uncomment to remove)
98
+ # File.delete(cache_file) if File.exist?(cache_file)
99
+ # File.delete(model_file) if File.exist?(model_file)
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env ruby
2
+ # Example demonstrating how to control verbose output from clusterkit
3
+
4
+ require 'bundler/setup'
5
+ require 'clusterkit'
6
+
7
+ # Generate some random test data
8
+ data = Array.new(50) { Array.new(20) { rand } }
9
+
10
+ puts "=" * 60
11
+ puts "clusterkit Verbose Output Control Demo"
12
+ puts "=" * 60
13
+
14
+ puts "\n1. Default behavior (quiet mode):"
15
+ puts "-" * 40
16
+ umap1 = ClusterKit::UMAP.new(n_components: 2, n_neighbors: 10)
17
+ result1 = umap1.fit_transform(data)
18
+ puts "✓ UMAP completed silently"
19
+ puts " Result shape: #{result1.length} x #{result1.first.length}"
20
+
21
+ puts "\n2. Enable verbose output:"
22
+ puts "-" * 40
23
+ ClusterKit.configure do |config|
24
+ config.verbose = true
25
+ end
26
+
27
+ umap2 = ClusterKit::UMAP.new(n_components: 2, n_neighbors: 10)
28
+ puts "Running UMAP with verbose output enabled..."
29
+ result2 = umap2.fit_transform(data)
30
+ puts "✓ UMAP completed with debug output"
31
+
32
+ puts "\n3. Back to quiet mode:"
33
+ puts "-" * 40
34
+ ClusterKit.configuration.verbose = false
35
+
36
+ umap3 = ClusterKit::UMAP.new(n_components: 2, n_neighbors: 10)
37
+ result3 = umap3.fit_transform(data)
38
+ puts "✓ UMAP completed silently again"
39
+
40
+ puts "\n" + "=" * 60
41
+ puts "You can also set verbose mode via environment variable:"
42
+ puts " ANNEMBED_VERBOSE=true ruby your_script.rb"
43
+ puts "=" * 60
@@ -0,0 +1,26 @@
1
+ [package]
2
+ name = "clusterkit"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [lib]
7
+ crate-type = ["cdylib"]
8
+
9
+ [dependencies]
10
+ magnus = { version = "0.8", features = ["embed"] }
11
+ annembed = { git = "https://github.com/scientist-labs/annembed", tag = "clusterkit-0.2.6" }
12
+ hnsw_rs = { git = "https://github.com/scientist-labs/hnswlib-rs", tag = "clusterkit-0.1.0" }
13
+ hdbscan = "0.11"
14
+ ndarray = "0.16"
15
+ num-traits = "0.2"
16
+ rayon = "1.7"
17
+ serde = { version = "1.0", features = ["derive"] }
18
+ bincode = "1.3"
19
+ rand = "0.8"
20
+
21
+ [features]
22
+ default = ["openblas-static"]
23
+ openblas-static = ["annembed/openblas-static"]
24
+ openblas-system = ["annembed/openblas-system"]
25
+ intel-mkl-static = ["annembed/intel-mkl-static"]
26
+ macos-accelerate = ["annembed/macos-accelerate"]
@@ -0,0 +1,23 @@
1
+ require "mkmf"
2
+ require "rb_sys/mkmf"
3
+
4
create_rust_makefile("clusterkit/clusterkit") do |r|
  override = ENV["CLUSTERKIT_FEATURES"]

  # Pick the BLAS backend feature list. `nil` means "leave cargo's default
  # features alone" (any platform that is neither macOS nor Linux).
  backend_features =
    if override
      # An explicit override always wins
      # (e.g. CLUSTERKIT_FEATURES=openblas-static,... forces a backend).
      override.split(",")
    elsif RUBY_PLATFORM =~ /darwin/
      # macOS links the system Accelerate framework, so no OpenBLAS build is needed.
      ["macos-accelerate"]
    elsif RUBY_PLATFORM =~ /linux/
      # Linux links the SYSTEM OpenBLAS/LAPACK (apt: libopenblas-dev liblapack-dev
      # gfortran, provided by the rust-gem-cross image) rather than the default
      # `openblas-static` feature, which compiles OpenBLAS from C+Fortran source.
      # rb-sys-dock does NOT forward host env to extconf, and a .cargo/config.toml
      # [env] only reaches cargo-spawned procs (not mkmf), so this backend choice
      # has to live in committed code; it cannot be passed via a workflow input.
      ["openblas-system"]
    end

  if backend_features
    # Every explicit choice replaces the crate's default feature set.
    r.extra_cargo_args += ["--no-default-features"]
    r.features = backend_features
  end
end
@@ -0,0 +1,80 @@
1
+ use magnus::{function, prelude::*, Error, Value, RHash, Ruby};
2
+ use hdbscan::{Hdbscan, HdbscanHyperParams};
3
+ use crate::utils::ruby_array_to_vec_vec_f64;
4
+
5
+ /// Perform HDBSCAN clustering
6
+ /// Returns a hash with labels and basic statistics
7
+ pub fn hdbscan_fit(
8
+ data: Value,
9
+ min_samples: usize,
10
+ min_cluster_size: usize,
11
+ metric: String,
12
+ ) -> Result<RHash, Error> {
13
+ let ruby = Ruby::get().unwrap();
14
+
15
+ // Convert Ruby array to Vec<Vec<f64>> using shared helper
16
+ let data_vec = ruby_array_to_vec_vec_f64(data)?;
17
+ let n_samples = data_vec.len();
18
+
19
+ if metric != "euclidean" && metric != "l2" {
20
+ eprintln!("Warning: Current hdbscan version only supports Euclidean distance. Using Euclidean.");
21
+ }
22
+
23
+ // Adjust parameters to avoid index out of bounds errors
24
+ let adjusted_min_samples = min_samples.min(n_samples.saturating_sub(1)).max(1);
25
+ let adjusted_min_cluster_size = min_cluster_size.min(n_samples).max(2);
26
+
27
+ // Create hyperparameters
28
+ let hyper_params = HdbscanHyperParams::builder()
29
+ .min_cluster_size(adjusted_min_cluster_size)
30
+ .min_samples(adjusted_min_samples)
31
+ .build();
32
+
33
+ // Create HDBSCAN instance and run clustering
34
+ let clusterer = Hdbscan::new(&data_vec, hyper_params);
35
+
36
+ let labels = clusterer.cluster().map_err(|e| {
37
+ Error::new(
38
+ ruby.exception_runtime_error(),
39
+ format!("HDBSCAN clustering failed: {:?}", e)
40
+ )
41
+ })?;
42
+
43
+ // Convert results to Ruby types
44
+ let result = ruby.hash_new();
45
+
46
+ let labels_array = ruby.ary_new();
47
+ for &label in labels.iter() {
48
+ labels_array.push(ruby.integer_from_i64(label as i64))?;
49
+ }
50
+ result.aset("labels", labels_array)?;
51
+
52
+ let probs_array = ruby.ary_new();
53
+ for &label in labels.iter() {
54
+ let prob = if label == -1 { 0.0 } else { 1.0 };
55
+ probs_array.push(prob)?;
56
+ }
57
+ result.aset("probabilities", probs_array)?;
58
+
59
+ let outlier_array = ruby.ary_new();
60
+ for &label in labels.iter() {
61
+ let score = if label == -1 { 1.0 } else { 0.0 };
62
+ outlier_array.push(score)?;
63
+ }
64
+ result.aset("outlier_scores", outlier_array)?;
65
+
66
+ let persistence_hash = ruby.hash_new();
67
+ result.aset("cluster_persistence", persistence_hash)?;
68
+
69
+ Ok(result)
70
+ }
71
+
72
+ /// Initialize HDBSCAN module functions
73
+ pub fn init(clustering_module: &magnus::RModule) -> Result<(), Error> {
74
+ clustering_module.define_singleton_method(
75
+ "hdbscan_rust",
76
+ function!(hdbscan_fit, 4),
77
+ )?;
78
+
79
+ Ok(())
80
+ }
@@ -0,0 +1,221 @@
1
+ use magnus::{function, prelude::*, Error, Value, RArray, Ruby};
2
+ use ndarray::{Array1, Array2, ArrayView1, Axis};
3
+ use rand::prelude::*;
4
+ use rand::rngs::StdRng;
5
+ use rand::SeedableRng;
6
+ use crate::utils::ruby_array_to_ndarray2;
7
+
8
+ mod hdbscan_wrapper;
9
+
10
+ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
11
+ let clustering_module = parent.define_module("Clustering")?;
12
+
13
+ clustering_module.define_singleton_method(
14
+ "kmeans_rust",
15
+ function!(kmeans, 4),
16
+ )?;
17
+
18
+ clustering_module.define_singleton_method(
19
+ "kmeans_predict_rust",
20
+ function!(kmeans_predict, 2),
21
+ )?;
22
+
23
+ // Initialize HDBSCAN functions
24
+ hdbscan_wrapper::init(&clustering_module)?;
25
+
26
+ Ok(())
27
+ }
28
+
29
+ /// Perform K-means clustering
30
+ /// Returns (labels, centroids, inertia)
31
+ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> Result<(RArray, RArray, f64), Error> {
32
+ let ruby = Ruby::get().unwrap();
33
+
34
+ // Convert Ruby array to ndarray using shared helper
35
+ let data_array = ruby_array_to_ndarray2(data)?;
36
+ let (n_samples, n_features) = data_array.dim();
37
+
38
+ if k > n_samples {
39
+ return Err(Error::new(
40
+ ruby.exception_arg_error(),
41
+ format!("k ({}) cannot be larger than number of samples ({})", k, n_samples),
42
+ ));
43
+ }
44
+
45
+ // Initialize centroids using K-means++
46
+ let mut centroids = kmeans_plusplus(&data_array, k, random_seed)?;
47
+ let mut labels = vec![0usize; n_samples];
48
+ let mut prev_labels = vec![0usize; n_samples];
49
+
50
+ // K-means iterations
51
+ for iteration in 0..max_iter {
52
+ // Assign points to nearest centroid
53
+ let mut changed = false;
54
+ for i in 0..n_samples {
55
+ let point = data_array.row(i);
56
+ let mut min_dist = f64::INFINITY;
57
+ let mut best_cluster = 0;
58
+
59
+ for (j, centroid) in centroids.axis_iter(Axis(0)).enumerate() {
60
+ let dist = euclidean_distance(&point, &centroid);
61
+ if dist < min_dist {
62
+ min_dist = dist;
63
+ best_cluster = j;
64
+ }
65
+ }
66
+
67
+ if labels[i] != best_cluster {
68
+ changed = true;
69
+ }
70
+ labels[i] = best_cluster;
71
+ }
72
+
73
+ // Check for convergence
74
+ if !changed && iteration > 0 {
75
+ break;
76
+ }
77
+
78
+ // Update centroids
79
+ for j in 0..k {
80
+ let mut sum = Array1::<f64>::zeros(n_features);
81
+ let mut count = 0;
82
+
83
+ for i in 0..n_samples {
84
+ if labels[i] == j {
85
+ sum += &data_array.row(i);
86
+ count += 1;
87
+ }
88
+ }
89
+
90
+ if count > 0 {
91
+ centroids.row_mut(j).assign(&(sum / count as f64));
92
+ }
93
+ }
94
+
95
+ prev_labels.clone_from(&labels);
96
+ }
97
+
98
+ // Calculate inertia (sum of squared distances to nearest centroid)
99
+ let mut inertia = 0.0;
100
+ for i in 0..n_samples {
101
+ let point = data_array.row(i);
102
+ let centroid = centroids.row(labels[i]);
103
+ inertia += euclidean_distance(&point, &centroid).powi(2);
104
+ }
105
+
106
+ // Convert results to Ruby arrays
107
+ let labels_array = ruby.ary_new();
108
+ for label in labels {
109
+ labels_array.push(ruby.integer_from_i64(label as i64))?;
110
+ }
111
+
112
+ let centroids_array = ruby.ary_new();
113
+ for i in 0..k {
114
+ let row_array = ruby.ary_new();
115
+ for j in 0..n_features {
116
+ row_array.push(centroids[[i, j]])?;
117
+ }
118
+ centroids_array.push(row_array)?;
119
+ }
120
+
121
+ Ok((labels_array, centroids_array, inertia))
122
+ }
123
+
124
+ /// Predict cluster labels for new data given centroids
125
+ fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
126
+ let ruby = Ruby::get().unwrap();
127
+
128
+ // Convert inputs using shared helpers
129
+ let data_matrix = ruby_array_to_ndarray2(data)?;
130
+ let centroids_matrix = ruby_array_to_ndarray2(centroids)?;
131
+
132
+ let (n_samples, _) = data_matrix.dim();
133
+
134
+ // Predict labels
135
+ let labels_array = ruby.ary_new();
136
+
137
+ for i in 0..n_samples {
138
+ let point = data_matrix.row(i);
139
+ let mut min_dist = f64::INFINITY;
140
+ let mut best_cluster = 0;
141
+
142
+ for (j, centroid) in centroids_matrix.axis_iter(Axis(0)).enumerate() {
143
+ let dist = euclidean_distance(&point, &centroid);
144
+ if dist < min_dist {
145
+ min_dist = dist;
146
+ best_cluster = j;
147
+ }
148
+ }
149
+
150
+ labels_array.push(ruby.integer_from_i64(best_cluster as i64))?;
151
+ }
152
+
153
+ Ok(labels_array)
154
+ }
155
+
156
+ /// K-means++ initialization
157
+ fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Result<Array2<f64>, Error> {
158
+ let n_samples = data.nrows();
159
+ let n_features = data.ncols();
160
+
161
+ // Use seeded RNG if seed is provided, otherwise use thread_rng
162
+ let mut rng: Box<dyn RngCore> = match random_seed {
163
+ Some(seed) => {
164
+ let seed_u64 = seed as u64;
165
+ Box::new(StdRng::seed_from_u64(seed_u64))
166
+ },
167
+ None => Box::new(thread_rng()),
168
+ };
169
+
170
+ let mut centroids = Array2::<f64>::zeros((k, n_features));
171
+
172
+ // Choose first centroid randomly
173
+ let first_idx = rng.gen_range(0..n_samples);
174
+ centroids.row_mut(0).assign(&data.row(first_idx));
175
+
176
+ // Choose remaining centroids
177
+ for i in 1..k {
178
+ let mut distances = vec![f64::INFINITY; n_samples];
179
+
180
+ for j in 0..n_samples {
181
+ for c in 0..i {
182
+ let dist = euclidean_distance(&data.row(j), &centroids.row(c));
183
+ if dist < distances[j] {
184
+ distances[j] = dist;
185
+ }
186
+ }
187
+ }
188
+
189
+ let total: f64 = distances.iter().map(|d| d * d).sum();
190
+ if total == 0.0 {
191
+ if i < n_samples {
192
+ centroids.row_mut(i).assign(&data.row(i));
193
+ } else {
194
+ centroids.row_mut(i).assign(&data.row(0));
195
+ }
196
+ continue;
197
+ }
198
+
199
+ let mut cumsum = 0.0;
200
+ let rand_val: f64 = rng.gen::<f64>() * total;
201
+
202
+ for j in 0..n_samples {
203
+ cumsum += distances[j] * distances[j];
204
+ if cumsum >= rand_val {
205
+ centroids.row_mut(i).assign(&data.row(j));
206
+ break;
207
+ }
208
+ }
209
+ }
210
+
211
+ Ok(centroids)
212
+ }
213
+
214
+ /// Calculate Euclidean distance between two points
215
+ fn euclidean_distance(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
216
+ a.iter()
217
+ .zip(b.iter())
218
+ .map(|(x, y)| (x - y).powi(2))
219
+ .sum::<f64>()
220
+ .sqrt()
221
+ }