clusterkit 0.1.0.pre.1

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.toml +8 -0
  7. data/Gemfile +17 -0
  8. data/IMPLEMENTATION_NOTES.md +143 -0
  9. data/LICENSE.txt +21 -0
  10. data/PYTHON_COMPARISON.md +183 -0
  11. data/README.md +499 -0
  12. data/Rakefile +245 -0
  13. data/clusterkit.gemspec +45 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/examples/hdbscan_example.rb +147 -0
  21. data/examples/optimal_kmeans_example.rb +96 -0
  22. data/examples/pca_example.rb +114 -0
  23. data/examples/reproducible_umap.rb +99 -0
  24. data/examples/verbose_control.rb +43 -0
  25. data/ext/clusterkit/Cargo.toml +25 -0
  26. data/ext/clusterkit/extconf.rb +4 -0
  27. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
  28. data/ext/clusterkit/src/clustering.rs +267 -0
  29. data/ext/clusterkit/src/embedder.rs +413 -0
  30. data/ext/clusterkit/src/lib.rs +22 -0
  31. data/ext/clusterkit/src/svd.rs +112 -0
  32. data/ext/clusterkit/src/tests.rs +16 -0
  33. data/ext/clusterkit/src/utils.rs +33 -0
  34. data/lib/clusterkit/clustering/hdbscan.rb +177 -0
  35. data/lib/clusterkit/clustering.rb +213 -0
  36. data/lib/clusterkit/clusterkit.rb +9 -0
  37. data/lib/clusterkit/configuration.rb +24 -0
  38. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  39. data/lib/clusterkit/dimensionality/svd.rb +144 -0
  40. data/lib/clusterkit/dimensionality/umap.rb +311 -0
  41. data/lib/clusterkit/dimensionality.rb +29 -0
  42. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  43. data/lib/clusterkit/preprocessing.rb +106 -0
  44. data/lib/clusterkit/silence.rb +42 -0
  45. data/lib/clusterkit/utils.rb +51 -0
  46. data/lib/clusterkit/version.rb +5 -0
  47. data/lib/clusterkit.rb +93 -0
  48. data/lib/tasks/visualize.rake +641 -0
  49. metadata +194 -0

data/examples/pca_example.rb
@@ -0,0 +1,114 @@
+ #!/usr/bin/env ruby
+
+ require 'bundler/setup'
+ require 'clusterkit'
+
+ puts "PCA Example - Dimensionality Reduction and Variance Analysis"
+ puts "=" * 60
+
+ # Generate sample data with clear structure
+ # High variance in the first few dimensions, low variance in the rest
+ def generate_structured_data(n_samples: 100, n_features: 20)
+   data = []
+
+   n_samples.times do
+     point = []
+
+     # First dimension: high variance (range ~10)
+     point << rand * 10
+
+     # Second dimension: medium variance (range ~5)
+     point << rand * 5
+
+     # Third dimension: some variance (range ~2)
+     point << rand * 2
+
+     # Remaining dimensions: very low variance (noise)
+     (n_features - 3).times do
+       point << rand * 0.1
+     end
+
+     data << point
+   end
+
+   data
+ end
+
+ # Generate data
+ data = generate_structured_data(n_samples: 100, n_features: 20)
+ puts "\nGenerated #{data.size} samples with #{data.first.size} features"
+
+ # Perform PCA with different numbers of components
+ [2, 3, 5, 10].each do |n_components|
+   puts "\n" + "-" * 40
+   puts "PCA with #{n_components} components:"
+
+   pca = ClusterKit::PCA.new(n_components: n_components)
+   transformed = pca.fit_transform(data)
+
+   puts "  Transformed shape: #{transformed.size} x #{transformed.first.size}"
+
+   # Show explained variance for each component
+   puts "  Explained variance ratio:"
+   pca.explained_variance_ratio.each_with_index do |ratio, i|
+     puts "    PC#{i+1}: #{(ratio * 100).round(2)}%"
+   end
+
+   # Show cumulative explained variance
+   cumulative = pca.cumulative_explained_variance_ratio[-1]
+   puts "  Total variance explained: #{(cumulative * 100).round(2)}%"
+ end
+
+ # Demonstrate reconstruction
+ puts "\n" + "=" * 60
+ puts "Reconstruction Example:"
+ puts "-" * 40
+
+ # Use 2 components (should capture most variance)
+ pca_2 = ClusterKit::PCA.new(n_components: 2)
+ compressed = pca_2.fit_transform(data)
+ reconstructed = pca_2.inverse_transform(compressed)
+
+ # Calculate reconstruction error
+ sample_idx = 0
+ original = data[sample_idx]
+ recon = reconstructed[sample_idx]
+
+ puts "\nOriginal data point (first 5 features):"
+ puts "  #{original[0..4].map { |v| v.round(3) }.join(', ')}"
+
+ puts "\nReconstructed from 2 components (first 5 features):"
+ puts "  #{recon[0..4].map { |v| v.round(3) }.join(', ')}"
+
+ # Calculate mean squared error
+ mse = original.zip(recon).map { |o, r| (o - r) ** 2 }.sum / original.size
+ puts "\nReconstruction MSE: #{mse.round(4)}"
+
+ # Demonstrate data compression ratio
+ original_size = data.size * data.first.size
+ compressed_size = compressed.size * compressed.first.size
+ compression_ratio = (1 - compressed_size.to_f / original_size) * 100
+
+ puts "\nData Compression:"
+ puts "  Original size: #{original_size} values"
+ puts "  Compressed size: #{compressed_size} values"
+ puts "  Compression ratio: #{compression_ratio.round(1)}%"
+ puts "  Variance retained: #{(pca_2.cumulative_explained_variance_ratio[-1] * 100).round(1)}%"
+
+ # Compare with SVD
+ puts "\n" + "=" * 60
+ puts "PCA vs SVD Comparison:"
+ puts "-" * 40
+
+ # PCA (with mean centering)
+ pca = ClusterKit::PCA.new(n_components: 2)
+ pca_result = pca.fit_transform(data)
+
+ # SVD (without mean centering)
+ u, s, vt = ClusterKit.svd(data, 2)
+ svd_result = u
+
+ puts "PCA result (first point): #{pca_result[0].map { |v| v.round(3) }}"
+ puts "SVD result (first point): #{svd_result[0].map { |v| v.round(3) }}"
+ puts "\nNote: PCA centers the data (subtracts mean), SVD does not."
+ puts "This makes PCA better for finding principal components of variation."

data/examples/reproducible_umap.rb
@@ -0,0 +1,99 @@
+ #!/usr/bin/env ruby
+ # Example: Achieving reproducibility with UMAP despite random seed issues
+
+ require_relative '../lib/clusterkit'
+ require 'json'
+
+ # Due to upstream limitations, UMAP doesn't give perfectly reproducible results
+ # even with a fixed random_seed. Here are workarounds:
+
+ # Generate sample data
+ srand(42)
+ data = []
+ 3.times do |cluster|
+   center = Array.new(50) { rand * 0.1 + cluster * 2.0 }
+   30.times do
+     point = center.map { |c| c + (rand - 0.5) * 0.3 }
+     data << point
+   end
+ end
+
+ puts "Workaround 1: Cache transformed results"
+ puts "=" * 60
+
+ # First run: transform and save results
+ cache_file = "umap_results_cache.json"
+ if File.exist?(cache_file)
+   puts "Loading cached results from #{cache_file}"
+   embedded = JSON.parse(File.read(cache_file))
+ else
+   puts "No cache found, running UMAP..."
+   umap = ClusterKit::Dimensionality::UMAP.new(
+     n_components: 2,
+     n_neighbors: 5,
+     random_seed: 42  # Still use for *some* consistency
+   )
+   embedded = umap.fit_transform(data)
+
+   # Save results for reproducibility
+   File.write(cache_file, JSON.pretty_generate(embedded))
+   puts "Results cached to #{cache_file}"
+ end
+
+ puts "First 3 points:"
+ embedded[0..2].each_with_index do |point, i|
+   puts "  Point #{i}: [#{point[0].round(3)}, #{point[1].round(3)}]"
+ end
+
+ puts "\nWorkaround 2: Save and load fitted models"
+ puts "=" * 60
+
+ model_file = "umap_model.bin"
+
+ # Train and save model once
+ if File.exist?(model_file)
+   puts "Loading existing model from #{model_file}"
+   umap = ClusterKit::Dimensionality::UMAP.load(model_file)
+ else
+   puts "Training new model..."
+   umap = ClusterKit::Dimensionality::UMAP.new(
+     n_components: 2,
+     n_neighbors: 5,
+     random_seed: 42
+   )
+   umap.fit(data)
+   umap.save(model_file)
+   puts "Model saved to #{model_file}"
+ end
+
+ # Now transform new data with the same model
+ new_data = data[0..9]  # Take first 10 points as "new" data
+ transformed = umap.transform(new_data)
+ puts "Transformed 10 new points with saved model"
+ puts "First 3 transformed points:"
+ transformed[0..2].each_with_index do |point, i|
+   puts "  Point #{i}: [#{point[0].round(3)}, #{point[1].round(3)}]"
+ end
+
+ puts "\nWorkaround 3: Use PCA for deterministic reduction"
+ puts "=" * 60
+
+ # PCA is deterministic - same input always gives same output
+ pca = ClusterKit::Dimensionality::PCA.new(n_components: 2)
+ pca_result1 = pca.fit_transform(data)
+ pca_result2 = pca.fit_transform(data)  # Do it again
+
+ puts "PCA results are identical: #{pca_result1[0] == pca_result2[0]}"
+ puts "First point from run 1: [#{pca_result1[0][0].round(3)}, #{pca_result1[0][1].round(3)}]"
+ puts "First point from run 2: [#{pca_result2[0][0].round(3)}, #{pca_result2[0][1].round(3)}]"
+
+ puts "\nRecommendations:"
+ puts "-" * 40
+ puts "1. For production pipelines, cache UMAP results"
+ puts "2. For model deployment, save fitted models and reuse them"
+ puts "3. For testing/CI, use PCA or cached test data"
+ puts "4. Accept small variations in UMAP results as normal"
+
+ # Clean up example files (uncomment to remove)
+ # File.delete(cache_file) if File.exist?(cache_file)
+ # File.delete(model_file) if File.exist?(model_file)
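
One refinement to Workaround 1: a fixed cache file name will happily serve stale embeddings if the input data changes between runs. A small sketch of content-addressed caching, keying the file to a digest of the input (the digest scheme and file naming here are illustrative, not part of the clusterkit API):

    require 'digest'

    # Hypothetical naming scheme: cache keyed to a hash of the input data.
    digest = Digest::SHA256.hexdigest(JSON.generate(data))
    keyed_cache = "umap_cache_#{digest[0, 12]}.json"

    embedded =
      if File.exist?(keyed_cache)
        JSON.parse(File.read(keyed_cache))
      else
        umap = ClusterKit::Dimensionality::UMAP.new(n_components: 2, n_neighbors: 5, random_seed: 42)
        result = umap.fit_transform(data)
        File.write(keyed_cache, JSON.generate(result))
        result
      end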

data/examples/verbose_control.rb
@@ -0,0 +1,43 @@
+ #!/usr/bin/env ruby
+ # Example demonstrating how to control verbose output from clusterkit
+
+ require 'bundler/setup'
+ require 'clusterkit'
+
+ # Generate some random test data
+ data = Array.new(50) { Array.new(20) { rand } }
+
+ puts "=" * 60
+ puts "clusterkit Verbose Output Control Demo"
+ puts "=" * 60
+
+ puts "\n1. Default behavior (quiet mode):"
+ puts "-" * 40
+ umap1 = ClusterKit::UMAP.new(n_components: 2, n_neighbors: 10)
+ result1 = umap1.fit_transform(data)
+ puts "✓ UMAP completed silently"
+ puts "  Result shape: #{result1.length} x #{result1.first.length}"
+
+ puts "\n2. Enable verbose output:"
+ puts "-" * 40
+ ClusterKit.configure do |config|
+   config.verbose = true
+ end
+
+ umap2 = ClusterKit::UMAP.new(n_components: 2, n_neighbors: 10)
+ puts "Running UMAP with verbose output enabled..."
+ result2 = umap2.fit_transform(data)
+ puts "✓ UMAP completed with debug output"
+
+ puts "\n3. Back to quiet mode:"
+ puts "-" * 40
+ ClusterKit.configuration.verbose = false
+
+ umap3 = ClusterKit::UMAP.new(n_components: 2, n_neighbors: 10)
+ result3 = umap3.fit_transform(data)
+ puts "✓ UMAP completed silently again"
+
+ puts "\n" + "=" * 60
+ puts "You can also set verbose mode via environment variable:"
+ puts "  ANNEMBED_VERBOSE=true ruby your_script.rb"
+ puts "=" * 60

data/ext/clusterkit/Cargo.toml
@@ -0,0 +1,25 @@
+ [package]
+ name = "clusterkit"
+ version = "0.1.0"
+ edition = "2021"
+
+ [lib]
+ crate-type = ["cdylib"]
+
+ [dependencies]
+ magnus = { version = "0.6", features = ["embed"] }
+ annembed = { git = "https://github.com/cpetersen/annembed", tag = "clusterkit-0.1.0" }
+ hnsw_rs = { git = "https://github.com/cpetersen/hnswlib-rs", tag = "clusterkit-0.1.0" }
+ hdbscan = "0.11"
+ ndarray = "0.16"
+ num-traits = "0.2"
+ rayon = "1.7"
+ serde = { version = "1.0", features = ["derive"] }
+ bincode = "1.3"
+ rand = "0.8"
+
+ [features]
+ default = ["openblas-static"]
+ openblas-static = ["annembed/openblas-static"]
+ openblas-system = ["annembed/openblas-system"]
+ intel-mkl-static = ["annembed/intel-mkl-static"]

data/ext/clusterkit/extconf.rb
@@ -0,0 +1,4 @@
+ require "mkmf"
+ require "rb_sys/mkmf"
+
+ create_rust_makefile("clusterkit/clusterkit")

data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs
@@ -0,0 +1,115 @@
+ use magnus::{function, prelude::*, Error, Value, RArray, RHash, Integer, TryConvert};
+ use hdbscan::{Hdbscan, HdbscanHyperParams};
+
+ /// Perform HDBSCAN clustering
+ /// Returns a hash with labels and basic statistics
+ pub fn hdbscan_fit(
+     data: Value,
+     min_samples: usize,
+     min_cluster_size: usize,
+     metric: String,
+ ) -> Result<RHash, Error> {
+     // Convert and validate the Ruby array input
+     let rarray: RArray = TryConvert::try_convert(data)?;
+     let n_samples = rarray.len();
+
+     if n_samples == 0 {
+         return Err(Error::new(
+             magnus::exception::arg_error(),
+             "Data cannot be empty",
+         ));
+     }
+
+     // Get dimensions
+     let first_row: RArray = rarray.entry::<RArray>(0)?;
+     let n_features = first_row.len();
+
+     // Convert to Vec<Vec<f64>> format expected by hdbscan crate
+     let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
+     for i in 0..n_samples {
+         let row: RArray = rarray.entry(i as isize)?;
+         let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
+         for j in 0..n_features {
+             let val: f64 = row.entry(j as isize)?;
+             row_vec.push(val);
+         }
+         data_vec.push(row_vec);
+     }
+
+     // Note: hdbscan crate doesn't support custom metrics directly
+     // We'll use the default Euclidean distance for now
+     if metric != "euclidean" && metric != "l2" {
+         eprintln!("Warning: Current hdbscan version only supports Euclidean distance. Using Euclidean.");
+     }
+
+     // Adjust parameters to avoid index out of bounds errors
+     // The hdbscan crate has issues when min_samples >= n_samples
+     let adjusted_min_samples = min_samples.min(n_samples.saturating_sub(1)).max(1);
+     let adjusted_min_cluster_size = min_cluster_size.min(n_samples).max(2);
+
+     // Create hyperparameters
+     let hyper_params = HdbscanHyperParams::builder()
+         .min_cluster_size(adjusted_min_cluster_size)
+         .min_samples(adjusted_min_samples)
+         .build();
+
+     // Create HDBSCAN instance and run clustering
+     let clusterer = Hdbscan::new(&data_vec, hyper_params);
+
+     // Run the clustering algorithm - cluster() returns Result<Vec<i32>, HdbscanError>
+     let labels = clusterer.cluster().map_err(|e| {
+         Error::new(
+             magnus::exception::runtime_error(),
+             format!("HDBSCAN clustering failed: {:?}", e)
+         )
+     })?;
+
+     // Convert results to Ruby types
+     let ruby = magnus::Ruby::get().unwrap();
+     let result = RHash::new();
+
+     // Convert labels (i32 to Ruby Integer, -1 for noise)
+     let labels_array = RArray::new();
+     for &label in labels.iter() {
+         labels_array.push(Integer::from_value(
+             ruby.eval(&format!("{}", label)).unwrap()
+         ).unwrap())?;
+     }
+     result.aset("labels", labels_array)?;
+
+     // For now, we'll create dummy probabilities and outlier scores
+     // since the basic hdbscan crate doesn't provide these
+     // In the future, we could calculate these ourselves or use a more advanced implementation
+
+     // Create probabilities array (all 1.0 for clustered points, 0.0 for noise)
+     let probs_array = RArray::new();
+     for &label in labels.iter() {
+         let prob = if label == -1 { 0.0 } else { 1.0 };
+         probs_array.push(prob)?;
+     }
+     result.aset("probabilities", probs_array)?;
+
+     // Create outlier scores array (0.0 for clustered points, 1.0 for noise)
+     let outlier_array = RArray::new();
+     for &label in labels.iter() {
+         let score = if label == -1 { 1.0 } else { 0.0 };
+         outlier_array.push(score)?;
+     }
+     result.aset("outlier_scores", outlier_array)?;
+
+     // Create empty cluster persistence hash for now
+     let persistence_hash = RHash::new();
+     result.aset("cluster_persistence", persistence_hash)?;
+
+     Ok(result)
+ }
+
+ /// Initialize HDBSCAN module functions
+ pub fn init(clustering_module: &magnus::RModule) -> Result<(), Error> {
+     clustering_module.define_singleton_method(
+         "hdbscan_rust",
+         function!(hdbscan_fit, 4),
+     )?;
+
+     Ok(())
+ }
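
Since init registers hdbscan_fit as hdbscan_rust on the Clustering module, the hash it builds can be inspected from Ruby. A sketch of what a caller sees, assuming the parent module is ClusterKit (the public entry point is the HDBSCAN class in lib/clusterkit/clustering/hdbscan.rb; the direct call below only illustrates the returned shape):

    data = Array.new(60) { Array.new(8) { rand } }

    result = ClusterKit::Clustering.hdbscan_rust(data, 5, 5, "euclidean")
    labels = result["labels"]          # one Integer per point; -1 marks noise
    probs  = result["probabilities"]   # placeholder: 1.0 if clustered, 0.0 if noise
    scores = result["outlier_scores"]  # placeholder: 0.0 if clustered, 1.0 if noise

    puts "clusters: #{(labels.uniq - [-1]).size}, noise points: #{labels.count(-1)}"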

data/ext/clusterkit/src/clustering.rs
@@ -0,0 +1,267 @@
+ use magnus::{function, prelude::*, Error, Value, RArray, Integer, TryConvert};
+ use ndarray::{Array1, Array2, ArrayView1, Axis};
+ use rand::prelude::*;
+
+ mod hdbscan_wrapper;
+
+ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
+     let clustering_module = parent.define_module("Clustering")?;
+
+     clustering_module.define_singleton_method(
+         "kmeans_rust",
+         function!(kmeans, 3),
+     )?;
+
+     clustering_module.define_singleton_method(
+         "kmeans_predict_rust",
+         function!(kmeans_predict, 2),
+     )?;
+
+     // Initialize HDBSCAN functions
+     hdbscan_wrapper::init(&clustering_module)?;
+
+     Ok(())
+ }
+
+ /// Perform K-means clustering
+ /// Returns (labels, centroids, inertia)
+ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64), Error> {
+     // Convert Ruby array to ndarray
+     let rarray: RArray = TryConvert::try_convert(data)?;
+     let n_samples = rarray.len();
+
+     if n_samples == 0 {
+         return Err(Error::new(
+             magnus::exception::arg_error(),
+             "Data cannot be empty",
+         ));
+     }
+
+     // Get dimensions
+     let first_row: RArray = rarray.entry::<RArray>(0)?;
+     let n_features = first_row.len();
+
+     if k > n_samples {
+         return Err(Error::new(
+             magnus::exception::arg_error(),
+             format!("k ({}) cannot be larger than number of samples ({})", k, n_samples),
+         ));
+     }
+
+     // Convert to ndarray
+     let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
+     for i in 0..n_samples {
+         let row: RArray = rarray.entry(i as isize)?;
+         for j in 0..n_features {
+             let val: f64 = row.entry(j as isize)?;
+             data_array[[i, j]] = val;
+         }
+     }
+
+     // Initialize centroids using K-means++
+     let mut centroids = kmeans_plusplus(&data_array, k)?;
+     let mut labels = vec![0usize; n_samples];
+     let mut prev_labels = vec![0usize; n_samples];
+
+     // K-means iterations
+     for iteration in 0..max_iter {
+         // Assign points to nearest centroid
+         let mut changed = false;
+         for i in 0..n_samples {
+             let point = data_array.row(i);
+             let mut min_dist = f64::INFINITY;
+             let mut best_cluster = 0;
+
+             for (j, centroid) in centroids.axis_iter(Axis(0)).enumerate() {
+                 let dist = euclidean_distance(&point, &centroid);
+                 if dist < min_dist {
+                     min_dist = dist;
+                     best_cluster = j;
+                 }
+             }
+
+             if labels[i] != best_cluster {
+                 changed = true;
+             }
+             labels[i] = best_cluster;
+         }
+
+         // Check for convergence
+         if !changed && iteration > 0 {
+             break;
+         }
+
+         // Update centroids
+         for j in 0..k {
+             let mut sum = Array1::<f64>::zeros(n_features);
+             let mut count = 0;
+
+             for i in 0..n_samples {
+                 if labels[i] == j {
+                     sum += &data_array.row(i);
+                     count += 1;
+                 }
+             }
+
+             if count > 0 {
+                 centroids.row_mut(j).assign(&(sum / count as f64));
+             }
+         }
+
+         prev_labels.clone_from(&labels);
+     }
+
+     // Calculate inertia (sum of squared distances to nearest centroid)
+     let mut inertia = 0.0;
+     for i in 0..n_samples {
+         let point = data_array.row(i);
+         let centroid = centroids.row(labels[i]);
+         inertia += euclidean_distance(&point, &centroid).powi(2);
+     }
+
+     // Convert results to Ruby arrays
+     let ruby = magnus::Ruby::get().unwrap();
+     let labels_array = RArray::new();
+     for label in labels {
+         labels_array.push(Integer::from_value(ruby.eval(&format!("{}", label)).unwrap()).unwrap())?;
+     }
+
+     let centroids_array = RArray::new();
+     for i in 0..k {
+         let row_array = RArray::new();
+         for j in 0..n_features {
+             row_array.push(centroids[[i, j]])?;
+         }
+         centroids_array.push(row_array)?;
+     }
+
+     Ok((labels_array, centroids_array, inertia))
+ }
+
+ /// Predict cluster labels for new data given centroids
+ fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
+     // Convert inputs
+     let data_array: RArray = TryConvert::try_convert(data)?;
+     let centroids_array: RArray = TryConvert::try_convert(centroids)?;
+
+     let n_samples = data_array.len();
+     let k = centroids_array.len();
+
+     if n_samples == 0 {
+         return Err(Error::new(
+             magnus::exception::arg_error(),
+             "Data cannot be empty",
+         ));
+     }
+
+     // Get dimensions
+     let first_row: RArray = data_array.entry::<RArray>(0)?;
+     let n_features = first_row.len();
+
+     // Convert data to ndarray
+     let mut data_matrix = Array2::<f64>::zeros((n_samples, n_features));
+     for i in 0..n_samples {
+         let row: RArray = data_array.entry(i as isize)?;
+         for j in 0..n_features {
+             let val: f64 = row.entry(j as isize)?;
+             data_matrix[[i, j]] = val;
+         }
+     }
+
+     // Convert centroids to ndarray
+     let mut centroids_matrix = Array2::<f64>::zeros((k, n_features));
+     for i in 0..k {
+         let row: RArray = centroids_array.entry(i as isize)?;
+         for j in 0..n_features {
+             let val: f64 = row.entry(j as isize)?;
+             centroids_matrix[[i, j]] = val;
+         }
+     }
+
+     // Predict labels
+     let ruby = magnus::Ruby::get().unwrap();
+     let labels_array = RArray::new();
+
+     for i in 0..n_samples {
+         let point = data_matrix.row(i);
+         let mut min_dist = f64::INFINITY;
+         let mut best_cluster = 0;
+
+         for (j, centroid) in centroids_matrix.axis_iter(Axis(0)).enumerate() {
+             let dist = euclidean_distance(&point, &centroid);
+             if dist < min_dist {
+                 min_dist = dist;
+                 best_cluster = j;
+             }
+         }
+
+         labels_array.push(Integer::from_value(ruby.eval(&format!("{}", best_cluster)).unwrap()).unwrap())?;
+     }
+
+     Ok(labels_array)
+ }
+
+ /// K-means++ initialization
+ fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
+     let n_samples = data.nrows();
+     let n_features = data.ncols();
+     let mut rng = thread_rng();
+
+     let mut centroids = Array2::<f64>::zeros((k, n_features));
+
+     // Choose first centroid randomly
+     let first_idx = rng.gen_range(0..n_samples);
+     centroids.row_mut(0).assign(&data.row(first_idx));
+
+     // Choose remaining centroids
+     for i in 1..k {
+         let mut distances = vec![f64::INFINITY; n_samples];
+
+         // Calculate distance to nearest centroid for each point
+         for j in 0..n_samples {
+             for c in 0..i {
+                 let dist = euclidean_distance(&data.row(j), &centroids.row(c));
+                 if dist < distances[j] {
+                     distances[j] = dist;
+                 }
+             }
+         }
+
+         // Convert distances to probabilities
+         let total: f64 = distances.iter().map(|d| d * d).sum();
+         if total == 0.0 {
+             // All points are identical or we've selected duplicates
+             // Just use sequential points as centroids
+             if i < n_samples {
+                 centroids.row_mut(i).assign(&data.row(i));
+             } else {
+                 // Reuse first point if we run out
+                 centroids.row_mut(i).assign(&data.row(0));
+             }
+             continue;
+         }
+
+         // Choose next centroid with probability proportional to squared distance
+         let mut cumsum = 0.0;
+         let rand_val: f64 = rng.gen::<f64>() * total;
+
+         for j in 0..n_samples {
+             cumsum += distances[j] * distances[j];
+             if cumsum >= rand_val {
+                 centroids.row_mut(i).assign(&data.row(j));
+                 break;
+             }
+         }
+     }
+
+     Ok(centroids)
+ }
+
+ /// Calculate Euclidean distance between two points
+ fn euclidean_distance(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
+     a.iter()
+         .zip(b.iter())
+         .map(|(x, y)| (x - y).powi(2))
+         .sum::<f64>()
+         .sqrt()
+ }
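
The selection step in kmeans_plusplus above is D-squared sampling: each point's chance of becoming the next centroid is proportional to its squared distance from the nearest centroid chosen so far (the Rust stores unsquared distances and squares them at selection time). The same logic restated as a self-contained Ruby sketch, not part of the gem's API:

    # D^2 sampling restated in Ruby; squared distances are kept throughout,
    # which is equivalent to the Rust's square-at-selection-time approach.
    def dist2(a, b)
      a.zip(b).sum { |x, y| (x - y)**2 }
    end

    def kmeans_plusplus_sketch(points, k)
      centroids = [points.sample]            # first centroid: uniform random
      (k - 1).times do
        # squared distance from each point to its nearest chosen centroid
        d2 = points.map { |p| centroids.map { |c| dist2(p, c) }.min }
        total = d2.sum
        break if total.zero?                 # all points coincide with chosen centroids

        r = rand * total                     # draw proportional to d2
        cumsum = 0.0
        idx = d2.find_index { |d| (cumsum += d) >= r } || d2.size - 1
        centroids << points[idx]
      end
      centroids
    end

    seeds = kmeans_plusplus_sketch(Array.new(100) { Array.new(3) { rand } }, 4)
    puts "chose #{seeds.size} seed centroids"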