clusterkit 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.simplecov +47 -0
- data/CHANGELOG.md +35 -0
- data/CLAUDE.md +226 -0
- data/Cargo.toml +8 -0
- data/Gemfile +17 -0
- data/IMPLEMENTATION_NOTES.md +143 -0
- data/LICENSE.txt +21 -0
- data/PYTHON_COMPARISON.md +183 -0
- data/README.md +499 -0
- data/Rakefile +245 -0
- data/clusterkit.gemspec +45 -0
- data/docs/KNOWN_ISSUES.md +130 -0
- data/docs/RUST_ERROR_HANDLING.md +164 -0
- data/docs/TEST_FIXTURES.md +170 -0
- data/docs/UMAP_EXPLAINED.md +362 -0
- data/docs/UMAP_TROUBLESHOOTING.md +284 -0
- data/docs/VERBOSE_OUTPUT.md +84 -0
- data/examples/hdbscan_example.rb +147 -0
- data/examples/optimal_kmeans_example.rb +96 -0
- data/examples/pca_example.rb +114 -0
- data/examples/reproducible_umap.rb +99 -0
- data/examples/verbose_control.rb +43 -0
- data/ext/clusterkit/Cargo.toml +25 -0
- data/ext/clusterkit/extconf.rb +4 -0
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
- data/ext/clusterkit/src/clustering.rs +267 -0
- data/ext/clusterkit/src/embedder.rs +413 -0
- data/ext/clusterkit/src/lib.rs +22 -0
- data/ext/clusterkit/src/svd.rs +112 -0
- data/ext/clusterkit/src/tests.rs +16 -0
- data/ext/clusterkit/src/utils.rs +33 -0
- data/lib/clusterkit/clustering/hdbscan.rb +177 -0
- data/lib/clusterkit/clustering.rb +213 -0
- data/lib/clusterkit/clusterkit.rb +9 -0
- data/lib/clusterkit/configuration.rb +24 -0
- data/lib/clusterkit/dimensionality/pca.rb +251 -0
- data/lib/clusterkit/dimensionality/svd.rb +144 -0
- data/lib/clusterkit/dimensionality/umap.rb +311 -0
- data/lib/clusterkit/dimensionality.rb +29 -0
- data/lib/clusterkit/hdbscan_api_design.rb +142 -0
- data/lib/clusterkit/preprocessing.rb +106 -0
- data/lib/clusterkit/silence.rb +42 -0
- data/lib/clusterkit/utils.rb +51 -0
- data/lib/clusterkit/version.rb +5 -0
- data/lib/clusterkit.rb +93 -0
- data/lib/tasks/visualize.rake +641 -0
- metadata +194 -0
data/examples/pca_example.rb
@@ -0,0 +1,114 @@
+#!/usr/bin/env ruby
+
+require 'bundler/setup'
+require 'clusterkit'
+
+puts "PCA Example - Dimensionality Reduction and Variance Analysis"
+puts "=" * 60
+
+# Generate sample data with clear structure
+# High variance in first 2 dimensions, low variance in others
+def generate_structured_data(n_samples: 100, n_features: 20)
+  data = []
+
+  n_samples.times do
+    point = []
+
+    # First dimension: high variance (range ~10)
+    point << rand * 10
+
+    # Second dimension: medium variance (range ~5)
+    point << rand * 5
+
+    # Third dimension: some variance (range ~2)
+    point << rand * 2
+
+    # Remaining dimensions: very low variance (noise)
+    (n_features - 3).times do
+      point << rand * 0.1
+    end
+
+    data << point
+  end
+
+  data
+end
+
+# Generate data
+data = generate_structured_data(n_samples: 100, n_features: 20)
+puts "\nGenerated #{data.size} samples with #{data.first.size} features"
+
+# Perform PCA with different numbers of components
+[2, 3, 5, 10].each do |n_components|
+  puts "\n" + "-" * 40
+  puts "PCA with #{n_components} components:"
+
+  pca = ClusterKit::PCA.new(n_components: n_components)
+  transformed = pca.fit_transform(data)
+
+  puts " Transformed shape: #{transformed.size} x #{transformed.first.size}"
+
+  # Show explained variance for each component
+  puts " Explained variance ratio:"
+  pca.explained_variance_ratio.each_with_index do |ratio, i|
+    puts " PC#{i+1}: #{(ratio * 100).round(2)}%"
+  end
+
+  # Show cumulative explained variance
+  cumulative = pca.cumulative_explained_variance_ratio[-1]
+  puts " Total variance explained: #{(cumulative * 100).round(2)}%"
+end
+
+# Demonstrate reconstruction
+puts "\n" + "=" * 60
+puts "Reconstruction Example:"
+puts "-" * 40
+
+# Use 2 components (should capture most variance)
+pca_2 = ClusterKit::PCA.new(n_components: 2)
+compressed = pca_2.fit_transform(data)
+reconstructed = pca_2.inverse_transform(compressed)
+
+# Calculate reconstruction error
+sample_idx = 0
+original = data[sample_idx]
+recon = reconstructed[sample_idx]
+
+puts "\nOriginal data point (first 5 features):"
+puts " #{original[0..4].map { |v| v.round(3) }.join(', ')}"
+
+puts "\nReconstructed from 2 components (first 5 features):"
+puts " #{recon[0..4].map { |v| v.round(3) }.join(', ')}"
+
+# Calculate mean squared error
+mse = original.zip(recon).map { |o, r| (o - r) ** 2 }.sum / original.size
+puts "\nReconstruction MSE: #{mse.round(4)}"
+
+# Demonstrate data compression ratio
+original_size = data.size * data.first.size
+compressed_size = compressed.size * compressed.first.size
+compression_ratio = (1 - compressed_size.to_f / original_size) * 100
+
+puts "\nData Compression:"
+puts " Original size: #{original_size} values"
+puts " Compressed size: #{compressed_size} values"
+puts " Compression ratio: #{compression_ratio.round(1)}%"
+puts " Variance retained: #{(pca_2.cumulative_explained_variance_ratio[-1] * 100).round(1)}%"
+
+# Compare with SVD
+puts "\n" + "=" * 60
+puts "PCA vs SVD Comparison:"
+puts "-" * 40
+
+# PCA (with mean centering)
+pca = ClusterKit::PCA.new(n_components: 2)
+pca_result = pca.fit_transform(data)
+
+# SVD (without mean centering)
+u, s, vt = ClusterKit.svd(data, 2)
+svd_result = u
+
+puts "PCA result (first point): #{pca_result[0].map { |v| v.round(3) }}"
+puts "SVD result (first point): #{svd_result[0].map { |v| v.round(3) }}"
+puts "\nNote: PCA centers the data (subtracts mean), SVD does not."
+puts "This makes PCA better for finding principal components of variation."
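
The closing note above is the crux of the PCA/SVD comparison: PCA is SVD applied to mean-centered data, with component scores given by the columns of U scaled by the singular values. A minimal sketch of that equivalence, reusing the same ClusterKit.svd call as the example and assuming it accepts nested Ruby arrays as above (per-component sign flips are expected and harmless):

    # Center each column, then SVD; scores = U * S matches PCA up to sign.
    n = data.size.to_f
    means = data.first.each_index.map { |j| data.sum { |row| row[j] } / n }
    centered = data.map { |row| row.each_index.map { |j| row[j] - means[j] } }

    u, s, vt = ClusterKit.svd(centered, 2)
    scores = u.map { |row| row.each_index.map { |k| row[k] * s[k] } }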
data/examples/reproducible_umap.rb
@@ -0,0 +1,99 @@
+#!/usr/bin/env ruby
+# Example: Achieving reproducibility with UMAP despite random seed issues
+
+require_relative '../lib/clusterkit'
+require 'json'
+
+# Due to upstream limitations, UMAP doesn't give perfectly reproducible results
+# even with a fixed random_seed. Here are workarounds:
+
+# Generate sample data
+srand(42)
+data = []
+3.times do |cluster|
+  center = Array.new(50) { rand * 0.1 + cluster * 2.0 }
+  30.times do
+    point = center.map { |c| c + (rand - 0.5) * 0.3 }
+    data << point
+  end
+end
+
+puts "Workaround 1: Cache transformed results"
+puts "=" * 60
+
+# First run: transform and save results
+cache_file = "umap_results_cache.json"
+if File.exist?(cache_file)
+  puts "Loading cached results from #{cache_file}"
+  embedded = JSON.parse(File.read(cache_file))
+else
+  puts "No cache found, running UMAP..."
+  umap = ClusterKit::Dimensionality::UMAP.new(
+    n_components: 2,
+    n_neighbors: 5,
+    random_seed: 42 # Still use for *some* consistency
+  )
+  embedded = umap.fit_transform(data)
+
+  # Save results for reproducibility
+  File.write(cache_file, JSON.pretty_generate(embedded))
+  puts "Results cached to #{cache_file}"
+end
+
+puts "First 3 points:"
+embedded[0..2].each_with_index do |point, i|
+  puts " Point #{i}: [#{point[0].round(3)}, #{point[1].round(3)}]"
+end
+
+puts "\nWorkaround 2: Save and load fitted models"
+puts "=" * 60
+
+model_file = "umap_model.bin"
+
+# Train and save model once
+if File.exist?(model_file)
+  puts "Loading existing model from #{model_file}"
+  umap = ClusterKit::Dimensionality::UMAP.load(model_file)
+else
+  puts "Training new model..."
+  umap = ClusterKit::Dimensionality::UMAP.new(
+    n_components: 2,
+    n_neighbors: 5,
+    random_seed: 42
+  )
+  umap.fit(data)
+  umap.save(model_file)
+  puts "Model saved to #{model_file}"
+end
+
+# Now transform new data with the same model
+new_data = data[0..9] # Take first 10 points as "new" data
+transformed = umap.transform(new_data)
+puts "Transformed 10 new points with saved model"
+puts "First 3 transformed points:"
+transformed[0..2].each_with_index do |point, i|
+  puts " Point #{i}: [#{point[0].round(3)}, #{point[1].round(3)}]"
+end
+
+puts "\nWorkaround 3: Use PCA for deterministic reduction"
+puts "=" * 60
+
+# PCA is deterministic - same input always gives same output
+pca = ClusterKit::Dimensionality::PCA.new(n_components: 2)
+pca_result1 = pca.fit_transform(data)
+pca_result2 = pca.fit_transform(data) # Do it again
+
+puts "PCA results are identical: #{pca_result1[0] == pca_result2[0]}"
+puts "First point from run 1: [#{pca_result1[0][0].round(3)}, #{pca_result1[0][1].round(3)}]"
+puts "First point from run 2: [#{pca_result2[0][0].round(3)}, #{pca_result2[0][1].round(3)}]"
+
+puts "\nRecommendations:"
+puts "-" * 40
+puts "1. For production pipelines, cache UMAP results"
+puts "2. For model deployment, save fitted models and reuse them"
+puts "3. For testing/CI, use PCA or cached test data"
+puts "4. Accept small variations in UMAP results as normal"
+
+# Clean up example files (uncomment to remove)
+# File.delete(cache_file) if File.exist?(cache_file)
+# File.delete(model_file) if File.exist?(model_file)
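
One caveat with Workaround 1 as written: the cache is keyed only by a fixed file name, so changing the data or the UMAP parameters would silently reuse stale results. A hedged sketch of a content-addressed cache name (the digest scheme is illustrative, not part of the gem):

    require 'digest'

    # Key the cache by the data plus the UMAP parameters, so any change
    # produces a new file instead of reusing a stale one.
    params = { n_components: 2, n_neighbors: 5, random_seed: 42 }
    key = Digest::SHA256.hexdigest(JSON.generate([data, params]))[0, 16]
    cache_file = "umap_results_#{key}.json"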
data/examples/verbose_control.rb
@@ -0,0 +1,43 @@
+#!/usr/bin/env ruby
+# Example demonstrating how to control verbose output from clusterkit
+
+require 'bundler/setup'
+require 'clusterkit'
+
+# Generate some random test data
+data = Array.new(50) { Array.new(20) { rand } }
+
+puts "=" * 60
+puts "clusterkit Verbose Output Control Demo"
+puts "=" * 60
+
+puts "\n1. Default behavior (quiet mode):"
+puts "-" * 40
+umap1 = ClusterKit::UMAP.new(n_components: 2, n_neighbors: 10)
+result1 = umap1.fit_transform(data)
+puts "✓ UMAP completed silently"
+puts " Result shape: #{result1.length} x #{result1.first.length}"
+
+puts "\n2. Enable verbose output:"
+puts "-" * 40
+ClusterKit.configure do |config|
+  config.verbose = true
+end
+
+umap2 = ClusterKit::UMAP.new(n_components: 2, n_neighbors: 10)
+puts "Running UMAP with verbose output enabled..."
+result2 = umap2.fit_transform(data)
+puts "✓ UMAP completed with debug output"
+
+puts "\n3. Back to quiet mode:"
+puts "-" * 40
+ClusterKit.configuration.verbose = false
+
+umap3 = ClusterKit::UMAP.new(n_components: 2, n_neighbors: 10)
+result3 = umap3.fit_transform(data)
+puts "✓ UMAP completed silently again"
+
+puts "\n" + "=" * 60
+puts "You can also set verbose mode via environment variable:"
+puts " ANNEMBED_VERBOSE=true ruby your_script.rb"
+puts "=" * 60
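
The closing hint mentions ANNEMBED_VERBOSE. A script can honor that variable explicitly through the same configuration API used above; whether the gem also reads it internally is left to its configuration code, so treat this as a sketch:

    # Opt into verbose output when ANNEMBED_VERBOSE=true is set.
    ClusterKit.configure { |config| config.verbose = ENV['ANNEMBED_VERBOSE'] == 'true' }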
data/ext/clusterkit/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "clusterkit"
+version = "0.1.0"
+edition = "2021"
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+magnus = { version = "0.6", features = ["embed"] }
+annembed = { git = "https://github.com/cpetersen/annembed", tag = "clusterkit-0.1.0" }
+hnsw_rs = { git = "https://github.com/cpetersen/hnswlib-rs", tag = "clusterkit-0.1.0" }
+hdbscan = "0.11"
+ndarray = "0.16"
+num-traits = "0.2"
+rayon = "1.7"
+serde = { version = "1.0", features = ["derive"] }
+bincode = "1.3"
+rand = "0.8"
+
+[features]
+default = ["openblas-static"]
+openblas-static = ["annembed/openblas-static"]
+openblas-system = ["annembed/openblas-system"]
+intel-mkl-static = ["annembed/intel-mkl-static"]
data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs
@@ -0,0 +1,115 @@
+use magnus::{function, prelude::*, Error, Value, RArray, RHash, Integer, TryConvert};
+use hdbscan::{Hdbscan, HdbscanHyperParams};
+
+/// Perform HDBSCAN clustering
+/// Returns a hash with labels and basic statistics
+pub fn hdbscan_fit(
+    data: Value,
+    min_samples: usize,
+    min_cluster_size: usize,
+    metric: String,
+) -> Result<RHash, Error> {
+    // Convert Ruby array to ndarray
+    let rarray: RArray = TryConvert::try_convert(data)?;
+    let n_samples = rarray.len();
+
+    if n_samples == 0 {
+        return Err(Error::new(
+            magnus::exception::arg_error(),
+            "Data cannot be empty",
+        ));
+    }
+
+    // Get dimensions
+    let first_row: RArray = rarray.entry::<RArray>(0)?;
+    let n_features = first_row.len();
+
+    // Convert to Vec<Vec<f64>> format expected by hdbscan crate
+    let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
+    for i in 0..n_samples {
+        let row: RArray = rarray.entry(i as isize)?;
+        let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
+        for j in 0..n_features {
+            let val: f64 = row.entry(j as isize)?;
+            row_vec.push(val);
+        }
+        data_vec.push(row_vec);
+    }
+
+    // Note: hdbscan crate doesn't support custom metrics directly
+    // We'll use the default Euclidean distance for now
+    if metric != "euclidean" && metric != "l2" {
+        eprintln!("Warning: Current hdbscan version only supports Euclidean distance. Using Euclidean.");
+    }
+
+    // Adjust parameters to avoid index out of bounds errors
+    // The hdbscan crate has issues when min_samples >= n_samples
+    let adjusted_min_samples = min_samples.min(n_samples.saturating_sub(1)).max(1);
+    let adjusted_min_cluster_size = min_cluster_size.min(n_samples).max(2);
+
+    // Create hyperparameters
+    let hyper_params = HdbscanHyperParams::builder()
+        .min_cluster_size(adjusted_min_cluster_size)
+        .min_samples(adjusted_min_samples)
+        .build();
+
+    // Create HDBSCAN instance and run clustering
+    let clusterer = Hdbscan::new(&data_vec, hyper_params);
+
+    // Run the clustering algorithm - cluster() returns Result<Vec<i32>, HdbscanError>
+    let labels = clusterer.cluster().map_err(|e| {
+        Error::new(
+            magnus::exception::runtime_error(),
+            format!("HDBSCAN clustering failed: {:?}", e)
+        )
+    })?;
+
+    // Convert results to Ruby types
+    let ruby = magnus::Ruby::get().unwrap();
+    let result = RHash::new();
+
+    // Convert labels (i32 to Ruby Integer, -1 for noise)
+    let labels_array = RArray::new();
+    for &label in labels.iter() {
+        labels_array.push(Integer::from_value(
+            ruby.eval(&format!("{}", label)).unwrap()
+        ).unwrap())?;
+    }
+    result.aset("labels", labels_array)?;
+
+    // For now, we'll create dummy probabilities and outlier scores
+    // since the basic hdbscan crate doesn't provide these
+    // In the future, we could calculate these ourselves or use a more advanced implementation
+
+    // Create probabilities array (all 1.0 for clustered points, 0.0 for noise)
+    let probs_array = RArray::new();
+    for &label in labels.iter() {
+        let prob = if label == -1 { 0.0 } else { 1.0 };
+        probs_array.push(prob)?;
+    }
+    result.aset("probabilities", probs_array)?;
+
+    // Create outlier scores array (0.0 for clustered points, 1.0 for noise)
+    let outlier_array = RArray::new();
+    for &label in labels.iter() {
+        let score = if label == -1 { 1.0 } else { 0.0 };
+        outlier_array.push(score)?;
+    }
+    result.aset("outlier_scores", outlier_array)?;
+
+    // Create empty cluster persistence hash for now
+    let persistence_hash = RHash::new();
+    result.aset("cluster_persistence", persistence_hash)?;
+
+    Ok(result)
+}
+
+/// Initialize HDBSCAN module functions
+pub fn init(clustering_module: &magnus::RModule) -> Result<(), Error> {
+    clustering_module.define_singleton_method(
+        "hdbscan_rust",
+        function!(hdbscan_fit, 4),
+    )?;
+
+    Ok(())
+}
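
For orientation, this is roughly what the binding above hands back to Ruby when called directly, bypassing the high-level wrapper in lib/clusterkit/clustering/hdbscan.rb. A sketch that assumes the native module is mounted as ClusterKit::Clustering (see the init in clustering.rs below); the hash keys match the aset calls above:

    # Low-level call: (data, min_samples, min_cluster_size, metric).
    result = ClusterKit::Clustering.hdbscan_rust(data, 5, 10, 'euclidean')
    labels = result['labels']   # -1 marks noise points
    puts "clusters: #{(labels.uniq - [-1]).size}, noise: #{labels.count(-1)}"
    # 'probabilities' and 'outlier_scores' are currently 0/1 placeholders.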
data/ext/clusterkit/src/clustering.rs
@@ -0,0 +1,267 @@
+use magnus::{function, prelude::*, Error, Value, RArray, Integer, TryConvert};
+use ndarray::{Array1, Array2, ArrayView1, Axis};
+use rand::prelude::*;
+
+mod hdbscan_wrapper;
+
+pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
+    let clustering_module = parent.define_module("Clustering")?;
+
+    clustering_module.define_singleton_method(
+        "kmeans_rust",
+        function!(kmeans, 3),
+    )?;
+
+    clustering_module.define_singleton_method(
+        "kmeans_predict_rust",
+        function!(kmeans_predict, 2),
+    )?;
+
+    // Initialize HDBSCAN functions
+    hdbscan_wrapper::init(&clustering_module)?;
+
+    Ok(())
+}
+
+/// Perform K-means clustering
+/// Returns (labels, centroids, inertia)
+fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64), Error> {
+    // Convert Ruby array to ndarray
+    let rarray: RArray = TryConvert::try_convert(data)?;
+    let n_samples = rarray.len();
+
+    if n_samples == 0 {
+        return Err(Error::new(
+            magnus::exception::arg_error(),
+            "Data cannot be empty",
+        ));
+    }
+
+    // Get dimensions
+    let first_row: RArray = rarray.entry::<RArray>(0)?;
+    let n_features = first_row.len();
+
+    if k > n_samples {
+        return Err(Error::new(
+            magnus::exception::arg_error(),
+            format!("k ({}) cannot be larger than number of samples ({})", k, n_samples),
+        ));
+    }
+
+    // Convert to ndarray
+    let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
+    for i in 0..n_samples {
+        let row: RArray = rarray.entry(i as isize)?;
+        for j in 0..n_features {
+            let val: f64 = row.entry(j as isize)?;
+            data_array[[i, j]] = val;
+        }
+    }
+
+    // Initialize centroids using K-means++
+    let mut centroids = kmeans_plusplus(&data_array, k)?;
+    let mut labels = vec![0usize; n_samples];
+    let mut prev_labels = vec![0usize; n_samples];
+
+    // K-means iterations
+    for iteration in 0..max_iter {
+        // Assign points to nearest centroid
+        let mut changed = false;
+        for i in 0..n_samples {
+            let point = data_array.row(i);
+            let mut min_dist = f64::INFINITY;
+            let mut best_cluster = 0;
+
+            for (j, centroid) in centroids.axis_iter(Axis(0)).enumerate() {
+                let dist = euclidean_distance(&point, &centroid);
+                if dist < min_dist {
+                    min_dist = dist;
+                    best_cluster = j;
+                }
+            }
+
+            if labels[i] != best_cluster {
+                changed = true;
+            }
+            labels[i] = best_cluster;
+        }
+
+        // Check for convergence
+        if !changed && iteration > 0 {
+            break;
+        }
+
+        // Update centroids
+        for j in 0..k {
+            let mut sum = Array1::<f64>::zeros(n_features);
+            let mut count = 0;
+
+            for i in 0..n_samples {
+                if labels[i] == j {
+                    sum += &data_array.row(i);
+                    count += 1;
+                }
+            }
+
+            if count > 0 {
+                centroids.row_mut(j).assign(&(sum / count as f64));
+            }
+        }
+
+        prev_labels.clone_from(&labels);
+    }
+
+    // Calculate inertia (sum of squared distances to nearest centroid)
+    let mut inertia = 0.0;
+    for i in 0..n_samples {
+        let point = data_array.row(i);
+        let centroid = centroids.row(labels[i]);
+        inertia += euclidean_distance(&point, &centroid).powi(2);
+    }
+
+    // Convert results to Ruby arrays
+    let ruby = magnus::Ruby::get().unwrap();
+    let labels_array = RArray::new();
+    for label in labels {
+        labels_array.push(Integer::from_value(ruby.eval(&format!("{}", label)).unwrap()).unwrap())?;
+    }
+
+    let centroids_array = RArray::new();
+    for i in 0..k {
+        let row_array = RArray::new();
+        for j in 0..n_features {
+            row_array.push(centroids[[i, j]])?;
+        }
+        centroids_array.push(row_array)?;
+    }
+
+    Ok((labels_array, centroids_array, inertia))
+}
+
+/// Predict cluster labels for new data given centroids
+fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
+    // Convert inputs
+    let data_array: RArray = TryConvert::try_convert(data)?;
+    let centroids_array: RArray = TryConvert::try_convert(centroids)?;
+
+    let n_samples = data_array.len();
+    let k = centroids_array.len();
+
+    if n_samples == 0 {
+        return Err(Error::new(
+            magnus::exception::arg_error(),
+            "Data cannot be empty",
+        ));
+    }
+
+    // Get dimensions
+    let first_row: RArray = data_array.entry::<RArray>(0)?;
+    let n_features = first_row.len();
+
+    // Convert data to ndarray
+    let mut data_matrix = Array2::<f64>::zeros((n_samples, n_features));
+    for i in 0..n_samples {
+        let row: RArray = data_array.entry(i as isize)?;
+        for j in 0..n_features {
+            let val: f64 = row.entry(j as isize)?;
+            data_matrix[[i, j]] = val;
+        }
+    }
+
+    // Convert centroids to ndarray
+    let mut centroids_matrix = Array2::<f64>::zeros((k, n_features));
+    for i in 0..k {
+        let row: RArray = centroids_array.entry(i as isize)?;
+        for j in 0..n_features {
+            let val: f64 = row.entry(j as isize)?;
+            centroids_matrix[[i, j]] = val;
+        }
+    }
+
+    // Predict labels
+    let ruby = magnus::Ruby::get().unwrap();
+    let labels_array = RArray::new();
+
+    for i in 0..n_samples {
+        let point = data_matrix.row(i);
+        let mut min_dist = f64::INFINITY;
+        let mut best_cluster = 0;
+
+        for (j, centroid) in centroids_matrix.axis_iter(Axis(0)).enumerate() {
+            let dist = euclidean_distance(&point, &centroid);
+            if dist < min_dist {
+                min_dist = dist;
+                best_cluster = j;
+            }
+        }
+
+        labels_array.push(Integer::from_value(ruby.eval(&format!("{}", best_cluster)).unwrap()).unwrap())?;
+    }
+
+    Ok(labels_array)
+}
+
+/// K-means++ initialization
+fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
+    let n_samples = data.nrows();
+    let n_features = data.ncols();
+    let mut rng = thread_rng();
+
+    let mut centroids = Array2::<f64>::zeros((k, n_features));
+
+    // Choose first centroid randomly
+    let first_idx = rng.gen_range(0..n_samples);
+    centroids.row_mut(0).assign(&data.row(first_idx));
+
+    // Choose remaining centroids
+    for i in 1..k {
+        let mut distances = vec![f64::INFINITY; n_samples];
+
+        // Calculate distance to nearest centroid for each point
+        for j in 0..n_samples {
+            for c in 0..i {
+                let dist = euclidean_distance(&data.row(j), &centroids.row(c));
+                if dist < distances[j] {
+                    distances[j] = dist;
+                }
+            }
+        }
+
+        // Convert distances to probabilities
+        let total: f64 = distances.iter().map(|d| d * d).sum();
+        if total == 0.0 {
+            // All points are identical or we've selected duplicates
+            // Just use sequential points as centroids
+            if i < n_samples {
+                centroids.row_mut(i).assign(&data.row(i));
+            } else {
+                // Reuse first point if we run out
+                centroids.row_mut(i).assign(&data.row(0));
+            }
+            continue;
+        }
+
+        // Choose next centroid with probability proportional to squared distance
+        let mut cumsum = 0.0;
+        let rand_val: f64 = rng.gen::<f64>() * total;
+
+        for j in 0..n_samples {
+            cumsum += distances[j] * distances[j];
+            if cumsum >= rand_val {
+                centroids.row_mut(i).assign(&data.row(j));
+                break;
+            }
+        }
+    }
+
+    Ok(centroids)
+}
+
+/// Calculate Euclidean distance between two points
+fn euclidean_distance(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
+    a.iter()
+        .zip(b.iter())
+        .map(|(x, y)| (x - y).powi(2))
+        .sum::<f64>()
+        .sqrt()
+}
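
Tying the pieces together from the Ruby side: kmeans_rust runs K-means++ seeding followed by Lloyd iterations and returns labels, centroids, and inertia, while kmeans_predict_rust reuses those centroids for nearest-centroid assignment of new rows. A sketch, again assuming the module is mounted as ClusterKit::Clustering, with new_points standing in for any hypothetical array of rows of matching width:

    # Fit: the Rust tuple arrives as [labels, centroids, inertia].
    labels, centroids, inertia = ClusterKit::Clustering.kmeans_rust(data, 3, 100)
    puts "inertia: #{inertia.round(3)}"

    # Predict: assign unseen rows to the nearest fitted centroid.
    new_labels = ClusterKit::Clustering.kmeans_predict_rust(new_points, centroids)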