clusterkit 0.1.1 → 0.2.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/clusterkit.gemspec +45 -0
- data/docs/KNOWN_ISSUES.md +5 -5
- data/docs/RUST_ERROR_HANDLING.md +6 -6
- data/ext/clusterkit/Cargo.toml +4 -5
- data/ext/clusterkit/extconf.rb +1 -9
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +62 -27
- data/ext/clusterkit/src/clustering.rs +114 -68
- data/ext/clusterkit/src/embedder.rs +131 -48
- data/ext/clusterkit/src/hnsw.rs +215 -181
- data/ext/clusterkit/src/lib.rs +5 -5
- data/ext/clusterkit/src/svd.rs +58 -35
- data/ext/clusterkit/src/utils.rs +9 -159
- data/lib/clusterkit/clustering/hdbscan.rb +17 -4
- data/lib/clusterkit/clustering.rb +23 -4
- data/lib/clusterkit/dimensionality/pca.rb +12 -12
- data/lib/clusterkit/dimensionality/svd.rb +16 -47
- data/lib/clusterkit/dimensionality/umap.rb +40 -7
- data/lib/clusterkit/version.rb +1 -1
- data/lib/clusterkit.rb +1 -1
- metadata +20 -35
- data/Cargo.lock +0 -3236
- data/lib/clusterkit/data_validator.rb +0 -132
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 868c555f318a974371aeae9b892641e4e69ea3bb7b38b8e7b30c16e24bba37e5
|
|
4
|
+
data.tar.gz: e9fc6e35b6065d074e5f9e2b074298b83686aa95fcb78f2cf7f98373015414db
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e25abeea0f43f2f9b1cd2171a92408d458f95c09aca86d10df3cd01602b89df4725ce2fdd99b8d0cefe3c5b21659edd2f9e7acc60cd2f49cc7aa1ee7ab815911
|
|
7
|
+
data.tar.gz: b6dd04908293679a1c16919a1416f09ce3cd6099903724ee1a98be0f1fd6c8200609ec9f792c733442847ad92a2ab4e4a3da47380ee9e5a3517d855d5a02286a
|
data/README.md
CHANGED
|
@@ -720,7 +720,7 @@ COVERAGE=true bundle exec rspec
|
|
|
720
720
|
|
|
721
721
|
## Contributing
|
|
722
722
|
|
|
723
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
|
723
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/cpetersen/clusterkit.
|
|
724
724
|
|
|
725
725
|
## License
|
|
726
726
|
|
|
@@ -735,7 +735,7 @@ If you use ClusterKit in your research, please cite:
|
|
|
735
735
|
author = {Chris Petersen},
|
|
736
736
|
title = {ClusterKit: High-Performance Clustering and Dimensionality Reduction for Ruby},
|
|
737
737
|
year = {2024},
|
|
738
|
-
url = {https://github.com/
|
|
738
|
+
url = {https://github.com/cpetersen/clusterkit}
|
|
739
739
|
}
|
|
740
740
|
```
|
|
741
741
|
|
data/clusterkit.gemspec
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
require_relative "lib/clusterkit/version"
|
|
2
|
+
|
|
3
|
+
Gem::Specification.new do |spec|
|
|
4
|
+
spec.name = "clusterkit"
|
|
5
|
+
spec.version = ClusterKit::VERSION
|
|
6
|
+
spec.authors = ["Chris Petersen"]
|
|
7
|
+
spec.email = ["chris@petersen.io"]
|
|
8
|
+
|
|
9
|
+
spec.summary = "High-performance clustering and dimensionality reduction for Ruby"
|
|
10
|
+
spec.description = "A comprehensive clustering toolkit for Ruby, providing UMAP, PCA, K-means, HDBSCAN and more. Built on top of annembed and hdbscan Rust crates for blazing-fast performance."
|
|
11
|
+
spec.homepage = "https://github.com/cpetersen/clusterkit"
|
|
12
|
+
spec.license = "MIT"
|
|
13
|
+
spec.required_ruby_version = ">= 2.7.0"
|
|
14
|
+
|
|
15
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
|
16
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
|
17
|
+
spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
|
|
18
|
+
|
|
19
|
+
# Specify which files should be added to the gem when it is released.
|
|
20
|
+
spec.files = Dir.chdir(__dir__) do
|
|
21
|
+
`git ls-files -z`.split("\x0").reject do |f|
|
|
22
|
+
(f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
|
|
23
|
+
end + Dir["ext/**/*.rs", "ext/**/*.toml"]
|
|
24
|
+
end
|
|
25
|
+
spec.bindir = "exe"
|
|
26
|
+
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
27
|
+
spec.require_paths = ["lib"]
|
|
28
|
+
spec.extensions = ["ext/clusterkit/extconf.rb"]
|
|
29
|
+
|
|
30
|
+
# Runtime dependencies
|
|
31
|
+
# Numo is optional but recommended for better performance
|
|
32
|
+
# spec.add_dependency "numo-narray", "~> 0.9"
|
|
33
|
+
|
|
34
|
+
# Development dependencies
|
|
35
|
+
spec.add_development_dependency "csv"
|
|
36
|
+
spec.add_development_dependency "rake", "~> 13.0"
|
|
37
|
+
spec.add_development_dependency "rake-compiler", "~> 1.2"
|
|
38
|
+
spec.add_development_dependency "rb_sys", "~> 0.9"
|
|
39
|
+
spec.add_development_dependency "rspec", "~> 3.0"
|
|
40
|
+
spec.add_development_dependency "simplecov", "~> 0.22"
|
|
41
|
+
spec.add_development_dependency "yard", "~> 0.9"
|
|
42
|
+
|
|
43
|
+
# For more information and examples about making a new gem, check out our
|
|
44
|
+
# guide at: https://bundler.io/guides/creating_gem.html
|
|
45
|
+
end
|
data/docs/KNOWN_ISSUES.md
CHANGED
|
@@ -14,7 +14,7 @@ This gem has three main categories of limitations:
|
|
|
14
14
|
|
|
15
15
|
**Reason**: UMAP needs sufficient data to construct a meaningful manifold approximation. With fewer than 10 points, the algorithm cannot create a reliable graph structure.
|
|
16
16
|
|
|
17
|
-
**Workaround**:
|
|
17
|
+
**Workaround**:
|
|
18
18
|
- Use PCA for datasets with fewer than 10 points
|
|
19
19
|
- The `transform` method can handle smaller datasets once the model is fitted on adequate training data
|
|
20
20
|
|
|
@@ -30,12 +30,12 @@ This gem has three main categories of limitations:
|
|
|
30
30
|
|
|
31
31
|
**Previous Issue**: The box_size assertion would panic and crash the Ruby process.
|
|
32
32
|
|
|
33
|
-
**Current Status**: **FIXED** in `
|
|
33
|
+
**Current Status**: **FIXED** in `cpetersen/annembed:fix-box-size-panic` branch
|
|
34
34
|
- The `"assertion failed: (*f).abs() <= box_size"` panic has been converted to a catchable error
|
|
35
35
|
- Extreme value ranges are now handled gracefully through normalization
|
|
36
36
|
- NaN/Infinite values are detected and reported with clear error messages
|
|
37
37
|
|
|
38
|
-
**Remaining Uncatchable Errors**:
|
|
38
|
+
**Remaining Uncatchable Errors**:
|
|
39
39
|
- Array bounds violations (accessing out-of-bounds indices)
|
|
40
40
|
- Some `.unwrap()` calls on `None` or `Err` values
|
|
41
41
|
- These are much less common in normal usage
|
|
@@ -98,7 +98,7 @@ def safe_umap_transform(data, options = {})
|
|
|
98
98
|
# Save data to temporary file before processing
|
|
99
99
|
temp_file = "temp_umap_data_#{Time.now.to_i}.json"
|
|
100
100
|
File.write(temp_file, JSON.dump(data))
|
|
101
|
-
|
|
101
|
+
|
|
102
102
|
begin
|
|
103
103
|
umap = ClusterKit::Dimensionality::UMAP.new(**options)
|
|
104
104
|
result = umap.fit_transform(data)
|
|
@@ -127,4 +127,4 @@ def reduce_dimensions(data, n_components: 2)
|
|
|
127
127
|
pca.fit_transform(data)
|
|
128
128
|
end
|
|
129
129
|
end
|
|
130
|
-
```
|
|
130
|
+
```
|
data/docs/RUST_ERROR_HANDLING.md
CHANGED
|
@@ -37,11 +37,11 @@ These use Rust's `assert!` or `panic!` macros and CANNOT be caught. They will cr
|
|
|
37
37
|
|
|
38
38
|
| Error | Source | Location | Trigger Condition |
|
|
39
39
|
|-------|--------|----------|-------------------|
|
|
40
|
-
| ~~Box size assertion~~ | ~~annembed~~ | ~~`set_data_box`~~ | **FIXED in
|
|
40
|
+
| ~~Box size assertion~~ | ~~annembed~~ | ~~`set_data_box`~~ | **FIXED in cpetersen/annembed:fix-box-size-panic** |
|
|
41
41
|
| Array bounds | Various | Index operations | Accessing out-of-bounds indices |
|
|
42
42
|
| Unwrap failures | Various | `.unwrap()` calls | Unwrapping `None` or `Err` |
|
|
43
43
|
|
|
44
|
-
**Update (2025-08-19):** The box size assertion has been fixed in the `fix-box-size-panic` branch of
|
|
44
|
+
**Update (2025-08-19):** The box size assertion has been fixed in the `fix-box-size-panic` branch of cpetersen/annembed. It now returns a proper `Result<(), anyhow::Error>` that can be caught and handled gracefully:
|
|
45
45
|
|
|
46
46
|
```rust
|
|
47
47
|
// Previously (would panic):
|
|
@@ -96,13 +96,13 @@ when /isolated point/i
|
|
|
96
96
|
|
|
97
97
|
**Previous Issue:** Would panic and crash the Ruby process
|
|
98
98
|
|
|
99
|
-
**Current Status:** Fixed in `
|
|
100
|
-
- Now returns a catchable `anyhow::Error`
|
|
99
|
+
**Current Status:** Fixed in `cpetersen/annembed:fix-box-size-panic` branch
|
|
100
|
+
- Now returns a catchable `anyhow::Error`
|
|
101
101
|
- Detects NaN/Infinite values during normalization
|
|
102
102
|
- Handles constant data (max_max = 0) gracefully
|
|
103
103
|
- Extreme value ranges are normalized successfully
|
|
104
104
|
|
|
105
|
-
**User-visible behavior:**
|
|
105
|
+
**User-visible behavior:**
|
|
106
106
|
- Previously: Ruby process would crash with assertion failure
|
|
107
107
|
- Now: Raises a catchable Ruby exception with helpful error message
|
|
108
108
|
|
|
@@ -161,4 +161,4 @@ when /isolated point/i
|
|
|
161
161
|
|
|
162
162
|
The test suite mocks Rust errors to verify our error handling logic works correctly. However, actual panic conditions cannot be tested without crashing the test process.
|
|
163
163
|
|
|
164
|
-
See `spec/clusterkit/error_handling_spec.rb` for error handling tests.
|
|
164
|
+
See `spec/clusterkit/error_handling_spec.rb` for error handling tests.
|
data/ext/clusterkit/Cargo.toml
CHANGED
|
@@ -7,9 +7,9 @@ edition = "2021"
|
|
|
7
7
|
crate-type = ["cdylib"]
|
|
8
8
|
|
|
9
9
|
[dependencies]
|
|
10
|
-
magnus = { version = "0.
|
|
11
|
-
annembed = { git = "https://github.com/
|
|
12
|
-
hnsw_rs = { git = "https://github.com/
|
|
10
|
+
magnus = { version = "0.6", features = ["embed"] }
|
|
11
|
+
annembed = { git = "https://github.com/cpetersen/annembed", tag = "clusterkit-0.1.0" }
|
|
12
|
+
hnsw_rs = { git = "https://github.com/cpetersen/hnswlib-rs", tag = "clusterkit-0.1.0" }
|
|
13
13
|
hdbscan = "0.11"
|
|
14
14
|
ndarray = "0.16"
|
|
15
15
|
num-traits = "0.2"
|
|
@@ -22,5 +22,4 @@ rand = "0.8"
|
|
|
22
22
|
default = ["openblas-static"]
|
|
23
23
|
openblas-static = ["annembed/openblas-static"]
|
|
24
24
|
openblas-system = ["annembed/openblas-system"]
|
|
25
|
-
intel-mkl-static = ["annembed/intel-mkl-static"]
|
|
26
|
-
macos-accelerate = ["annembed/macos-accelerate"]
|
|
25
|
+
intel-mkl-static = ["annembed/intel-mkl-static"]
|
data/ext/clusterkit/extconf.rb
CHANGED
|
@@ -1,12 +1,4 @@
|
|
|
1
1
|
require "mkmf"
|
|
2
2
|
require "rb_sys/mkmf"
|
|
3
3
|
|
|
4
|
-
create_rust_makefile("clusterkit/clusterkit")
|
|
5
|
-
if ENV["CLUSTERKIT_FEATURES"]
|
|
6
|
-
r.extra_cargo_args += ["--no-default-features"]
|
|
7
|
-
r.features = ENV["CLUSTERKIT_FEATURES"].split(",")
|
|
8
|
-
elsif RUBY_PLATFORM =~ /darwin/
|
|
9
|
-
r.extra_cargo_args += ["--no-default-features"]
|
|
10
|
-
r.features = ["macos-accelerate"]
|
|
11
|
-
end
|
|
12
|
-
end
|
|
4
|
+
create_rust_makefile("clusterkit/clusterkit")
|
data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
use magnus::{function, prelude::*, Error, Value, RHash,
|
|
1
|
+
use magnus::{function, prelude::*, Error, Value, RArray, RHash, Integer, TryConvert};
|
|
2
2
|
use hdbscan::{Hdbscan, HdbscanHyperParams};
|
|
3
|
-
use crate::utils::ruby_array_to_vec_vec_f64;
|
|
4
3
|
|
|
5
4
|
/// Perform HDBSCAN clustering
|
|
6
5
|
/// Returns a hash with labels and basic statistics
|
|
@@ -10,62 +9,98 @@ pub fn hdbscan_fit(
|
|
|
10
9
|
min_cluster_size: usize,
|
|
11
10
|
metric: String,
|
|
12
11
|
) -> Result<RHash, Error> {
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
12
|
+
// Convert Ruby array to ndarray
|
|
13
|
+
let rarray: RArray = TryConvert::try_convert(data)?;
|
|
14
|
+
let n_samples = rarray.len();
|
|
15
|
+
|
|
16
|
+
if n_samples == 0 {
|
|
17
|
+
return Err(Error::new(
|
|
18
|
+
magnus::exception::arg_error(),
|
|
19
|
+
"Data cannot be empty",
|
|
20
|
+
));
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
// Get dimensions
|
|
24
|
+
let first_row: RArray = rarray.entry::<RArray>(0)?;
|
|
25
|
+
let n_features = first_row.len();
|
|
26
|
+
|
|
27
|
+
// Convert to Vec<Vec<f64>> format expected by hdbscan crate
|
|
28
|
+
let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
|
|
29
|
+
for i in 0..n_samples {
|
|
30
|
+
let row: RArray = rarray.entry(i as isize)?;
|
|
31
|
+
let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
|
|
32
|
+
for j in 0..n_features {
|
|
33
|
+
let val: f64 = row.entry(j as isize)?;
|
|
34
|
+
row_vec.push(val);
|
|
35
|
+
}
|
|
36
|
+
data_vec.push(row_vec);
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// Note: hdbscan crate doesn't support custom metrics directly
|
|
40
|
+
// We'll use the default Euclidean distance for now
|
|
19
41
|
if metric != "euclidean" && metric != "l2" {
|
|
20
42
|
eprintln!("Warning: Current hdbscan version only supports Euclidean distance. Using Euclidean.");
|
|
21
43
|
}
|
|
22
|
-
|
|
44
|
+
|
|
23
45
|
// Adjust parameters to avoid index out of bounds errors
|
|
46
|
+
// The hdbscan crate has issues when min_samples >= n_samples
|
|
24
47
|
let adjusted_min_samples = min_samples.min(n_samples.saturating_sub(1)).max(1);
|
|
25
48
|
let adjusted_min_cluster_size = min_cluster_size.min(n_samples).max(2);
|
|
26
|
-
|
|
49
|
+
|
|
27
50
|
// Create hyperparameters
|
|
28
51
|
let hyper_params = HdbscanHyperParams::builder()
|
|
29
52
|
.min_cluster_size(adjusted_min_cluster_size)
|
|
30
53
|
.min_samples(adjusted_min_samples)
|
|
31
54
|
.build();
|
|
32
|
-
|
|
55
|
+
|
|
33
56
|
// Create HDBSCAN instance and run clustering
|
|
34
57
|
let clusterer = Hdbscan::new(&data_vec, hyper_params);
|
|
35
|
-
|
|
58
|
+
|
|
59
|
+
// Run the clustering algorithm - cluster() returns Result<Vec<i32>, HdbscanError>
|
|
36
60
|
let labels = clusterer.cluster().map_err(|e| {
|
|
37
61
|
Error::new(
|
|
38
|
-
|
|
62
|
+
magnus::exception::runtime_error(),
|
|
39
63
|
format!("HDBSCAN clustering failed: {:?}", e)
|
|
40
64
|
)
|
|
41
65
|
})?;
|
|
42
|
-
|
|
66
|
+
|
|
43
67
|
// Convert results to Ruby types
|
|
44
|
-
let
|
|
45
|
-
|
|
46
|
-
|
|
68
|
+
let ruby = magnus::Ruby::get().unwrap();
|
|
69
|
+
let result = RHash::new();
|
|
70
|
+
|
|
71
|
+
// Convert labels (i32 to Ruby Integer, -1 for noise)
|
|
72
|
+
let labels_array = RArray::new();
|
|
47
73
|
for &label in labels.iter() {
|
|
48
|
-
labels_array.push(
|
|
74
|
+
labels_array.push(Integer::from_value(
|
|
75
|
+
ruby.eval(&format!("{}", label)).unwrap()
|
|
76
|
+
).unwrap())?;
|
|
49
77
|
}
|
|
50
78
|
result.aset("labels", labels_array)?;
|
|
51
|
-
|
|
52
|
-
|
|
79
|
+
|
|
80
|
+
// For now, we'll create dummy probabilities and outlier scores
|
|
81
|
+
// since the basic hdbscan crate doesn't provide these
|
|
82
|
+
// In the future, we could calculate these ourselves or use a more advanced implementation
|
|
83
|
+
|
|
84
|
+
// Create probabilities array (all 1.0 for clustered points, 0.0 for noise)
|
|
85
|
+
let probs_array = RArray::new();
|
|
53
86
|
for &label in labels.iter() {
|
|
54
87
|
let prob = if label == -1 { 0.0 } else { 1.0 };
|
|
55
88
|
probs_array.push(prob)?;
|
|
56
89
|
}
|
|
57
90
|
result.aset("probabilities", probs_array)?;
|
|
58
|
-
|
|
59
|
-
|
|
91
|
+
|
|
92
|
+
// Create outlier scores array (0.0 for clustered points, 1.0 for noise)
|
|
93
|
+
let outlier_array = RArray::new();
|
|
60
94
|
for &label in labels.iter() {
|
|
61
95
|
let score = if label == -1 { 1.0 } else { 0.0 };
|
|
62
96
|
outlier_array.push(score)?;
|
|
63
97
|
}
|
|
64
98
|
result.aset("outlier_scores", outlier_array)?;
|
|
65
|
-
|
|
66
|
-
|
|
99
|
+
|
|
100
|
+
// Create empty cluster persistence hash for now
|
|
101
|
+
let persistence_hash = RHash::new();
|
|
67
102
|
result.aset("cluster_persistence", persistence_hash)?;
|
|
68
|
-
|
|
103
|
+
|
|
69
104
|
Ok(result)
|
|
70
105
|
}
|
|
71
106
|
|
|
@@ -75,6 +110,6 @@ pub fn init(clustering_module: &magnus::RModule) -> Result<(), Error> {
|
|
|
75
110
|
"hdbscan_rust",
|
|
76
111
|
function!(hdbscan_fit, 4),
|
|
77
112
|
)?;
|
|
78
|
-
|
|
113
|
+
|
|
79
114
|
Ok(())
|
|
80
|
-
}
|
|
115
|
+
}
|
data/ext/clusterkit/src/clustering.rs
CHANGED
|
@@ -1,52 +1,68 @@
|
|
|
1
|
-
use magnus::{function, prelude::*, Error, Value, RArray,
|
|
1
|
+
use magnus::{function, prelude::*, Error, Value, RArray, Integer, TryConvert};
|
|
2
2
|
use ndarray::{Array1, Array2, ArrayView1, Axis};
|
|
3
3
|
use rand::prelude::*;
|
|
4
|
-
use rand::rngs::StdRng;
|
|
5
|
-
use rand::SeedableRng;
|
|
6
|
-
use crate::utils::ruby_array_to_ndarray2;
|
|
7
4
|
|
|
8
5
|
mod hdbscan_wrapper;
|
|
9
6
|
|
|
10
7
|
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
11
8
|
let clustering_module = parent.define_module("Clustering")?;
|
|
12
|
-
|
|
9
|
+
|
|
13
10
|
clustering_module.define_singleton_method(
|
|
14
11
|
"kmeans_rust",
|
|
15
|
-
function!(kmeans,
|
|
12
|
+
function!(kmeans, 3),
|
|
16
13
|
)?;
|
|
17
|
-
|
|
14
|
+
|
|
18
15
|
clustering_module.define_singleton_method(
|
|
19
16
|
"kmeans_predict_rust",
|
|
20
17
|
function!(kmeans_predict, 2),
|
|
21
18
|
)?;
|
|
22
|
-
|
|
19
|
+
|
|
23
20
|
// Initialize HDBSCAN functions
|
|
24
21
|
hdbscan_wrapper::init(&clustering_module)?;
|
|
25
|
-
|
|
22
|
+
|
|
26
23
|
Ok(())
|
|
27
24
|
}
|
|
28
25
|
|
|
29
26
|
/// Perform K-means clustering
|
|
30
27
|
/// Returns (labels, centroids, inertia)
|
|
31
|
-
fn kmeans(data: Value, k: usize, max_iter: usize
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
28
|
+
fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64), Error> {
|
|
29
|
+
// Convert Ruby array to ndarray
|
|
30
|
+
let rarray: RArray = TryConvert::try_convert(data)?;
|
|
31
|
+
let n_samples = rarray.len();
|
|
32
|
+
|
|
33
|
+
if n_samples == 0 {
|
|
34
|
+
return Err(Error::new(
|
|
35
|
+
magnus::exception::arg_error(),
|
|
36
|
+
"Data cannot be empty",
|
|
37
|
+
));
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// Get dimensions
|
|
41
|
+
let first_row: RArray = rarray.entry::<RArray>(0)?;
|
|
42
|
+
let n_features = first_row.len();
|
|
43
|
+
|
|
38
44
|
if k > n_samples {
|
|
39
45
|
return Err(Error::new(
|
|
40
|
-
|
|
46
|
+
magnus::exception::arg_error(),
|
|
41
47
|
format!("k ({}) cannot be larger than number of samples ({})", k, n_samples),
|
|
42
48
|
));
|
|
43
49
|
}
|
|
44
|
-
|
|
50
|
+
|
|
51
|
+
// Convert to ndarray
|
|
52
|
+
let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
|
|
53
|
+
for i in 0..n_samples {
|
|
54
|
+
let row: RArray = rarray.entry(i as isize)?;
|
|
55
|
+
for j in 0..n_features {
|
|
56
|
+
let val: f64 = row.entry(j as isize)?;
|
|
57
|
+
data_array[[i, j]] = val;
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
45
61
|
// Initialize centroids using K-means++
|
|
46
|
-
let mut centroids = kmeans_plusplus(&data_array, k
|
|
62
|
+
let mut centroids = kmeans_plusplus(&data_array, k)?;
|
|
47
63
|
let mut labels = vec![0usize; n_samples];
|
|
48
64
|
let mut prev_labels = vec![0usize; n_samples];
|
|
49
|
-
|
|
65
|
+
|
|
50
66
|
// K-means iterations
|
|
51
67
|
for iteration in 0..max_iter {
|
|
52
68
|
// Assign points to nearest centroid
|
|
@@ -55,7 +71,7 @@ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> R
|
|
|
55
71
|
let point = data_array.row(i);
|
|
56
72
|
let mut min_dist = f64::INFINITY;
|
|
57
73
|
let mut best_cluster = 0;
|
|
58
|
-
|
|
74
|
+
|
|
59
75
|
for (j, centroid) in centroids.axis_iter(Axis(0)).enumerate() {
|
|
60
76
|
let dist = euclidean_distance(&point, &centroid);
|
|
61
77
|
if dist < min_dist {
|
|
@@ -63,38 +79,38 @@ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> R
|
|
|
63
79
|
best_cluster = j;
|
|
64
80
|
}
|
|
65
81
|
}
|
|
66
|
-
|
|
82
|
+
|
|
67
83
|
if labels[i] != best_cluster {
|
|
68
84
|
changed = true;
|
|
69
85
|
}
|
|
70
86
|
labels[i] = best_cluster;
|
|
71
87
|
}
|
|
72
|
-
|
|
88
|
+
|
|
73
89
|
// Check for convergence
|
|
74
90
|
if !changed && iteration > 0 {
|
|
75
91
|
break;
|
|
76
92
|
}
|
|
77
|
-
|
|
93
|
+
|
|
78
94
|
// Update centroids
|
|
79
95
|
for j in 0..k {
|
|
80
96
|
let mut sum = Array1::<f64>::zeros(n_features);
|
|
81
97
|
let mut count = 0;
|
|
82
|
-
|
|
98
|
+
|
|
83
99
|
for i in 0..n_samples {
|
|
84
100
|
if labels[i] == j {
|
|
85
101
|
sum += &data_array.row(i);
|
|
86
102
|
count += 1;
|
|
87
103
|
}
|
|
88
104
|
}
|
|
89
|
-
|
|
105
|
+
|
|
90
106
|
if count > 0 {
|
|
91
107
|
centroids.row_mut(j).assign(&(sum / count as f64));
|
|
92
108
|
}
|
|
93
109
|
}
|
|
94
|
-
|
|
110
|
+
|
|
95
111
|
prev_labels.clone_from(&labels);
|
|
96
112
|
}
|
|
97
|
-
|
|
113
|
+
|
|
98
114
|
// Calculate inertia (sum of squared distances to nearest centroid)
|
|
99
115
|
let mut inertia = 0.0;
|
|
100
116
|
for i in 0..n_samples {
|
|
@@ -102,43 +118,75 @@ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> R
|
|
|
102
118
|
let centroid = centroids.row(labels[i]);
|
|
103
119
|
inertia += euclidean_distance(&point, &centroid).powi(2);
|
|
104
120
|
}
|
|
105
|
-
|
|
121
|
+
|
|
106
122
|
// Convert results to Ruby arrays
|
|
107
|
-
let
|
|
123
|
+
let ruby = magnus::Ruby::get().unwrap();
|
|
124
|
+
let labels_array = RArray::new();
|
|
108
125
|
for label in labels {
|
|
109
|
-
labels_array.push(ruby.
|
|
126
|
+
labels_array.push(Integer::from_value(ruby.eval(&format!("{}", label)).unwrap()).unwrap())?;
|
|
110
127
|
}
|
|
111
|
-
|
|
112
|
-
let centroids_array =
|
|
128
|
+
|
|
129
|
+
let centroids_array = RArray::new();
|
|
113
130
|
for i in 0..k {
|
|
114
|
-
let row_array =
|
|
131
|
+
let row_array = RArray::new();
|
|
115
132
|
for j in 0..n_features {
|
|
116
133
|
row_array.push(centroids[[i, j]])?;
|
|
117
134
|
}
|
|
118
135
|
centroids_array.push(row_array)?;
|
|
119
136
|
}
|
|
120
|
-
|
|
137
|
+
|
|
121
138
|
Ok((labels_array, centroids_array, inertia))
|
|
122
139
|
}
|
|
123
140
|
|
|
124
141
|
/// Predict cluster labels for new data given centroids
|
|
125
142
|
fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
let
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
143
|
+
// Convert inputs
|
|
144
|
+
let data_array: RArray = TryConvert::try_convert(data)?;
|
|
145
|
+
let centroids_array: RArray = TryConvert::try_convert(centroids)?;
|
|
146
|
+
|
|
147
|
+
let n_samples = data_array.len();
|
|
148
|
+
let k = centroids_array.len();
|
|
149
|
+
|
|
150
|
+
if n_samples == 0 {
|
|
151
|
+
return Err(Error::new(
|
|
152
|
+
magnus::exception::arg_error(),
|
|
153
|
+
"Data cannot be empty",
|
|
154
|
+
));
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
// Get dimensions
|
|
158
|
+
let first_row: RArray = data_array.entry::<RArray>(0)?;
|
|
159
|
+
let n_features = first_row.len();
|
|
160
|
+
|
|
161
|
+
// Convert data to ndarray
|
|
162
|
+
let mut data_matrix = Array2::<f64>::zeros((n_samples, n_features));
|
|
163
|
+
for i in 0..n_samples {
|
|
164
|
+
let row: RArray = data_array.entry(i as isize)?;
|
|
165
|
+
for j in 0..n_features {
|
|
166
|
+
let val: f64 = row.entry(j as isize)?;
|
|
167
|
+
data_matrix[[i, j]] = val;
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
// Convert centroids to ndarray
|
|
172
|
+
let mut centroids_matrix = Array2::<f64>::zeros((k, n_features));
|
|
173
|
+
for i in 0..k {
|
|
174
|
+
let row: RArray = centroids_array.entry(i as isize)?;
|
|
175
|
+
for j in 0..n_features {
|
|
176
|
+
let val: f64 = row.entry(j as isize)?;
|
|
177
|
+
centroids_matrix[[i, j]] = val;
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
|
|
134
181
|
// Predict labels
|
|
135
|
-
let
|
|
136
|
-
|
|
182
|
+
let ruby = magnus::Ruby::get().unwrap();
|
|
183
|
+
let labels_array = RArray::new();
|
|
184
|
+
|
|
137
185
|
for i in 0..n_samples {
|
|
138
186
|
let point = data_matrix.row(i);
|
|
139
187
|
let mut min_dist = f64::INFINITY;
|
|
140
188
|
let mut best_cluster = 0;
|
|
141
|
-
|
|
189
|
+
|
|
142
190
|
for (j, centroid) in centroids_matrix.axis_iter(Axis(0)).enumerate() {
|
|
143
191
|
let dist = euclidean_distance(&point, &centroid);
|
|
144
192
|
if dist < min_dist {
|
|
@@ -146,37 +194,30 @@ fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
|
|
|
146
194
|
best_cluster = j;
|
|
147
195
|
}
|
|
148
196
|
}
|
|
149
|
-
|
|
150
|
-
labels_array.push(ruby.
|
|
197
|
+
|
|
198
|
+
labels_array.push(Integer::from_value(ruby.eval(&format!("{}", best_cluster)).unwrap()).unwrap())?;
|
|
151
199
|
}
|
|
152
|
-
|
|
200
|
+
|
|
153
201
|
Ok(labels_array)
|
|
154
202
|
}
|
|
155
203
|
|
|
156
204
|
/// K-means++ initialization
|
|
157
|
-
fn kmeans_plusplus(data: &Array2<f64>, k: usize
|
|
205
|
+
fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
|
|
158
206
|
let n_samples = data.nrows();
|
|
159
207
|
let n_features = data.ncols();
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
let mut rng: Box<dyn RngCore> = match random_seed {
|
|
163
|
-
Some(seed) => {
|
|
164
|
-
let seed_u64 = seed as u64;
|
|
165
|
-
Box::new(StdRng::seed_from_u64(seed_u64))
|
|
166
|
-
},
|
|
167
|
-
None => Box::new(thread_rng()),
|
|
168
|
-
};
|
|
169
|
-
|
|
208
|
+
let mut rng = thread_rng();
|
|
209
|
+
|
|
170
210
|
let mut centroids = Array2::<f64>::zeros((k, n_features));
|
|
171
|
-
|
|
211
|
+
|
|
172
212
|
// Choose first centroid randomly
|
|
173
213
|
let first_idx = rng.gen_range(0..n_samples);
|
|
174
214
|
centroids.row_mut(0).assign(&data.row(first_idx));
|
|
175
|
-
|
|
215
|
+
|
|
176
216
|
// Choose remaining centroids
|
|
177
217
|
for i in 1..k {
|
|
178
218
|
let mut distances = vec![f64::INFINITY; n_samples];
|
|
179
|
-
|
|
219
|
+
|
|
220
|
+
// Calculate distance to nearest centroid for each point
|
|
180
221
|
for j in 0..n_samples {
|
|
181
222
|
for c in 0..i {
|
|
182
223
|
let dist = euclidean_distance(&data.row(j), &centroids.row(c));
|
|
@@ -185,20 +226,25 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Re
|
|
|
185
226
|
}
|
|
186
227
|
}
|
|
187
228
|
}
|
|
188
|
-
|
|
229
|
+
|
|
230
|
+
// Convert distances to probabilities
|
|
189
231
|
let total: f64 = distances.iter().map(|d| d * d).sum();
|
|
190
232
|
if total == 0.0 {
|
|
233
|
+
// All points are identical or we've selected duplicates
|
|
234
|
+
// Just use sequential points as centroids
|
|
191
235
|
if i < n_samples {
|
|
192
236
|
centroids.row_mut(i).assign(&data.row(i));
|
|
193
237
|
} else {
|
|
238
|
+
// Reuse first point if we run out
|
|
194
239
|
centroids.row_mut(i).assign(&data.row(0));
|
|
195
240
|
}
|
|
196
241
|
continue;
|
|
197
242
|
}
|
|
198
|
-
|
|
243
|
+
|
|
244
|
+
// Choose next centroid with probability proportional to squared distance
|
|
199
245
|
let mut cumsum = 0.0;
|
|
200
246
|
let rand_val: f64 = rng.gen::<f64>() * total;
|
|
201
|
-
|
|
247
|
+
|
|
202
248
|
for j in 0..n_samples {
|
|
203
249
|
cumsum += distances[j] * distances[j];
|
|
204
250
|
if cumsum >= rand_val {
|
|
@@ -207,7 +253,7 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Re
|
|
|
207
253
|
}
|
|
208
254
|
}
|
|
209
255
|
}
|
|
210
|
-
|
|
256
|
+
|
|
211
257
|
Ok(centroids)
|
|
212
258
|
}
|
|
213
259
|
|
|
@@ -218,4 +264,4 @@ fn euclidean_distance(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
|
|
|
218
264
|
.map(|(x, y)| (x - y).powi(2))
|
|
219
265
|
.sum::<f64>()
|
|
220
266
|
.sqrt()
|
|
221
|
-
}
|
|
267
|
+
}
|