clusterkit 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +11 -11
- data/ext/clusterkit/Cargo.toml +1 -1
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +23 -36
- data/ext/clusterkit/src/clustering.rs +47 -53
- data/ext/clusterkit/src/embedder.rs +44 -52
- data/ext/clusterkit/src/hnsw.rs +181 -215
- data/ext/clusterkit/src/lib.rs +5 -5
- data/ext/clusterkit/src/svd.rs +31 -33
- data/ext/clusterkit/src/utils.rs +24 -21
- data/lib/clusterkit/version.rb +1 -1
- data/lib/clusterkit.rb +1 -1
- metadata +17 -4
- data/clusterkit.gemspec +0 -45
data/ext/clusterkit/src/lib.rs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
use magnus::{
|
|
1
|
+
use magnus::{Error, Ruby};
|
|
2
2
|
|
|
3
3
|
mod embedder;
|
|
4
4
|
mod svd;
|
|
@@ -10,15 +10,15 @@ mod hnsw;
|
|
|
10
10
|
mod tests;
|
|
11
11
|
|
|
12
12
|
#[magnus::init]
|
|
13
|
-
fn init() -> Result<(), Error> {
|
|
14
|
-
let module = define_module("ClusterKit")?;
|
|
15
|
-
|
|
13
|
+
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
14
|
+
let module = ruby.define_module("ClusterKit")?;
|
|
15
|
+
|
|
16
16
|
// Initialize submodules
|
|
17
17
|
embedder::init(&module)?;
|
|
18
18
|
svd::init(&module)?;
|
|
19
19
|
utils::init(&module)?;
|
|
20
20
|
clustering::init(&module)?;
|
|
21
21
|
hnsw::init(&module)?;
|
|
22
|
-
|
|
22
|
+
|
|
23
23
|
Ok(())
|
|
24
24
|
}
|
data/ext/clusterkit/src/svd.rs
CHANGED
|
@@ -1,91 +1,89 @@
|
|
|
1
|
-
use magnus::{function, prelude::*, Error, Value, RArray};
|
|
1
|
+
use magnus::{function, prelude::*, Error, Value, RArray, Ruby};
|
|
2
2
|
use annembed::tools::svdapprox::{SvdApprox, RangeApproxMode, RangeRank, MatRepr};
|
|
3
3
|
use crate::utils::ruby_array_to_ndarray2;
|
|
4
4
|
|
|
5
5
|
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
6
6
|
let svd_module = parent.define_module("SVD")?;
|
|
7
|
-
|
|
7
|
+
|
|
8
8
|
svd_module.define_singleton_method(
|
|
9
9
|
"randomized_svd_rust",
|
|
10
10
|
function!(randomized_svd, 3),
|
|
11
11
|
)?;
|
|
12
|
-
|
|
12
|
+
|
|
13
13
|
Ok(())
|
|
14
14
|
}
|
|
15
15
|
|
|
16
16
|
fn randomized_svd(matrix: Value, k: usize, n_iter: usize) -> Result<RArray, Error> {
|
|
17
|
+
let ruby = Ruby::get().unwrap();
|
|
18
|
+
|
|
17
19
|
// Convert Ruby array to ndarray using shared helper
|
|
18
20
|
let matrix_data = ruby_array_to_ndarray2(matrix)?;
|
|
19
21
|
let (n_rows, n_cols) = matrix_data.dim();
|
|
20
|
-
|
|
22
|
+
|
|
21
23
|
if k > n_rows.min(n_cols) {
|
|
22
24
|
return Err(Error::new(
|
|
23
|
-
|
|
25
|
+
ruby.exception_arg_error(),
|
|
24
26
|
format!("k ({}) cannot be larger than min(rows, cols) = {}", k, n_rows.min(n_cols)),
|
|
25
27
|
));
|
|
26
28
|
}
|
|
27
|
-
|
|
29
|
+
|
|
28
30
|
// Create MatRepr for the full matrix
|
|
29
31
|
let mat_repr = MatRepr::from_array2(matrix_data.clone());
|
|
30
|
-
|
|
32
|
+
|
|
31
33
|
// Create SvdApprox instance
|
|
32
34
|
let mut svd_approx = SvdApprox::new(&mat_repr);
|
|
33
|
-
|
|
35
|
+
|
|
34
36
|
// Set up parameters for randomized SVD
|
|
35
|
-
// Use RANK mode to specify the desired rank
|
|
36
37
|
let params = RangeApproxMode::RANK(RangeRank::new(k, n_iter));
|
|
37
|
-
|
|
38
|
+
|
|
38
39
|
// Perform SVD
|
|
39
40
|
let svd_result = svd_approx.direct_svd(params)
|
|
40
|
-
.map_err(|e| Error::new(
|
|
41
|
-
|
|
42
|
-
// Extract U, S, V from the result
|
|
41
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e))?;
|
|
42
|
+
|
|
43
|
+
// Extract U, S, V from the result
|
|
43
44
|
let u_matrix = svd_result.u.ok_or_else(|| {
|
|
44
|
-
Error::new(
|
|
45
|
+
Error::new(ruby.exception_runtime_error(), "No U matrix in SVD result")
|
|
45
46
|
})?;
|
|
46
|
-
|
|
47
|
+
|
|
47
48
|
let s_values = svd_result.s.ok_or_else(|| {
|
|
48
|
-
Error::new(
|
|
49
|
+
Error::new(ruby.exception_runtime_error(), "No S values in SVD result")
|
|
49
50
|
})?;
|
|
50
|
-
|
|
51
|
+
|
|
51
52
|
let vt_matrix = svd_result.vt.ok_or_else(|| {
|
|
52
|
-
Error::new(
|
|
53
|
+
Error::new(ruby.exception_runtime_error(), "No V^T matrix in SVD result")
|
|
53
54
|
})?;
|
|
54
|
-
|
|
55
|
+
|
|
55
56
|
// Convert results to Ruby arrays
|
|
56
|
-
|
|
57
|
-
let u_ruby = RArray::new();
|
|
57
|
+
let u_ruby = ruby.ary_new();
|
|
58
58
|
let u_shape = u_matrix.shape();
|
|
59
59
|
for i in 0..u_shape[0] {
|
|
60
|
-
let row = RArray::new();
|
|
60
|
+
let row = ruby.ary_new();
|
|
61
61
|
for j in 0..u_shape[1] {
|
|
62
62
|
row.push(u_matrix[[i, j]])?;
|
|
63
63
|
}
|
|
64
64
|
u_ruby.push(row)?;
|
|
65
65
|
}
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
let s_ruby = RArray::new();
|
|
66
|
+
|
|
67
|
+
let s_ruby = ruby.ary_new();
|
|
69
68
|
for val in s_values.iter() {
|
|
70
69
|
s_ruby.push(*val)?;
|
|
71
70
|
}
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
let v_ruby = RArray::new();
|
|
71
|
+
|
|
72
|
+
let v_ruby = ruby.ary_new();
|
|
75
73
|
let vt_shape = vt_matrix.shape();
|
|
76
74
|
for i in 0..vt_shape[0] {
|
|
77
|
-
let row = RArray::new();
|
|
75
|
+
let row = ruby.ary_new();
|
|
78
76
|
for j in 0..vt_shape[1] {
|
|
79
77
|
row.push(vt_matrix[[i, j]])?;
|
|
80
78
|
}
|
|
81
79
|
v_ruby.push(row)?;
|
|
82
80
|
}
|
|
83
|
-
|
|
81
|
+
|
|
84
82
|
// Return [U, S, V^T] as a Ruby array
|
|
85
|
-
let result = RArray::new();
|
|
83
|
+
let result = ruby.ary_new();
|
|
86
84
|
result.push(u_ruby)?;
|
|
87
85
|
result.push(s_ruby)?;
|
|
88
86
|
result.push(v_ruby)?;
|
|
89
|
-
|
|
87
|
+
|
|
90
88
|
Ok(result)
|
|
91
|
-
}
|
|
89
|
+
}
|
data/ext/clusterkit/src/utils.rs
CHANGED
|
@@ -1,34 +1,34 @@
|
|
|
1
|
-
use magnus::{function, prelude::*, Error, Value, RArray, TryConvert, Float, Integer};
|
|
1
|
+
use magnus::{function, prelude::*, Error, Value, RArray, TryConvert, Float, Integer, Ruby};
|
|
2
2
|
use ndarray::Array2;
|
|
3
3
|
|
|
4
4
|
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
5
5
|
let utils_module = parent.define_module("Utils")?;
|
|
6
|
-
|
|
6
|
+
|
|
7
7
|
utils_module.define_singleton_method(
|
|
8
8
|
"estimate_intrinsic_dimension_rust",
|
|
9
9
|
function!(estimate_intrinsic_dimension, 2),
|
|
10
10
|
)?;
|
|
11
|
-
|
|
11
|
+
|
|
12
12
|
utils_module.define_singleton_method(
|
|
13
13
|
"estimate_hubness_rust",
|
|
14
14
|
function!(estimate_hubness, 1),
|
|
15
15
|
)?;
|
|
16
|
-
|
|
16
|
+
|
|
17
17
|
Ok(())
|
|
18
18
|
}
|
|
19
19
|
|
|
20
20
|
fn estimate_intrinsic_dimension(_data: Value, _k_neighbors: usize) -> Result<f64, Error> {
|
|
21
|
-
|
|
21
|
+
let ruby = Ruby::get().unwrap();
|
|
22
22
|
Err(Error::new(
|
|
23
|
-
|
|
23
|
+
ruby.exception_not_imp_error(),
|
|
24
24
|
"Dimension estimation not implemented yet",
|
|
25
25
|
))
|
|
26
26
|
}
|
|
27
27
|
|
|
28
28
|
fn estimate_hubness(_data: Value) -> Result<Value, Error> {
|
|
29
|
-
|
|
29
|
+
let ruby = Ruby::get().unwrap();
|
|
30
30
|
Err(Error::new(
|
|
31
|
-
|
|
31
|
+
ruby.exception_not_imp_error(),
|
|
32
32
|
"Hubness estimation not implemented yet",
|
|
33
33
|
))
|
|
34
34
|
}
|
|
@@ -36,12 +36,13 @@ fn estimate_hubness(_data: Value) -> Result<Value, Error> {
|
|
|
36
36
|
/// Convert Ruby 2D array to ndarray Array2<f64>
|
|
37
37
|
/// Handles validation and provides consistent error messages
|
|
38
38
|
pub fn ruby_array_to_ndarray2(data: Value) -> Result<Array2<f64>, Error> {
|
|
39
|
+
let ruby = Ruby::get().unwrap();
|
|
39
40
|
let rarray: RArray = TryConvert::try_convert(data)?;
|
|
40
41
|
let n_samples = rarray.len();
|
|
41
42
|
|
|
42
43
|
if n_samples == 0 {
|
|
43
44
|
return Err(Error::new(
|
|
44
|
-
|
|
45
|
+
ruby.exception_arg_error(),
|
|
45
46
|
"Data cannot be empty",
|
|
46
47
|
));
|
|
47
48
|
}
|
|
@@ -52,7 +53,7 @@ pub fn ruby_array_to_ndarray2(data: Value) -> Result<Array2<f64>, Error> {
|
|
|
52
53
|
|
|
53
54
|
if n_features == 0 {
|
|
54
55
|
return Err(Error::new(
|
|
55
|
-
|
|
56
|
+
ruby.exception_arg_error(),
|
|
56
57
|
"Data rows cannot be empty",
|
|
57
58
|
));
|
|
58
59
|
}
|
|
@@ -61,11 +62,11 @@ pub fn ruby_array_to_ndarray2(data: Value) -> Result<Array2<f64>, Error> {
|
|
|
61
62
|
let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
|
|
62
63
|
for i in 0..n_samples {
|
|
63
64
|
let row: RArray = rarray.entry(i as isize)?;
|
|
64
|
-
|
|
65
|
+
|
|
65
66
|
// Validate row length consistency
|
|
66
67
|
if row.len() != n_features {
|
|
67
68
|
return Err(Error::new(
|
|
68
|
-
|
|
69
|
+
ruby.exception_arg_error(),
|
|
69
70
|
format!("Row {} has {} elements, expected {}", i, row.len(), n_features),
|
|
70
71
|
));
|
|
71
72
|
}
|
|
@@ -80,14 +81,15 @@ pub fn ruby_array_to_ndarray2(data: Value) -> Result<Array2<f64>, Error> {
|
|
|
80
81
|
}
|
|
81
82
|
|
|
82
83
|
/// Convert Ruby 2D array to Vec<Vec<f64>>
|
|
83
|
-
/// Handles validation and provides consistent error messages
|
|
84
|
+
/// Handles validation and provides consistent error messages
|
|
84
85
|
pub fn ruby_array_to_vec_vec_f64(data: Value) -> Result<Vec<Vec<f64>>, Error> {
|
|
86
|
+
let ruby = Ruby::get().unwrap();
|
|
85
87
|
let rarray: RArray = TryConvert::try_convert(data)?;
|
|
86
88
|
let n_samples = rarray.len();
|
|
87
89
|
|
|
88
90
|
if n_samples == 0 {
|
|
89
91
|
return Err(Error::new(
|
|
90
|
-
|
|
92
|
+
ruby.exception_arg_error(),
|
|
91
93
|
"Data cannot be empty",
|
|
92
94
|
));
|
|
93
95
|
}
|
|
@@ -98,13 +100,13 @@ pub fn ruby_array_to_vec_vec_f64(data: Value) -> Result<Vec<Vec<f64>>, Error> {
|
|
|
98
100
|
for i in 0..n_samples {
|
|
99
101
|
let row: RArray = rarray.entry(i as isize)?;
|
|
100
102
|
let n_features = row.len();
|
|
101
|
-
|
|
103
|
+
|
|
102
104
|
// Check row length consistency
|
|
103
105
|
match expected_features {
|
|
104
106
|
Some(expected) => {
|
|
105
107
|
if n_features != expected {
|
|
106
108
|
return Err(Error::new(
|
|
107
|
-
|
|
109
|
+
ruby.exception_arg_error(),
|
|
108
110
|
format!("Row {} has {} elements, expected {}", i, n_features, expected),
|
|
109
111
|
));
|
|
110
112
|
}
|
|
@@ -126,12 +128,13 @@ pub fn ruby_array_to_vec_vec_f64(data: Value) -> Result<Vec<Vec<f64>>, Error> {
|
|
|
126
128
|
/// Convert Ruby 2D array to Vec<Vec<f32>>
|
|
127
129
|
/// For algorithms that require f32 precision (like UMAP)
|
|
128
130
|
pub fn ruby_array_to_vec_vec_f32(data: Value) -> Result<Vec<Vec<f32>>, Error> {
|
|
131
|
+
let ruby = Ruby::get().unwrap();
|
|
129
132
|
let rarray: RArray = TryConvert::try_convert(data)?;
|
|
130
133
|
let array_len = rarray.len();
|
|
131
134
|
|
|
132
135
|
if array_len == 0 {
|
|
133
136
|
return Err(Error::new(
|
|
134
|
-
|
|
137
|
+
ruby.exception_arg_error(),
|
|
135
138
|
"Input data cannot be empty",
|
|
136
139
|
));
|
|
137
140
|
}
|
|
@@ -142,7 +145,7 @@ pub fn ruby_array_to_vec_vec_f32(data: Value) -> Result<Vec<Vec<f32>>, Error> {
|
|
|
142
145
|
let row = rarray.entry::<Value>(i as isize)?;
|
|
143
146
|
let row_array = RArray::try_convert(row).map_err(|_| {
|
|
144
147
|
Error::new(
|
|
145
|
-
|
|
148
|
+
ruby.exception_type_error(),
|
|
146
149
|
"Expected array of arrays (2D array)",
|
|
147
150
|
)
|
|
148
151
|
})?;
|
|
@@ -158,7 +161,7 @@ pub fn ruby_array_to_vec_vec_f32(data: Value) -> Result<Vec<Vec<f32>>, Error> {
|
|
|
158
161
|
i.to_i64()? as f32
|
|
159
162
|
} else {
|
|
160
163
|
return Err(Error::new(
|
|
161
|
-
|
|
164
|
+
ruby.exception_type_error(),
|
|
162
165
|
"All values must be numeric",
|
|
163
166
|
));
|
|
164
167
|
};
|
|
@@ -168,7 +171,7 @@ pub fn ruby_array_to_vec_vec_f32(data: Value) -> Result<Vec<Vec<f32>>, Error> {
|
|
|
168
171
|
// Validate row length consistency
|
|
169
172
|
if !rust_data.is_empty() && rust_row.len() != rust_data[0].len() {
|
|
170
173
|
return Err(Error::new(
|
|
171
|
-
|
|
174
|
+
ruby.exception_arg_error(),
|
|
172
175
|
"All rows must have the same length",
|
|
173
176
|
));
|
|
174
177
|
}
|
|
@@ -177,4 +180,4 @@ pub fn ruby_array_to_vec_vec_f32(data: Value) -> Result<Vec<Vec<f32>>, Error> {
|
|
|
177
180
|
}
|
|
178
181
|
|
|
179
182
|
Ok(rust_data)
|
|
180
|
-
}
|
|
183
|
+
}
|
data/lib/clusterkit/version.rb
CHANGED
data/lib/clusterkit.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: clusterkit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.4
|
|
4
|
+
version: 0.2.5
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Chris Petersen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-03-24 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -24,6 +24,20 @@ dependencies:
|
|
|
24
24
|
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
26
|
version: '0.9'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: benchmark
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ">="
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0'
|
|
34
|
+
type: :development
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ">="
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '0'
|
|
27
41
|
- !ruby/object:Gem::Dependency
|
|
28
42
|
name: csv
|
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -130,7 +144,6 @@ files:
|
|
|
130
144
|
- PYTHON_COMPARISON.md
|
|
131
145
|
- README.md
|
|
132
146
|
- Rakefile
|
|
133
|
-
- clusterkit.gemspec
|
|
134
147
|
- docs/KNOWN_ISSUES.md
|
|
135
148
|
- docs/RUST_ERROR_HANDLING.md
|
|
136
149
|
- docs/TEST_FIXTURES.md
|
|
@@ -194,7 +207,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
194
207
|
- !ruby/object:Gem::Version
|
|
195
208
|
version: '0'
|
|
196
209
|
requirements: []
|
|
197
|
-
rubygems_version: 3.5.
|
|
210
|
+
rubygems_version: 3.5.22
|
|
198
211
|
signing_key:
|
|
199
212
|
specification_version: 4
|
|
200
213
|
summary: High-performance clustering and dimensionality reduction for Ruby
|
data/clusterkit.gemspec
DELETED
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
require_relative "lib/clusterkit/version"
|
|
2
|
-
|
|
3
|
-
Gem::Specification.new do |spec|
|
|
4
|
-
spec.name = "clusterkit"
|
|
5
|
-
spec.version = ClusterKit::VERSION
|
|
6
|
-
spec.authors = ["Chris Petersen"]
|
|
7
|
-
spec.email = ["chris@petersen.io"]
|
|
8
|
-
|
|
9
|
-
spec.summary = "High-performance clustering and dimensionality reduction for Ruby"
|
|
10
|
-
spec.description = "A comprehensive clustering toolkit for Ruby, providing UMAP, PCA, K-means, HDBSCAN and more. Built on top of annembed and hdbscan Rust crates for blazing-fast performance."
|
|
11
|
-
spec.homepage = "https://github.com/scientist-labs/clusterkit"
|
|
12
|
-
spec.license = "MIT"
|
|
13
|
-
spec.required_ruby_version = ">= 2.7.0"
|
|
14
|
-
|
|
15
|
-
spec.metadata["homepage_uri"] = spec.homepage
|
|
16
|
-
spec.metadata["source_code_uri"] = spec.homepage
|
|
17
|
-
spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
|
|
18
|
-
|
|
19
|
-
# Specify which files should be added to the gem when it is released.
|
|
20
|
-
spec.files = Dir.chdir(__dir__) do
|
|
21
|
-
`git ls-files -z`.split("\x0").reject do |f|
|
|
22
|
-
(f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
|
|
23
|
-
end + Dir["ext/**/*.rs", "ext/**/*.toml"]
|
|
24
|
-
end
|
|
25
|
-
spec.bindir = "exe"
|
|
26
|
-
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
27
|
-
spec.require_paths = ["lib"]
|
|
28
|
-
spec.extensions = ["ext/clusterkit/extconf.rb"]
|
|
29
|
-
|
|
30
|
-
# Runtime dependencies
|
|
31
|
-
# Numo is optional but recommended for better performance
|
|
32
|
-
# spec.add_dependency "numo-narray", "~> 0.9"
|
|
33
|
-
spec.add_dependency "rb_sys", "~> 0.9"
|
|
34
|
-
|
|
35
|
-
# Development dependencies
|
|
36
|
-
spec.add_development_dependency "csv"
|
|
37
|
-
spec.add_development_dependency "rake", "~> 13.0"
|
|
38
|
-
spec.add_development_dependency "rake-compiler", "~> 1.2"
|
|
39
|
-
spec.add_development_dependency "rspec", "~> 3.0"
|
|
40
|
-
spec.add_development_dependency "simplecov", "~> 0.22"
|
|
41
|
-
spec.add_development_dependency "yard", "~> 0.9"
|
|
42
|
-
|
|
43
|
-
# For more information and examples about making a new gem, check out our
|
|
44
|
-
# guide at: https://bundler.io/guides/creating_gem.html
|
|
45
|
-
end
|