clusterkit 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/clusterkit.gemspec +2 -2
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +5 -27
- data/ext/clusterkit/src/clustering.rs +26 -66
- data/ext/clusterkit/src/embedder.rs +6 -81
- data/ext/clusterkit/src/svd.rs +5 -26
- data/ext/clusterkit/src/utils.rs +148 -1
- data/lib/clusterkit/clustering/hdbscan.rb +4 -17
- data/lib/clusterkit/clustering.rb +4 -23
- data/lib/clusterkit/data_validator.rb +132 -0
- data/lib/clusterkit/dimensionality/pca.rb +12 -12
- data/lib/clusterkit/dimensionality/svd.rb +47 -16
- data/lib/clusterkit/dimensionality/umap.rb +7 -40
- data/lib/clusterkit/version.rb +1 -1
- metadata +17 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ed8756bb9b2d2cf253c849f529b18aa57c0043fe0e0b65997fffc62522d21fd6
|
4
|
+
data.tar.gz: adcb4a5186e042a0be6d9e712b8dbff5751c469295ccbef9034bf40688658c21
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3f16b15661cf7db9a42f080dc5cb8835ded9f36518f0f3e095ea50bf7c93555906a1b3233744b11551cb2d45c1b3a9b1d2e5a64051d770b277e6a0669e0698b2
|
7
|
+
data.tar.gz: 36e4df00fbfb48b44bdcacbcb29e9a077c5711cc5cbc79d0ae9091c604db489bea06fa1c354ad89a87aa50e5c7dc8fe2b75f426e1942ae01d46887dda72b5ab1
|
data/clusterkit.gemspec
CHANGED
@@ -30,16 +30,16 @@ Gem::Specification.new do |spec|
|
|
30
30
|
# Runtime dependencies
|
31
31
|
# Numo is optional but recommended for better performance
|
32
32
|
# spec.add_dependency "numo-narray", "~> 0.9"
|
33
|
+
spec.add_dependency "rb_sys", "~> 0.9"
|
33
34
|
|
34
35
|
# Development dependencies
|
35
36
|
spec.add_development_dependency "csv"
|
36
37
|
spec.add_development_dependency "rake", "~> 13.0"
|
37
38
|
spec.add_development_dependency "rake-compiler", "~> 1.2"
|
38
|
-
spec.add_development_dependency "rb_sys", "~> 0.9"
|
39
39
|
spec.add_development_dependency "rspec", "~> 3.0"
|
40
40
|
spec.add_development_dependency "simplecov", "~> 0.22"
|
41
41
|
spec.add_development_dependency "yard", "~> 0.9"
|
42
42
|
|
43
43
|
# For more information and examples about making a new gem, check out our
|
44
44
|
# guide at: https://bundler.io/guides/creating_gem.html
|
45
|
-
end
|
45
|
+
end
|
@@ -1,5 +1,6 @@
|
|
1
|
-
use magnus::{function, prelude::*, Error, Value, RArray, RHash, Integer
|
1
|
+
use magnus::{function, prelude::*, Error, Value, RArray, RHash, Integer};
|
2
2
|
use hdbscan::{Hdbscan, HdbscanHyperParams};
|
3
|
+
use crate::utils::ruby_array_to_vec_vec_f64;
|
3
4
|
|
4
5
|
/// Perform HDBSCAN clustering
|
5
6
|
/// Returns a hash with labels and basic statistics
|
@@ -9,32 +10,9 @@ pub fn hdbscan_fit(
|
|
9
10
|
min_cluster_size: usize,
|
10
11
|
metric: String,
|
11
12
|
) -> Result<RHash, Error> {
|
12
|
-
// Convert Ruby array to
|
13
|
-
let
|
14
|
-
let n_samples =
|
15
|
-
|
16
|
-
if n_samples == 0 {
|
17
|
-
return Err(Error::new(
|
18
|
-
magnus::exception::arg_error(),
|
19
|
-
"Data cannot be empty",
|
20
|
-
));
|
21
|
-
}
|
22
|
-
|
23
|
-
// Get dimensions
|
24
|
-
let first_row: RArray = rarray.entry::<RArray>(0)?;
|
25
|
-
let n_features = first_row.len();
|
26
|
-
|
27
|
-
// Convert to Vec<Vec<f64>> format expected by hdbscan crate
|
28
|
-
let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
|
29
|
-
for i in 0..n_samples {
|
30
|
-
let row: RArray = rarray.entry(i as isize)?;
|
31
|
-
let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
|
32
|
-
for j in 0..n_features {
|
33
|
-
let val: f64 = row.entry(j as isize)?;
|
34
|
-
row_vec.push(val);
|
35
|
-
}
|
36
|
-
data_vec.push(row_vec);
|
37
|
-
}
|
13
|
+
// Convert Ruby array to Vec<Vec<f64>> using shared helper
|
14
|
+
let data_vec = ruby_array_to_vec_vec_f64(data)?;
|
15
|
+
let n_samples = data_vec.len();
|
38
16
|
|
39
17
|
// Note: hdbscan crate doesn't support custom metrics directly
|
40
18
|
// We'll use the default Euclidean distance for now
|
@@ -1,6 +1,9 @@
|
|
1
|
-
use magnus::{function, prelude::*, Error, Value, RArray, Integer
|
1
|
+
use magnus::{function, prelude::*, Error, Value, RArray, Integer};
|
2
2
|
use ndarray::{Array1, Array2, ArrayView1, Axis};
|
3
3
|
use rand::prelude::*;
|
4
|
+
use rand::rngs::StdRng;
|
5
|
+
use rand::SeedableRng;
|
6
|
+
use crate::utils::{ruby_array_to_ndarray2};
|
4
7
|
|
5
8
|
mod hdbscan_wrapper;
|
6
9
|
|
@@ -9,7 +12,7 @@ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
9
12
|
|
10
13
|
clustering_module.define_singleton_method(
|
11
14
|
"kmeans_rust",
|
12
|
-
function!(kmeans,
|
15
|
+
function!(kmeans, 4),
|
13
16
|
)?;
|
14
17
|
|
15
18
|
clustering_module.define_singleton_method(
|
@@ -25,21 +28,10 @@ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
25
28
|
|
26
29
|
/// Perform K-means clustering
|
27
30
|
/// Returns (labels, centroids, inertia)
|
28
|
-
fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64), Error> {
|
29
|
-
// Convert Ruby array to ndarray
|
30
|
-
let
|
31
|
-
let n_samples =
|
32
|
-
|
33
|
-
if n_samples == 0 {
|
34
|
-
return Err(Error::new(
|
35
|
-
magnus::exception::arg_error(),
|
36
|
-
"Data cannot be empty",
|
37
|
-
));
|
38
|
-
}
|
39
|
-
|
40
|
-
// Get dimensions
|
41
|
-
let first_row: RArray = rarray.entry::<RArray>(0)?;
|
42
|
-
let n_features = first_row.len();
|
31
|
+
fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> Result<(RArray, RArray, f64), Error> {
|
32
|
+
// Convert Ruby array to ndarray using shared helper
|
33
|
+
let data_array = ruby_array_to_ndarray2(data)?;
|
34
|
+
let (n_samples, n_features) = data_array.dim();
|
43
35
|
|
44
36
|
if k > n_samples {
|
45
37
|
return Err(Error::new(
|
@@ -48,18 +40,8 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
|
|
48
40
|
));
|
49
41
|
}
|
50
42
|
|
51
|
-
// Convert to ndarray
|
52
|
-
let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
|
53
|
-
for i in 0..n_samples {
|
54
|
-
let row: RArray = rarray.entry(i as isize)?;
|
55
|
-
for j in 0..n_features {
|
56
|
-
let val: f64 = row.entry(j as isize)?;
|
57
|
-
data_array[[i, j]] = val;
|
58
|
-
}
|
59
|
-
}
|
60
|
-
|
61
43
|
// Initialize centroids using K-means++
|
62
|
-
let mut centroids = kmeans_plusplus(&data_array, k)?;
|
44
|
+
let mut centroids = kmeans_plusplus(&data_array, k, random_seed)?;
|
63
45
|
let mut labels = vec![0usize; n_samples];
|
64
46
|
let mut prev_labels = vec![0usize; n_samples];
|
65
47
|
|
@@ -140,43 +122,12 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
|
|
140
122
|
|
141
123
|
/// Predict cluster labels for new data given centroids
|
142
124
|
fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
|
143
|
-
// Convert inputs
|
144
|
-
let
|
145
|
-
let
|
146
|
-
|
147
|
-
let n_samples = data_array.len();
|
148
|
-
let k = centroids_array.len();
|
149
|
-
|
150
|
-
if n_samples == 0 {
|
151
|
-
return Err(Error::new(
|
152
|
-
magnus::exception::arg_error(),
|
153
|
-
"Data cannot be empty",
|
154
|
-
));
|
155
|
-
}
|
156
|
-
|
157
|
-
// Get dimensions
|
158
|
-
let first_row: RArray = data_array.entry::<RArray>(0)?;
|
159
|
-
let n_features = first_row.len();
|
160
|
-
|
161
|
-
// Convert data to ndarray
|
162
|
-
let mut data_matrix = Array2::<f64>::zeros((n_samples, n_features));
|
163
|
-
for i in 0..n_samples {
|
164
|
-
let row: RArray = data_array.entry(i as isize)?;
|
165
|
-
for j in 0..n_features {
|
166
|
-
let val: f64 = row.entry(j as isize)?;
|
167
|
-
data_matrix[[i, j]] = val;
|
168
|
-
}
|
169
|
-
}
|
125
|
+
// Convert inputs using shared helpers
|
126
|
+
let data_matrix = ruby_array_to_ndarray2(data)?;
|
127
|
+
let centroids_matrix = ruby_array_to_ndarray2(centroids)?;
|
170
128
|
|
171
|
-
|
172
|
-
let
|
173
|
-
for i in 0..k {
|
174
|
-
let row: RArray = centroids_array.entry(i as isize)?;
|
175
|
-
for j in 0..n_features {
|
176
|
-
let val: f64 = row.entry(j as isize)?;
|
177
|
-
centroids_matrix[[i, j]] = val;
|
178
|
-
}
|
179
|
-
}
|
129
|
+
let (n_samples, _) = data_matrix.dim();
|
130
|
+
let (_k, _) = centroids_matrix.dim();
|
180
131
|
|
181
132
|
// Predict labels
|
182
133
|
let ruby = magnus::Ruby::get().unwrap();
|
@@ -202,10 +153,19 @@ fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
|
|
202
153
|
}
|
203
154
|
|
204
155
|
/// K-means++ initialization
|
205
|
-
fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
|
156
|
+
fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Result<Array2<f64>, Error> {
|
206
157
|
let n_samples = data.nrows();
|
207
158
|
let n_features = data.ncols();
|
208
|
-
|
159
|
+
|
160
|
+
// Use seeded RNG if seed is provided, otherwise use thread_rng
|
161
|
+
let mut rng: Box<dyn RngCore> = match random_seed {
|
162
|
+
Some(seed) => {
|
163
|
+
// Convert i64 to u64 for seeding (negative numbers wrap around)
|
164
|
+
let seed_u64 = seed as u64;
|
165
|
+
Box::new(StdRng::seed_from_u64(seed_u64))
|
166
|
+
},
|
167
|
+
None => Box::new(thread_rng()),
|
168
|
+
};
|
209
169
|
|
210
170
|
let mut centroids = Array2::<f64>::zeros((k, n_features));
|
211
171
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
use magnus::{Error, RArray, RHash, Value, TryConvert, Integer,
|
1
|
+
use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object};
|
2
2
|
use magnus::value::ReprValue;
|
3
3
|
use hnsw_rs::prelude::*;
|
4
4
|
use annembed::prelude::*;
|
@@ -7,6 +7,7 @@ use std::io::{Write, Read};
|
|
7
7
|
use std::cell::RefCell;
|
8
8
|
use bincode;
|
9
9
|
use serde::{Serialize, Deserialize};
|
10
|
+
use crate::utils::ruby_array_to_vec_vec_f32;
|
10
11
|
|
11
12
|
// Simple struct to serialize UMAP results
|
12
13
|
#[derive(Serialize, Deserialize)]
|
@@ -124,61 +125,8 @@ impl RustUMAP {
|
|
124
125
|
}
|
125
126
|
|
126
127
|
fn fit_transform(&self, data: Value) -> Result<RArray, Error> {
|
127
|
-
// Convert Ruby array to Rust Vec<Vec<
|
128
|
-
let
|
129
|
-
let mut rust_data: Vec<Vec<f64>> = Vec::new();
|
130
|
-
|
131
|
-
// Get array length
|
132
|
-
let array_len = ruby_array.len();
|
133
|
-
|
134
|
-
for i in 0..array_len {
|
135
|
-
let row = ruby_array.entry::<Value>(i as isize)?;
|
136
|
-
let row_array = RArray::try_convert(row).map_err(|_| {
|
137
|
-
Error::new(
|
138
|
-
magnus::exception::type_error(),
|
139
|
-
"Expected array of arrays (2D array)",
|
140
|
-
)
|
141
|
-
})?;
|
142
|
-
|
143
|
-
let mut rust_row: Vec<f64> = Vec::new();
|
144
|
-
let row_len = row_array.len();
|
145
|
-
|
146
|
-
for j in 0..row_len {
|
147
|
-
let val = row_array.entry::<Value>(j as isize)?;
|
148
|
-
let float_val = if let Ok(f) = Float::try_convert(val) {
|
149
|
-
f.to_f64()
|
150
|
-
} else if let Ok(i) = Integer::try_convert(val) {
|
151
|
-
i.to_i64()? as f64
|
152
|
-
} else {
|
153
|
-
return Err(Error::new(
|
154
|
-
magnus::exception::type_error(),
|
155
|
-
"All values must be numeric",
|
156
|
-
));
|
157
|
-
};
|
158
|
-
rust_row.push(float_val);
|
159
|
-
}
|
160
|
-
|
161
|
-
if !rust_data.is_empty() && rust_row.len() != rust_data[0].len() {
|
162
|
-
return Err(Error::new(
|
163
|
-
magnus::exception::arg_error(),
|
164
|
-
"All rows must have the same length",
|
165
|
-
));
|
166
|
-
}
|
167
|
-
|
168
|
-
rust_data.push(rust_row);
|
169
|
-
}
|
170
|
-
|
171
|
-
if rust_data.is_empty() {
|
172
|
-
return Err(Error::new(
|
173
|
-
magnus::exception::arg_error(),
|
174
|
-
"Input data cannot be empty",
|
175
|
-
));
|
176
|
-
}
|
177
|
-
|
178
|
-
// Convert to Vec<Vec<f32>> for HNSW
|
179
|
-
let data_f32: Vec<Vec<f32>> = rust_data.iter()
|
180
|
-
.map(|row| row.iter().map(|&x| x as f32).collect())
|
181
|
-
.collect();
|
128
|
+
// Convert Ruby array to Rust Vec<Vec<f32>> using shared helper
|
129
|
+
let data_f32 = ruby_array_to_vec_vec_f32(data)?;
|
182
130
|
|
183
131
|
// Build HNSW graph
|
184
132
|
let ef_c = 50;
|
@@ -331,31 +279,8 @@ impl RustUMAP {
|
|
331
279
|
let training_embeddings_ref = training_embeddings.as_ref()
|
332
280
|
.ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No embeddings available."))?;
|
333
281
|
|
334
|
-
// Convert input data to Rust format
|
335
|
-
let
|
336
|
-
let mut new_data: Vec<Vec<f32>> = Vec::new();
|
337
|
-
|
338
|
-
for i in 0..ruby_array.len() {
|
339
|
-
let row = ruby_array.entry::<Value>(i as isize)?;
|
340
|
-
let row_array = RArray::try_convert(row)?;
|
341
|
-
let mut rust_row: Vec<f32> = Vec::new();
|
342
|
-
|
343
|
-
for j in 0..row_array.len() {
|
344
|
-
let val = row_array.entry::<Value>(j as isize)?;
|
345
|
-
let float_val = if let Ok(f) = Float::try_convert(val) {
|
346
|
-
f.to_f64() as f32
|
347
|
-
} else if let Ok(i) = Integer::try_convert(val) {
|
348
|
-
i.to_i64()? as f32
|
349
|
-
} else {
|
350
|
-
return Err(Error::new(
|
351
|
-
magnus::exception::type_error(),
|
352
|
-
"All values must be numeric",
|
353
|
-
));
|
354
|
-
};
|
355
|
-
rust_row.push(float_val);
|
356
|
-
}
|
357
|
-
new_data.push(rust_row);
|
358
|
-
}
|
282
|
+
// Convert input data to Rust format using shared helper
|
283
|
+
let new_data = ruby_array_to_vec_vec_f32(data)?;
|
359
284
|
|
360
285
|
// For each new point, find k nearest neighbors in training data
|
361
286
|
// and average their embeddings (weighted by distance)
|
data/ext/clusterkit/src/svd.rs
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
use magnus::{function, prelude::*, Error, Value, RArray
|
1
|
+
use magnus::{function, prelude::*, Error, Value, RArray};
|
2
2
|
use annembed::tools::svdapprox::{SvdApprox, RangeApproxMode, RangeRank, MatRepr};
|
3
|
-
use
|
3
|
+
use crate::utils::ruby_array_to_ndarray2;
|
4
4
|
|
5
5
|
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
6
6
|
let svd_module = parent.define_module("SVD")?;
|
@@ -14,20 +14,9 @@ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
14
14
|
}
|
15
15
|
|
16
16
|
fn randomized_svd(matrix: Value, k: usize, n_iter: usize) -> Result<RArray, Error> {
|
17
|
-
// Convert Ruby array to ndarray
|
18
|
-
let
|
19
|
-
|
20
|
-
// Check if it's a 2D array
|
21
|
-
let first_row: RArray = rarray.entry::<RArray>(0)?;
|
22
|
-
let n_rows = rarray.len();
|
23
|
-
let n_cols = first_row.len();
|
24
|
-
|
25
|
-
if n_rows == 0 || n_cols == 0 {
|
26
|
-
return Err(Error::new(
|
27
|
-
magnus::exception::arg_error(),
|
28
|
-
"Matrix cannot be empty",
|
29
|
-
));
|
30
|
-
}
|
17
|
+
// Convert Ruby array to ndarray using shared helper
|
18
|
+
let matrix_data = ruby_array_to_ndarray2(matrix)?;
|
19
|
+
let (n_rows, n_cols) = matrix_data.dim();
|
31
20
|
|
32
21
|
if k > n_rows.min(n_cols) {
|
33
22
|
return Err(Error::new(
|
@@ -36,16 +25,6 @@ fn randomized_svd(matrix: Value, k: usize, n_iter: usize) -> Result<RArray, Erro
|
|
36
25
|
));
|
37
26
|
}
|
38
27
|
|
39
|
-
// Convert to ndarray Array2
|
40
|
-
let mut matrix_data = Array2::<f64>::zeros((n_rows, n_cols));
|
41
|
-
for i in 0..n_rows {
|
42
|
-
let row: RArray = rarray.entry(i as isize)?;
|
43
|
-
for j in 0..n_cols {
|
44
|
-
let val: f64 = row.entry(j as isize)?;
|
45
|
-
matrix_data[[i, j]] = val;
|
46
|
-
}
|
47
|
-
}
|
48
|
-
|
49
28
|
// Create MatRepr for the full matrix
|
50
29
|
let mat_repr = MatRepr::from_array2(matrix_data.clone());
|
51
30
|
|
data/ext/clusterkit/src/utils.rs
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
use magnus::{function, prelude::*, Error, Value};
|
1
|
+
use magnus::{function, prelude::*, Error, Value, RArray, TryConvert, Float, Integer};
|
2
|
+
use ndarray::Array2;
|
2
3
|
|
3
4
|
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
4
5
|
let utils_module = parent.define_module("Utils")?;
|
@@ -30,4 +31,150 @@ fn estimate_hubness(_data: Value) -> Result<Value, Error> {
|
|
30
31
|
magnus::exception::not_imp_error(),
|
31
32
|
"Hubness estimation not implemented yet",
|
32
33
|
))
|
34
|
+
}
|
35
|
+
|
36
|
+
/// Convert Ruby 2D array to ndarray Array2<f64>
|
37
|
+
/// Handles validation and provides consistent error messages
|
38
|
+
pub fn ruby_array_to_ndarray2(data: Value) -> Result<Array2<f64>, Error> {
|
39
|
+
let rarray: RArray = TryConvert::try_convert(data)?;
|
40
|
+
let n_samples = rarray.len();
|
41
|
+
|
42
|
+
if n_samples == 0 {
|
43
|
+
return Err(Error::new(
|
44
|
+
magnus::exception::arg_error(),
|
45
|
+
"Data cannot be empty",
|
46
|
+
));
|
47
|
+
}
|
48
|
+
|
49
|
+
// Get dimensions from first row
|
50
|
+
let first_row: RArray = rarray.entry::<RArray>(0)?;
|
51
|
+
let n_features = first_row.len();
|
52
|
+
|
53
|
+
if n_features == 0 {
|
54
|
+
return Err(Error::new(
|
55
|
+
magnus::exception::arg_error(),
|
56
|
+
"Data rows cannot be empty",
|
57
|
+
));
|
58
|
+
}
|
59
|
+
|
60
|
+
// Create ndarray and populate
|
61
|
+
let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
|
62
|
+
for i in 0..n_samples {
|
63
|
+
let row: RArray = rarray.entry(i as isize)?;
|
64
|
+
|
65
|
+
// Validate row length consistency
|
66
|
+
if row.len() != n_features {
|
67
|
+
return Err(Error::new(
|
68
|
+
magnus::exception::arg_error(),
|
69
|
+
format!("Row {} has {} elements, expected {}", i, row.len(), n_features),
|
70
|
+
));
|
71
|
+
}
|
72
|
+
|
73
|
+
for j in 0..n_features {
|
74
|
+
let val: f64 = row.entry(j as isize)?;
|
75
|
+
data_array[[i, j]] = val;
|
76
|
+
}
|
77
|
+
}
|
78
|
+
|
79
|
+
Ok(data_array)
|
80
|
+
}
|
81
|
+
|
82
|
+
/// Convert Ruby 2D array to Vec<Vec<f64>>
|
83
|
+
/// Handles validation and provides consistent error messages
|
84
|
+
pub fn ruby_array_to_vec_vec_f64(data: Value) -> Result<Vec<Vec<f64>>, Error> {
|
85
|
+
let rarray: RArray = TryConvert::try_convert(data)?;
|
86
|
+
let n_samples = rarray.len();
|
87
|
+
|
88
|
+
if n_samples == 0 {
|
89
|
+
return Err(Error::new(
|
90
|
+
magnus::exception::arg_error(),
|
91
|
+
"Data cannot be empty",
|
92
|
+
));
|
93
|
+
}
|
94
|
+
|
95
|
+
let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
|
96
|
+
let mut expected_features: Option<usize> = None;
|
97
|
+
|
98
|
+
for i in 0..n_samples {
|
99
|
+
let row: RArray = rarray.entry(i as isize)?;
|
100
|
+
let n_features = row.len();
|
101
|
+
|
102
|
+
// Check row length consistency
|
103
|
+
match expected_features {
|
104
|
+
Some(expected) => {
|
105
|
+
if n_features != expected {
|
106
|
+
return Err(Error::new(
|
107
|
+
magnus::exception::arg_error(),
|
108
|
+
format!("Row {} has {} elements, expected {}", i, n_features, expected),
|
109
|
+
));
|
110
|
+
}
|
111
|
+
}
|
112
|
+
None => expected_features = Some(n_features),
|
113
|
+
}
|
114
|
+
|
115
|
+
let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
|
116
|
+
for j in 0..n_features {
|
117
|
+
let val: f64 = row.entry(j as isize)?;
|
118
|
+
row_vec.push(val);
|
119
|
+
}
|
120
|
+
data_vec.push(row_vec);
|
121
|
+
}
|
122
|
+
|
123
|
+
Ok(data_vec)
|
124
|
+
}
|
125
|
+
|
126
|
+
/// Convert Ruby 2D array to Vec<Vec<f32>>
|
127
|
+
/// For algorithms that require f32 precision (like UMAP)
|
128
|
+
pub fn ruby_array_to_vec_vec_f32(data: Value) -> Result<Vec<Vec<f32>>, Error> {
|
129
|
+
let rarray: RArray = TryConvert::try_convert(data)?;
|
130
|
+
let array_len = rarray.len();
|
131
|
+
|
132
|
+
if array_len == 0 {
|
133
|
+
return Err(Error::new(
|
134
|
+
magnus::exception::arg_error(),
|
135
|
+
"Input data cannot be empty",
|
136
|
+
));
|
137
|
+
}
|
138
|
+
|
139
|
+
let mut rust_data: Vec<Vec<f32>> = Vec::with_capacity(array_len);
|
140
|
+
|
141
|
+
for i in 0..array_len {
|
142
|
+
let row = rarray.entry::<Value>(i as isize)?;
|
143
|
+
let row_array = RArray::try_convert(row).map_err(|_| {
|
144
|
+
Error::new(
|
145
|
+
magnus::exception::type_error(),
|
146
|
+
"Expected array of arrays (2D array)",
|
147
|
+
)
|
148
|
+
})?;
|
149
|
+
|
150
|
+
let mut rust_row: Vec<f32> = Vec::new();
|
151
|
+
let row_len = row_array.len();
|
152
|
+
|
153
|
+
for j in 0..row_len {
|
154
|
+
let val = row_array.entry::<Value>(j as isize)?;
|
155
|
+
let float_val = if let Ok(f) = Float::try_convert(val) {
|
156
|
+
f.to_f64() as f32
|
157
|
+
} else if let Ok(i) = Integer::try_convert(val) {
|
158
|
+
i.to_i64()? as f32
|
159
|
+
} else {
|
160
|
+
return Err(Error::new(
|
161
|
+
magnus::exception::type_error(),
|
162
|
+
"All values must be numeric",
|
163
|
+
));
|
164
|
+
};
|
165
|
+
rust_row.push(float_val);
|
166
|
+
}
|
167
|
+
|
168
|
+
// Validate row length consistency
|
169
|
+
if !rust_data.is_empty() && rust_row.len() != rust_data[0].len() {
|
170
|
+
return Err(Error::new(
|
171
|
+
magnus::exception::arg_error(),
|
172
|
+
"All rows must have the same length",
|
173
|
+
));
|
174
|
+
}
|
175
|
+
|
176
|
+
rust_data.push(rust_row);
|
177
|
+
}
|
178
|
+
|
179
|
+
Ok(rust_data)
|
33
180
|
}
|
@@ -1,5 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require_relative '../data_validator'
|
4
|
+
|
3
5
|
module ClusterKit
|
4
6
|
module Clustering
|
5
7
|
# HDBSCAN clustering algorithm - matching KMeans API pattern
|
@@ -128,23 +130,8 @@ module ClusterKit
|
|
128
130
|
private
|
129
131
|
|
130
132
|
def validate_data(data)
|
131
|
-
#
|
132
|
-
|
133
|
-
raise ArgumentError, "Data cannot be empty" if data.empty?
|
134
|
-
raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
|
135
|
-
|
136
|
-
row_length = data.first.length
|
137
|
-
unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
|
138
|
-
raise ArgumentError, "All rows must have the same length"
|
139
|
-
end
|
140
|
-
|
141
|
-
data.each_with_index do |row, i|
|
142
|
-
row.each_with_index do |val, j|
|
143
|
-
unless val.is_a?(Numeric)
|
144
|
-
raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
|
145
|
-
end
|
146
|
-
end
|
147
|
-
end
|
133
|
+
# Use same validation as KMeans for consistency
|
134
|
+
DataValidator.validate_clustering(data, check_finite: false)
|
148
135
|
end
|
149
136
|
end
|
150
137
|
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require_relative 'clusterkit'
|
4
4
|
require_relative 'clustering/hdbscan'
|
5
|
+
require_relative 'data_validator'
|
5
6
|
|
6
7
|
module ClusterKit
|
7
8
|
# Module for clustering algorithms
|
@@ -28,11 +29,8 @@ module ClusterKit
|
|
28
29
|
def fit(data)
|
29
30
|
validate_data(data)
|
30
31
|
|
31
|
-
#
|
32
|
-
|
33
|
-
|
34
|
-
# Call Rust implementation
|
35
|
-
@labels, @centroids, @inertia = Clustering.kmeans_rust(data, @k, @max_iter)
|
32
|
+
# Call Rust implementation with optional seed
|
33
|
+
@labels, @centroids, @inertia = Clustering.kmeans_rust(data, @k, @max_iter, @random_seed)
|
36
34
|
@fitted = true
|
37
35
|
|
38
36
|
self
|
@@ -132,24 +130,7 @@ module ClusterKit
|
|
132
130
|
private
|
133
131
|
|
134
132
|
def validate_data(data)
|
135
|
-
|
136
|
-
raise ArgumentError, "Data cannot be empty" if data.empty?
|
137
|
-
raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
|
138
|
-
|
139
|
-
# Check all rows have same length
|
140
|
-
row_length = data.first.length
|
141
|
-
unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
|
142
|
-
raise ArgumentError, "All rows must have the same length"
|
143
|
-
end
|
144
|
-
|
145
|
-
# Check all values are numeric
|
146
|
-
data.each_with_index do |row, i|
|
147
|
-
row.each_with_index do |val, j|
|
148
|
-
unless val.is_a?(Numeric)
|
149
|
-
raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
133
|
+
DataValidator.validate_clustering(data, check_finite: false)
|
153
134
|
end
|
154
135
|
end
|
155
136
|
|
@@ -0,0 +1,132 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ClusterKit
|
4
|
+
# Shared data validation methods for all algorithms
|
5
|
+
module DataValidator
|
6
|
+
class << self
|
7
|
+
# Validate basic data structure and types
|
8
|
+
# @param data [Array] Data to validate
|
9
|
+
# @raise [ArgumentError] If data structure is invalid
|
10
|
+
def validate_basic_structure(data)
|
11
|
+
raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
|
12
|
+
raise ArgumentError, "Input cannot be empty" if data.empty?
|
13
|
+
|
14
|
+
first_row = data.first
|
15
|
+
raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
|
16
|
+
end
|
17
|
+
|
18
|
+
# Validate row consistency (all rows have same length)
|
19
|
+
# @param data [Array] 2D array to validate
|
20
|
+
# @raise [ArgumentError] If rows have different lengths
|
21
|
+
def validate_row_consistency(data)
|
22
|
+
row_length = data.first.length
|
23
|
+
|
24
|
+
data.each_with_index do |row, i|
|
25
|
+
unless row.is_a?(Array)
|
26
|
+
raise ArgumentError, "Row #{i} is not an array"
|
27
|
+
end
|
28
|
+
|
29
|
+
if row.length != row_length
|
30
|
+
raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Validate that all elements are numeric
|
36
|
+
# @param data [Array] 2D array to validate
|
37
|
+
# @raise [ArgumentError] If any element is not numeric
|
38
|
+
def validate_numeric_types(data)
|
39
|
+
data.each_with_index do |row, i|
|
40
|
+
row.each_with_index do |val, j|
|
41
|
+
unless val.is_a?(Numeric)
|
42
|
+
raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Validate finite values (no NaN or Infinite)
|
49
|
+
# @param data [Array] 2D array to validate
|
50
|
+
# @raise [ArgumentError] If any float is NaN or Infinite
|
51
|
+
def validate_finite_values(data)
|
52
|
+
data.each_with_index do |row, i|
|
53
|
+
row.each_with_index do |val, j|
|
54
|
+
# Only check for NaN/Infinite on floats
|
55
|
+
if val.is_a?(Float) && (val.nan? || val.infinite?)
|
56
|
+
raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
# Standard validation for most algorithms
|
63
|
+
# @param data [Array] 2D array to validate
|
64
|
+
# @param check_finite [Boolean] Whether to check for NaN/Infinite values
|
65
|
+
# @raise [ArgumentError] If data is invalid
|
66
|
+
def validate_standard(data, check_finite: true)
|
67
|
+
validate_basic_structure(data)
|
68
|
+
validate_row_consistency(data)
|
69
|
+
validate_numeric_types(data)
|
70
|
+
validate_finite_values(data) if check_finite
|
71
|
+
end
|
72
|
+
|
73
|
+
# Validation for clustering algorithms (KMeans, HDBSCAN) with specific error messages
|
74
|
+
# @param data [Array] 2D array to validate
|
75
|
+
# @param check_finite [Boolean] Whether to check for NaN/Infinite values
|
76
|
+
# @raise [ArgumentError] If data is invalid
|
77
|
+
def validate_clustering(data, check_finite: false)
|
78
|
+
raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
|
79
|
+
raise ArgumentError, "Data cannot be empty" if data.empty?
|
80
|
+
raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
|
81
|
+
|
82
|
+
validate_row_consistency(data)
|
83
|
+
validate_numeric_types(data)
|
84
|
+
validate_finite_values(data) if check_finite
|
85
|
+
end
|
86
|
+
|
87
|
+
# Validation for PCA with specific error messages (same as clustering but without finite checks)
|
88
|
+
# @param data [Array] 2D array to validate
|
89
|
+
# @raise [ArgumentError] If data is invalid
|
90
|
+
def validate_pca(data)
|
91
|
+
raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
|
92
|
+
raise ArgumentError, "Data cannot be empty" if data.empty?
|
93
|
+
raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
|
94
|
+
|
95
|
+
validate_row_consistency(data)
|
96
|
+
validate_numeric_types(data)
|
97
|
+
end
|
98
|
+
|
99
|
+
# Get data statistics for warnings/error context
|
100
|
+
# @param data [Array] 2D array
|
101
|
+
# @return [Hash] Statistics about the data
|
102
|
+
def data_statistics(data)
|
103
|
+
return { n_samples: 0, n_features: 0, data_range: 0.0 } if data.empty?
|
104
|
+
|
105
|
+
n_samples = data.size
|
106
|
+
n_features = data.first&.size || 0
|
107
|
+
|
108
|
+
# Calculate data range for warnings
|
109
|
+
min_val = Float::INFINITY
|
110
|
+
max_val = -Float::INFINITY
|
111
|
+
|
112
|
+
data.each do |row|
|
113
|
+
row.each do |val|
|
114
|
+
val_f = val.to_f
|
115
|
+
min_val = val_f if val_f < min_val
|
116
|
+
max_val = val_f if val_f > max_val
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
data_range = max_val - min_val
|
121
|
+
|
122
|
+
{
|
123
|
+
n_samples: n_samples,
|
124
|
+
n_features: n_features,
|
125
|
+
data_range: data_range,
|
126
|
+
min_value: min_val,
|
127
|
+
max_value: max_val
|
128
|
+
}
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require_relative '../clusterkit'
|
4
4
|
require_relative 'svd'
|
5
|
+
require_relative '../data_validator'
|
5
6
|
|
6
7
|
module ClusterKit
|
7
8
|
module Dimensionality
|
@@ -30,7 +31,7 @@ module ClusterKit
|
|
30
31
|
|
31
32
|
# Perform SVD on centered data
|
32
33
|
# U contains the transformed data, S contains singular values, VT contains components
|
33
|
-
u, s, vt =
|
34
|
+
u, s, vt = perform_svd(centered_data)
|
34
35
|
|
35
36
|
# Store the principal components (eigenvectors)
|
36
37
|
@components = vt # Shape: (n_components, n_features)
|
@@ -76,7 +77,7 @@ module ClusterKit
|
|
76
77
|
centered_data = center_data(data, @mean)
|
77
78
|
|
78
79
|
# Perform SVD on centered data
|
79
|
-
u, s, vt =
|
80
|
+
u, s, vt = perform_svd(centered_data)
|
80
81
|
|
81
82
|
# Store the principal components (eigenvectors)
|
82
83
|
@components = vt
|
@@ -166,17 +167,10 @@ module ClusterKit
|
|
166
167
|
private
|
167
168
|
|
168
169
|
def validate_data(data)
|
169
|
-
|
170
|
-
|
171
|
-
raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
|
172
|
-
|
173
|
-
# Check all rows have same length
|
174
|
-
row_length = data.first.length
|
175
|
-
unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
|
176
|
-
raise ArgumentError, "All rows must have the same length"
|
177
|
-
end
|
170
|
+
# Use shared validation for common checks
|
171
|
+
DataValidator.validate_pca(data)
|
178
172
|
|
179
|
-
#
|
173
|
+
# PCA-specific validations
|
180
174
|
if data.size < @n_components
|
181
175
|
raise ArgumentError, "n_components (#{@n_components}) cannot be larger than n_samples (#{data.size})"
|
182
176
|
end
|
@@ -237,6 +231,12 @@ module ClusterKit
|
|
237
231
|
|
238
232
|
transformed
|
239
233
|
end
|
234
|
+
|
235
|
+
# Shared SVD computation for both fit and fit_transform
|
236
|
+
# Ensures both methods use identical SVD invocation and parameters
|
237
|
+
def perform_svd(centered_data)
|
238
|
+
SVD.randomized_svd(centered_data, @n_components, n_iter: 5)
|
239
|
+
end
|
240
240
|
end
|
241
241
|
|
242
242
|
# Module-level convenience method
|
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative '../clusterkit'
|
4
|
+
require_relative '../data_validator'
|
4
5
|
|
5
6
|
module ClusterKit
|
6
7
|
module Dimensionality
|
@@ -8,7 +9,7 @@ module ClusterKit
|
|
8
9
|
# Decomposes a matrix into U, S, V^T components
|
9
10
|
class SVD
|
10
11
|
attr_reader :n_components, :n_iter, :random_seed
|
11
|
-
attr_reader :u, :s, :vt
|
12
|
+
attr_reader :u, :s, :vt, :n_features
|
12
13
|
|
13
14
|
# Initialize a new SVD instance
|
14
15
|
# @param n_components [Integer] Number of components to compute
|
@@ -27,7 +28,8 @@ module ClusterKit
|
|
27
28
|
def fit_transform(data)
|
28
29
|
validate_input(data)
|
29
30
|
|
30
|
-
# Store
|
31
|
+
# Store data characteristics for later transform operations
|
32
|
+
@n_features = data.first.size
|
31
33
|
@original_data_id = data.object_id
|
32
34
|
|
33
35
|
# Determine n_components if not set
|
@@ -77,26 +79,21 @@ module ClusterKit
|
|
77
79
|
|
78
80
|
# Transform data using fitted SVD (project onto components)
|
79
81
|
# @param data [Array<Array<Numeric>>] Data to transform
|
80
|
-
# @return [Array<Array<Float>>] Transformed data
|
82
|
+
# @return [Array<Array<Float>>] Transformed data projected onto SVD components
|
81
83
|
def transform(data)
|
82
84
|
raise RuntimeError, "Model must be fitted first" unless fitted?
|
83
|
-
|
84
|
-
|
85
|
-
# For SVD, transform typically means projecting onto the components
|
86
|
-
# This is equivalent to data * V (or data * V^T.T)
|
87
|
-
# But for dimensionality reduction, we usually want U * S
|
88
|
-
# which is already computed in fit_transform
|
85
|
+
validate_transform_input(data)
|
89
86
|
|
90
|
-
# If transforming new data, we'd need to project it
|
91
|
-
# For now, return U * S for the fitted data
|
92
87
|
if data.object_id == @original_data_id
|
93
88
|
# Same data that was fitted - return U * S
|
94
89
|
@u.map.with_index do |row, i|
|
95
90
|
row.map.with_index { |val, j| val * @s[j] }
|
96
91
|
end
|
97
92
|
else
|
98
|
-
# New data -
|
99
|
-
|
93
|
+
# New data - project onto V components: data × V
|
94
|
+
# Since we have V^T, we need to transpose it back to V
|
95
|
+
# V = V^T^T, so we project: data × V^T^T
|
96
|
+
transform_new_data(data)
|
100
97
|
end
|
101
98
|
end
|
102
99
|
|
@@ -135,9 +132,43 @@ module ClusterKit
|
|
135
132
|
private
|
136
133
|
|
137
134
|
def validate_input(data)
|
138
|
-
|
139
|
-
|
140
|
-
|
135
|
+
DataValidator.validate_standard(data, check_finite: false)
|
136
|
+
end
|
137
|
+
|
138
|
+
def validate_transform_input(data)
|
139
|
+
DataValidator.validate_standard(data, check_finite: false)
|
140
|
+
|
141
|
+
# Check feature count matches training data
|
142
|
+
if data.first.size != @n_features
|
143
|
+
raise ArgumentError, "New data has #{data.first.size} features, but model was fitted with #{@n_features} features"
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
# Transform new data by projecting onto V components
|
148
|
+
# Mathematical operation: new_data × V, where V = V^T^T
|
149
|
+
def transform_new_data(data)
|
150
|
+
# V^T is stored as @vt (shape: n_components × n_features)
|
151
|
+
# We need V (shape: n_features × n_components)
|
152
|
+
# V = V^T^T, so we transpose @vt
|
153
|
+
|
154
|
+
result = []
|
155
|
+
data.each do |sample|
|
156
|
+
# Project sample onto each component (column of V = row of V^T)
|
157
|
+
projected = Array.new(@vt.size, 0.0)
|
158
|
+
|
159
|
+
@vt.each_with_index do |vt_row, comp_idx|
|
160
|
+
# Dot product: sample · vt_row (this is sample · V[:, comp_idx])
|
161
|
+
dot_product = 0.0
|
162
|
+
sample.each_with_index do |val, feat_idx|
|
163
|
+
dot_product += val * vt_row[feat_idx]
|
164
|
+
end
|
165
|
+
projected[comp_idx] = dot_product
|
166
|
+
end
|
167
|
+
|
168
|
+
result << projected
|
169
|
+
end
|
170
|
+
|
171
|
+
result
|
141
172
|
end
|
142
173
|
end
|
143
174
|
end
|
@@ -4,6 +4,7 @@ require 'fileutils'
|
|
4
4
|
require 'json'
|
5
5
|
require_relative '../configuration'
|
6
6
|
require_relative '../silence'
|
7
|
+
require_relative '../data_validator'
|
7
8
|
|
8
9
|
module ClusterKit
|
9
10
|
module Dimensionality
|
@@ -224,44 +225,10 @@ module ClusterKit
|
|
224
225
|
end
|
225
226
|
|
226
227
|
def validate_input(data, check_min_samples: true)
|
227
|
-
|
228
|
-
|
228
|
+
# Use shared validation for common checks
|
229
|
+
DataValidator.validate_standard(data)
|
229
230
|
|
230
|
-
|
231
|
-
raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
|
232
|
-
|
233
|
-
row_length = first_row.length
|
234
|
-
min_val = Float::INFINITY
|
235
|
-
max_val = -Float::INFINITY
|
236
|
-
|
237
|
-
# First validate data structure and types
|
238
|
-
data.each_with_index do |row, i|
|
239
|
-
unless row.is_a?(Array)
|
240
|
-
raise ArgumentError, "Row #{i} is not an array"
|
241
|
-
end
|
242
|
-
|
243
|
-
if row.length != row_length
|
244
|
-
raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
|
245
|
-
end
|
246
|
-
|
247
|
-
row.each_with_index do |val, j|
|
248
|
-
unless val.is_a?(Numeric)
|
249
|
-
raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
|
250
|
-
end
|
251
|
-
|
252
|
-
# Only check for NaN/Infinite on floats
|
253
|
-
if val.is_a?(Float) && (val.nan? || val.infinite?)
|
254
|
-
raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
|
255
|
-
end
|
256
|
-
|
257
|
-
# Track data range
|
258
|
-
val_f = val.to_f
|
259
|
-
min_val = val_f if val_f < min_val
|
260
|
-
max_val = val_f if val_f > max_val
|
261
|
-
end
|
262
|
-
end
|
263
|
-
|
264
|
-
# Check for sufficient data points after validating structure (only for fit operations)
|
231
|
+
# UMAP-specific validations
|
265
232
|
if check_min_samples && data.size < 10
|
266
233
|
raise ::ClusterKit::InsufficientDataError, <<~MSG
|
267
234
|
UMAP requires at least 10 data points, but only #{data.size} provided.
|
@@ -274,9 +241,9 @@ module ClusterKit
|
|
274
241
|
end
|
275
242
|
|
276
243
|
# Check for extreme data ranges that might cause numerical issues
|
277
|
-
|
278
|
-
if data_range > 1000
|
279
|
-
warn "WARNING: Large data range detected (#{data_range.round(2)}). Consider normalizing your data to prevent numerical instability."
|
244
|
+
stats = DataValidator.data_statistics(data)
|
245
|
+
if stats[:data_range] > 1000
|
246
|
+
warn "WARNING: Large data range detected (#{stats[:data_range].round(2)}). Consider normalizing your data to prevent numerical instability."
|
280
247
|
end
|
281
248
|
end
|
282
249
|
|
data/lib/clusterkit/version.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: clusterkit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Petersen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-09-
|
11
|
+
date: 2025-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rb_sys
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.9'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.9'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: csv
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,20 +66,6 @@ dependencies:
|
|
52
66
|
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
68
|
version: '1.2'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: rb_sys
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - "~>"
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0.9'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - "~>"
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0.9'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -159,6 +159,7 @@ files:
|
|
159
159
|
- lib/clusterkit/clustering/hdbscan.rb
|
160
160
|
- lib/clusterkit/clusterkit.rb
|
161
161
|
- lib/clusterkit/configuration.rb
|
162
|
+
- lib/clusterkit/data_validator.rb
|
162
163
|
- lib/clusterkit/dimensionality.rb
|
163
164
|
- lib/clusterkit/dimensionality/pca.rb
|
164
165
|
- lib/clusterkit/dimensionality/svd.rb
|