clusterkit 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ebad40c2aac3fa3569357eedf740336a3d463ecc5c038d2771d3fd266d414b1e
4
- data.tar.gz: 0ab2851e0adab583567460e469d4073e755514854e57bca464d5a835c1534bea
3
+ metadata.gz: ed8756bb9b2d2cf253c849f529b18aa57c0043fe0e0b65997fffc62522d21fd6
4
+ data.tar.gz: adcb4a5186e042a0be6d9e712b8dbff5751c469295ccbef9034bf40688658c21
5
5
  SHA512:
6
- metadata.gz: 16412a2db10bf55593c778c7a02813b8db9652e39479d17fa3cd10da1e87298d5d95f4afccd8f5da10bb89bc950b314faeb9a8f0149ecd2dd3c3305121c1b5b2
7
- data.tar.gz: edf73f2d0ce8f73441c07975a5cd4da31faaece9b216d908a743b9c1bd3f2d8e213d1b85a7e82f2f397c60f8d2fd2c4a0109a5a337963e008ecf069b3fa40266
6
+ metadata.gz: 3f16b15661cf7db9a42f080dc5cb8835ded9f36518f0f3e095ea50bf7c93555906a1b3233744b11551cb2d45c1b3a9b1d2e5a64051d770b277e6a0669e0698b2
7
+ data.tar.gz: 36e4df00fbfb48b44bdcacbcb29e9a077c5711cc5cbc79d0ae9091c604db489bea06fa1c354ad89a87aa50e5c7dc8fe2b75f426e1942ae01d46887dda72b5ab1
data/clusterkit.gemspec CHANGED
@@ -30,16 +30,16 @@ Gem::Specification.new do |spec|
30
30
  # Runtime dependencies
31
31
  # Numo is optional but recommended for better performance
32
32
  # spec.add_dependency "numo-narray", "~> 0.9"
33
+ spec.add_dependency "rb_sys", "~> 0.9"
33
34
 
34
35
  # Development dependencies
35
36
  spec.add_development_dependency "csv"
36
37
  spec.add_development_dependency "rake", "~> 13.0"
37
38
  spec.add_development_dependency "rake-compiler", "~> 1.2"
38
- spec.add_development_dependency "rb_sys", "~> 0.9"
39
39
  spec.add_development_dependency "rspec", "~> 3.0"
40
40
  spec.add_development_dependency "simplecov", "~> 0.22"
41
41
  spec.add_development_dependency "yard", "~> 0.9"
42
42
 
43
43
  # For more information and examples about making a new gem, check out our
44
44
  # guide at: https://bundler.io/guides/creating_gem.html
45
- end
45
+ end
@@ -1,5 +1,6 @@
1
- use magnus::{function, prelude::*, Error, Value, RArray, RHash, Integer, TryConvert};
1
+ use magnus::{function, prelude::*, Error, Value, RArray, RHash, Integer};
2
2
  use hdbscan::{Hdbscan, HdbscanHyperParams};
3
+ use crate::utils::ruby_array_to_vec_vec_f64;
3
4
 
4
5
  /// Perform HDBSCAN clustering
5
6
  /// Returns a hash with labels and basic statistics
@@ -9,32 +10,9 @@ pub fn hdbscan_fit(
9
10
  min_cluster_size: usize,
10
11
  metric: String,
11
12
  ) -> Result<RHash, Error> {
12
- // Convert Ruby array to ndarray
13
- let rarray: RArray = TryConvert::try_convert(data)?;
14
- let n_samples = rarray.len();
15
-
16
- if n_samples == 0 {
17
- return Err(Error::new(
18
- magnus::exception::arg_error(),
19
- "Data cannot be empty",
20
- ));
21
- }
22
-
23
- // Get dimensions
24
- let first_row: RArray = rarray.entry::<RArray>(0)?;
25
- let n_features = first_row.len();
26
-
27
- // Convert to Vec<Vec<f64>> format expected by hdbscan crate
28
- let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
29
- for i in 0..n_samples {
30
- let row: RArray = rarray.entry(i as isize)?;
31
- let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
32
- for j in 0..n_features {
33
- let val: f64 = row.entry(j as isize)?;
34
- row_vec.push(val);
35
- }
36
- data_vec.push(row_vec);
37
- }
13
+ // Convert Ruby array to Vec<Vec<f64>> using shared helper
14
+ let data_vec = ruby_array_to_vec_vec_f64(data)?;
15
+ let n_samples = data_vec.len();
38
16
 
39
17
  // Note: hdbscan crate doesn't support custom metrics directly
40
18
  // We'll use the default Euclidean distance for now
@@ -1,6 +1,9 @@
1
- use magnus::{function, prelude::*, Error, Value, RArray, Integer, TryConvert};
1
+ use magnus::{function, prelude::*, Error, Value, RArray, Integer};
2
2
  use ndarray::{Array1, Array2, ArrayView1, Axis};
3
3
  use rand::prelude::*;
4
+ use rand::rngs::StdRng;
5
+ use rand::SeedableRng;
6
+ use crate::utils::{ruby_array_to_ndarray2};
4
7
 
5
8
  mod hdbscan_wrapper;
6
9
 
@@ -9,7 +12,7 @@ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
9
12
 
10
13
  clustering_module.define_singleton_method(
11
14
  "kmeans_rust",
12
- function!(kmeans, 3),
15
+ function!(kmeans, 4),
13
16
  )?;
14
17
 
15
18
  clustering_module.define_singleton_method(
@@ -25,21 +28,10 @@ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
25
28
 
26
29
  /// Perform K-means clustering
27
30
  /// Returns (labels, centroids, inertia)
28
- fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64), Error> {
29
- // Convert Ruby array to ndarray
30
- let rarray: RArray = TryConvert::try_convert(data)?;
31
- let n_samples = rarray.len();
32
-
33
- if n_samples == 0 {
34
- return Err(Error::new(
35
- magnus::exception::arg_error(),
36
- "Data cannot be empty",
37
- ));
38
- }
39
-
40
- // Get dimensions
41
- let first_row: RArray = rarray.entry::<RArray>(0)?;
42
- let n_features = first_row.len();
31
+ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> Result<(RArray, RArray, f64), Error> {
32
+ // Convert Ruby array to ndarray using shared helper
33
+ let data_array = ruby_array_to_ndarray2(data)?;
34
+ let (n_samples, n_features) = data_array.dim();
43
35
 
44
36
  if k > n_samples {
45
37
  return Err(Error::new(
@@ -48,18 +40,8 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
48
40
  ));
49
41
  }
50
42
 
51
- // Convert to ndarray
52
- let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
53
- for i in 0..n_samples {
54
- let row: RArray = rarray.entry(i as isize)?;
55
- for j in 0..n_features {
56
- let val: f64 = row.entry(j as isize)?;
57
- data_array[[i, j]] = val;
58
- }
59
- }
60
-
61
43
  // Initialize centroids using K-means++
62
- let mut centroids = kmeans_plusplus(&data_array, k)?;
44
+ let mut centroids = kmeans_plusplus(&data_array, k, random_seed)?;
63
45
  let mut labels = vec![0usize; n_samples];
64
46
  let mut prev_labels = vec![0usize; n_samples];
65
47
 
@@ -140,43 +122,12 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
140
122
 
141
123
  /// Predict cluster labels for new data given centroids
142
124
  fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
143
- // Convert inputs
144
- let data_array: RArray = TryConvert::try_convert(data)?;
145
- let centroids_array: RArray = TryConvert::try_convert(centroids)?;
146
-
147
- let n_samples = data_array.len();
148
- let k = centroids_array.len();
149
-
150
- if n_samples == 0 {
151
- return Err(Error::new(
152
- magnus::exception::arg_error(),
153
- "Data cannot be empty",
154
- ));
155
- }
156
-
157
- // Get dimensions
158
- let first_row: RArray = data_array.entry::<RArray>(0)?;
159
- let n_features = first_row.len();
160
-
161
- // Convert data to ndarray
162
- let mut data_matrix = Array2::<f64>::zeros((n_samples, n_features));
163
- for i in 0..n_samples {
164
- let row: RArray = data_array.entry(i as isize)?;
165
- for j in 0..n_features {
166
- let val: f64 = row.entry(j as isize)?;
167
- data_matrix[[i, j]] = val;
168
- }
169
- }
125
+ // Convert inputs using shared helpers
126
+ let data_matrix = ruby_array_to_ndarray2(data)?;
127
+ let centroids_matrix = ruby_array_to_ndarray2(centroids)?;
170
128
 
171
- // Convert centroids to ndarray
172
- let mut centroids_matrix = Array2::<f64>::zeros((k, n_features));
173
- for i in 0..k {
174
- let row: RArray = centroids_array.entry(i as isize)?;
175
- for j in 0..n_features {
176
- let val: f64 = row.entry(j as isize)?;
177
- centroids_matrix[[i, j]] = val;
178
- }
179
- }
129
+ let (n_samples, _) = data_matrix.dim();
130
+ let (_k, _) = centroids_matrix.dim();
180
131
 
181
132
  // Predict labels
182
133
  let ruby = magnus::Ruby::get().unwrap();
@@ -202,10 +153,19 @@ fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
202
153
  }
203
154
 
204
155
  /// K-means++ initialization
205
- fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
156
+ fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Result<Array2<f64>, Error> {
206
157
  let n_samples = data.nrows();
207
158
  let n_features = data.ncols();
208
- let mut rng = thread_rng();
159
+
160
+ // Use seeded RNG if seed is provided, otherwise use thread_rng
161
+ let mut rng: Box<dyn RngCore> = match random_seed {
162
+ Some(seed) => {
163
+ // Convert i64 to u64 for seeding (negative numbers wrap around)
164
+ let seed_u64 = seed as u64;
165
+ Box::new(StdRng::seed_from_u64(seed_u64))
166
+ },
167
+ None => Box::new(thread_rng()),
168
+ };
209
169
 
210
170
  let mut centroids = Array2::<f64>::zeros((k, n_features));
211
171
 
@@ -1,4 +1,4 @@
1
- use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Float, Module, Object};
1
+ use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object};
2
2
  use magnus::value::ReprValue;
3
3
  use hnsw_rs::prelude::*;
4
4
  use annembed::prelude::*;
@@ -7,6 +7,7 @@ use std::io::{Write, Read};
7
7
  use std::cell::RefCell;
8
8
  use bincode;
9
9
  use serde::{Serialize, Deserialize};
10
+ use crate::utils::ruby_array_to_vec_vec_f32;
10
11
 
11
12
  // Simple struct to serialize UMAP results
12
13
  #[derive(Serialize, Deserialize)]
@@ -124,61 +125,8 @@ impl RustUMAP {
124
125
  }
125
126
 
126
127
  fn fit_transform(&self, data: Value) -> Result<RArray, Error> {
127
- // Convert Ruby array to Rust Vec<Vec<f64>>
128
- let ruby_array = RArray::try_convert(data)?;
129
- let mut rust_data: Vec<Vec<f64>> = Vec::new();
130
-
131
- // Get array length
132
- let array_len = ruby_array.len();
133
-
134
- for i in 0..array_len {
135
- let row = ruby_array.entry::<Value>(i as isize)?;
136
- let row_array = RArray::try_convert(row).map_err(|_| {
137
- Error::new(
138
- magnus::exception::type_error(),
139
- "Expected array of arrays (2D array)",
140
- )
141
- })?;
142
-
143
- let mut rust_row: Vec<f64> = Vec::new();
144
- let row_len = row_array.len();
145
-
146
- for j in 0..row_len {
147
- let val = row_array.entry::<Value>(j as isize)?;
148
- let float_val = if let Ok(f) = Float::try_convert(val) {
149
- f.to_f64()
150
- } else if let Ok(i) = Integer::try_convert(val) {
151
- i.to_i64()? as f64
152
- } else {
153
- return Err(Error::new(
154
- magnus::exception::type_error(),
155
- "All values must be numeric",
156
- ));
157
- };
158
- rust_row.push(float_val);
159
- }
160
-
161
- if !rust_data.is_empty() && rust_row.len() != rust_data[0].len() {
162
- return Err(Error::new(
163
- magnus::exception::arg_error(),
164
- "All rows must have the same length",
165
- ));
166
- }
167
-
168
- rust_data.push(rust_row);
169
- }
170
-
171
- if rust_data.is_empty() {
172
- return Err(Error::new(
173
- magnus::exception::arg_error(),
174
- "Input data cannot be empty",
175
- ));
176
- }
177
-
178
- // Convert to Vec<Vec<f32>> for HNSW
179
- let data_f32: Vec<Vec<f32>> = rust_data.iter()
180
- .map(|row| row.iter().map(|&x| x as f32).collect())
181
- .collect();
128
+ // Convert Ruby array to Rust Vec<Vec<f32>> using shared helper
129
+ let data_f32 = ruby_array_to_vec_vec_f32(data)?;
182
130
 
183
131
  // Build HNSW graph
184
132
  let ef_c = 50;
@@ -331,31 +279,8 @@ impl RustUMAP {
331
279
  let training_embeddings_ref = training_embeddings.as_ref()
332
280
  .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "No embeddings available."))?;
333
281
 
334
- // Convert input data to Rust format
335
- let ruby_array = RArray::try_convert(data)?;
336
- let mut new_data: Vec<Vec<f32>> = Vec::new();
337
-
338
- for i in 0..ruby_array.len() {
339
- let row = ruby_array.entry::<Value>(i as isize)?;
340
- let row_array = RArray::try_convert(row)?;
341
- let mut rust_row: Vec<f32> = Vec::new();
342
-
343
- for j in 0..row_array.len() {
344
- let val = row_array.entry::<Value>(j as isize)?;
345
- let float_val = if let Ok(f) = Float::try_convert(val) {
346
- f.to_f64() as f32
347
- } else if let Ok(i) = Integer::try_convert(val) {
348
- i.to_i64()? as f32
349
- } else {
350
- return Err(Error::new(
351
- magnus::exception::type_error(),
352
- "All values must be numeric",
353
- ));
354
- };
355
- rust_row.push(float_val);
356
- }
357
- new_data.push(rust_row);
358
- }
282
+ // Convert input data to Rust format using shared helper
283
+ let new_data = ruby_array_to_vec_vec_f32(data)?;
359
284
 
360
285
  // For each new point, find k nearest neighbors in training data
361
286
  // and average their embeddings (weighted by distance)
@@ -1,6 +1,6 @@
1
- use magnus::{function, prelude::*, Error, Value, RArray, TryConvert};
1
+ use magnus::{function, prelude::*, Error, Value, RArray};
2
2
  use annembed::tools::svdapprox::{SvdApprox, RangeApproxMode, RangeRank, MatRepr};
3
- use ndarray::Array2;
3
+ use crate::utils::ruby_array_to_ndarray2;
4
4
 
5
5
  pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
6
6
  let svd_module = parent.define_module("SVD")?;
@@ -14,20 +14,9 @@ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
14
14
  }
15
15
 
16
16
  fn randomized_svd(matrix: Value, k: usize, n_iter: usize) -> Result<RArray, Error> {
17
- // Convert Ruby array to ndarray
18
- let rarray: RArray = TryConvert::try_convert(matrix)?;
19
-
20
- // Check if it's a 2D array
21
- let first_row: RArray = rarray.entry::<RArray>(0)?;
22
- let n_rows = rarray.len();
23
- let n_cols = first_row.len();
24
-
25
- if n_rows == 0 || n_cols == 0 {
26
- return Err(Error::new(
27
- magnus::exception::arg_error(),
28
- "Matrix cannot be empty",
29
- ));
30
- }
17
+ // Convert Ruby array to ndarray using shared helper
18
+ let matrix_data = ruby_array_to_ndarray2(matrix)?;
19
+ let (n_rows, n_cols) = matrix_data.dim();
31
20
 
32
21
  if k > n_rows.min(n_cols) {
33
22
  return Err(Error::new(
@@ -36,16 +25,6 @@ fn randomized_svd(matrix: Value, k: usize, n_iter: usize) -> Result<RArray, Erro
36
25
  ));
37
26
  }
38
27
 
39
- // Convert to ndarray Array2
40
- let mut matrix_data = Array2::<f64>::zeros((n_rows, n_cols));
41
- for i in 0..n_rows {
42
- let row: RArray = rarray.entry(i as isize)?;
43
- for j in 0..n_cols {
44
- let val: f64 = row.entry(j as isize)?;
45
- matrix_data[[i, j]] = val;
46
- }
47
- }
48
-
49
28
  // Create MatRepr for the full matrix
50
29
  let mat_repr = MatRepr::from_array2(matrix_data.clone());
51
30
 
@@ -1,4 +1,5 @@
1
- use magnus::{function, prelude::*, Error, Value};
1
+ use magnus::{function, prelude::*, Error, Value, RArray, TryConvert, Float, Integer};
2
+ use ndarray::Array2;
2
3
 
3
4
  pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
4
5
  let utils_module = parent.define_module("Utils")?;
@@ -30,4 +31,150 @@ fn estimate_hubness(_data: Value) -> Result<Value, Error> {
30
31
  magnus::exception::not_imp_error(),
31
32
  "Hubness estimation not implemented yet",
32
33
  ))
34
+ }
35
+
36
+ /// Convert Ruby 2D array to ndarray Array2<f64>
37
+ /// Handles validation and provides consistent error messages
38
+ pub fn ruby_array_to_ndarray2(data: Value) -> Result<Array2<f64>, Error> {
39
+ let rarray: RArray = TryConvert::try_convert(data)?;
40
+ let n_samples = rarray.len();
41
+
42
+ if n_samples == 0 {
43
+ return Err(Error::new(
44
+ magnus::exception::arg_error(),
45
+ "Data cannot be empty",
46
+ ));
47
+ }
48
+
49
+ // Get dimensions from first row
50
+ let first_row: RArray = rarray.entry::<RArray>(0)?;
51
+ let n_features = first_row.len();
52
+
53
+ if n_features == 0 {
54
+ return Err(Error::new(
55
+ magnus::exception::arg_error(),
56
+ "Data rows cannot be empty",
57
+ ));
58
+ }
59
+
60
+ // Create ndarray and populate
61
+ let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
62
+ for i in 0..n_samples {
63
+ let row: RArray = rarray.entry(i as isize)?;
64
+
65
+ // Validate row length consistency
66
+ if row.len() != n_features {
67
+ return Err(Error::new(
68
+ magnus::exception::arg_error(),
69
+ format!("Row {} has {} elements, expected {}", i, row.len(), n_features),
70
+ ));
71
+ }
72
+
73
+ for j in 0..n_features {
74
+ let val: f64 = row.entry(j as isize)?;
75
+ data_array[[i, j]] = val;
76
+ }
77
+ }
78
+
79
+ Ok(data_array)
80
+ }
81
+
82
+ /// Convert Ruby 2D array to Vec<Vec<f64>>
83
+ /// Handles validation and provides consistent error messages
84
+ pub fn ruby_array_to_vec_vec_f64(data: Value) -> Result<Vec<Vec<f64>>, Error> {
85
+ let rarray: RArray = TryConvert::try_convert(data)?;
86
+ let n_samples = rarray.len();
87
+
88
+ if n_samples == 0 {
89
+ return Err(Error::new(
90
+ magnus::exception::arg_error(),
91
+ "Data cannot be empty",
92
+ ));
93
+ }
94
+
95
+ let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
96
+ let mut expected_features: Option<usize> = None;
97
+
98
+ for i in 0..n_samples {
99
+ let row: RArray = rarray.entry(i as isize)?;
100
+ let n_features = row.len();
101
+
102
+ // Check row length consistency
103
+ match expected_features {
104
+ Some(expected) => {
105
+ if n_features != expected {
106
+ return Err(Error::new(
107
+ magnus::exception::arg_error(),
108
+ format!("Row {} has {} elements, expected {}", i, n_features, expected),
109
+ ));
110
+ }
111
+ }
112
+ None => expected_features = Some(n_features),
113
+ }
114
+
115
+ let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
116
+ for j in 0..n_features {
117
+ let val: f64 = row.entry(j as isize)?;
118
+ row_vec.push(val);
119
+ }
120
+ data_vec.push(row_vec);
121
+ }
122
+
123
+ Ok(data_vec)
124
+ }
125
+
126
+ /// Convert Ruby 2D array to Vec<Vec<f32>>
127
+ /// For algorithms that require f32 precision (like UMAP)
128
+ pub fn ruby_array_to_vec_vec_f32(data: Value) -> Result<Vec<Vec<f32>>, Error> {
129
+ let rarray: RArray = TryConvert::try_convert(data)?;
130
+ let array_len = rarray.len();
131
+
132
+ if array_len == 0 {
133
+ return Err(Error::new(
134
+ magnus::exception::arg_error(),
135
+ "Input data cannot be empty",
136
+ ));
137
+ }
138
+
139
+ let mut rust_data: Vec<Vec<f32>> = Vec::with_capacity(array_len);
140
+
141
+ for i in 0..array_len {
142
+ let row = rarray.entry::<Value>(i as isize)?;
143
+ let row_array = RArray::try_convert(row).map_err(|_| {
144
+ Error::new(
145
+ magnus::exception::type_error(),
146
+ "Expected array of arrays (2D array)",
147
+ )
148
+ })?;
149
+
150
+ let mut rust_row: Vec<f32> = Vec::new();
151
+ let row_len = row_array.len();
152
+
153
+ for j in 0..row_len {
154
+ let val = row_array.entry::<Value>(j as isize)?;
155
+ let float_val = if let Ok(f) = Float::try_convert(val) {
156
+ f.to_f64() as f32
157
+ } else if let Ok(i) = Integer::try_convert(val) {
158
+ i.to_i64()? as f32
159
+ } else {
160
+ return Err(Error::new(
161
+ magnus::exception::type_error(),
162
+ "All values must be numeric",
163
+ ));
164
+ };
165
+ rust_row.push(float_val);
166
+ }
167
+
168
+ // Validate row length consistency
169
+ if !rust_data.is_empty() && rust_row.len() != rust_data[0].len() {
170
+ return Err(Error::new(
171
+ magnus::exception::arg_error(),
172
+ "All rows must have the same length",
173
+ ));
174
+ }
175
+
176
+ rust_data.push(rust_row);
177
+ }
178
+
179
+ Ok(rust_data)
33
180
  }
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative '../data_validator'
4
+
3
5
  module ClusterKit
4
6
  module Clustering
5
7
  # HDBSCAN clustering algorithm - matching KMeans API pattern
@@ -128,23 +130,8 @@ module ClusterKit
128
130
  private
129
131
 
130
132
  def validate_data(data)
131
- # Exact same validation as KMeans for consistency
132
- raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
133
- raise ArgumentError, "Data cannot be empty" if data.empty?
134
- raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
135
-
136
- row_length = data.first.length
137
- unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
138
- raise ArgumentError, "All rows must have the same length"
139
- end
140
-
141
- data.each_with_index do |row, i|
142
- row.each_with_index do |val, j|
143
- unless val.is_a?(Numeric)
144
- raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
145
- end
146
- end
147
- end
133
+ # Use same validation as KMeans for consistency
134
+ DataValidator.validate_clustering(data, check_finite: false)
148
135
  end
149
136
  end
150
137
 
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative 'clusterkit'
4
4
  require_relative 'clustering/hdbscan'
5
+ require_relative 'data_validator'
5
6
 
6
7
  module ClusterKit
7
8
  # Module for clustering algorithms
@@ -28,11 +29,8 @@ module ClusterKit
28
29
  def fit(data)
29
30
  validate_data(data)
30
31
 
31
- # Set random seed if provided
32
- srand(@random_seed) if @random_seed
33
-
34
- # Call Rust implementation
35
- @labels, @centroids, @inertia = Clustering.kmeans_rust(data, @k, @max_iter)
32
+ # Call Rust implementation with optional seed
33
+ @labels, @centroids, @inertia = Clustering.kmeans_rust(data, @k, @max_iter, @random_seed)
36
34
  @fitted = true
37
35
 
38
36
  self
@@ -132,24 +130,7 @@ module ClusterKit
132
130
  private
133
131
 
134
132
  def validate_data(data)
135
- raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
136
- raise ArgumentError, "Data cannot be empty" if data.empty?
137
- raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
138
-
139
- # Check all rows have same length
140
- row_length = data.first.length
141
- unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
142
- raise ArgumentError, "All rows must have the same length"
143
- end
144
-
145
- # Check all values are numeric
146
- data.each_with_index do |row, i|
147
- row.each_with_index do |val, j|
148
- unless val.is_a?(Numeric)
149
- raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
150
- end
151
- end
152
- end
133
+ DataValidator.validate_clustering(data, check_finite: false)
153
134
  end
154
135
  end
155
136
 
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ClusterKit
4
+ # Shared data validation methods for all algorithms
5
+ module DataValidator
6
+ class << self
7
+ # Validate basic data structure and types
8
+ # @param data [Array] Data to validate
9
+ # @raise [ArgumentError] If data structure is invalid
10
+ def validate_basic_structure(data)
11
+ raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
12
+ raise ArgumentError, "Input cannot be empty" if data.empty?
13
+
14
+ first_row = data.first
15
+ raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
16
+ end
17
+
18
+ # Validate row consistency (all rows have same length)
19
+ # @param data [Array] 2D array to validate
20
+ # @raise [ArgumentError] If rows have different lengths
21
+ def validate_row_consistency(data)
22
+ row_length = data.first.length
23
+
24
+ data.each_with_index do |row, i|
25
+ unless row.is_a?(Array)
26
+ raise ArgumentError, "Row #{i} is not an array"
27
+ end
28
+
29
+ if row.length != row_length
30
+ raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
31
+ end
32
+ end
33
+ end
34
+
35
+ # Validate that all elements are numeric
36
+ # @param data [Array] 2D array to validate
37
+ # @raise [ArgumentError] If any element is not numeric
38
+ def validate_numeric_types(data)
39
+ data.each_with_index do |row, i|
40
+ row.each_with_index do |val, j|
41
+ unless val.is_a?(Numeric)
42
+ raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
43
+ end
44
+ end
45
+ end
46
+ end
47
+
48
+ # Validate finite values (no NaN or Infinite)
49
+ # @param data [Array] 2D array to validate
50
+ # @raise [ArgumentError] If any float is NaN or Infinite
51
+ def validate_finite_values(data)
52
+ data.each_with_index do |row, i|
53
+ row.each_with_index do |val, j|
54
+ # Only check for NaN/Infinite on floats
55
+ if val.is_a?(Float) && (val.nan? || val.infinite?)
56
+ raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
57
+ end
58
+ end
59
+ end
60
+ end
61
+
62
+ # Standard validation for most algorithms
63
+ # @param data [Array] 2D array to validate
64
+ # @param check_finite [Boolean] Whether to check for NaN/Infinite values
65
+ # @raise [ArgumentError] If data is invalid
66
+ def validate_standard(data, check_finite: true)
67
+ validate_basic_structure(data)
68
+ validate_row_consistency(data)
69
+ validate_numeric_types(data)
70
+ validate_finite_values(data) if check_finite
71
+ end
72
+
73
+ # Validation for clustering algorithms (KMeans, HDBSCAN) with specific error messages
74
+ # @param data [Array] 2D array to validate
75
+ # @param check_finite [Boolean] Whether to check for NaN/Infinite values
76
+ # @raise [ArgumentError] If data is invalid
77
+ def validate_clustering(data, check_finite: false)
78
+ raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
79
+ raise ArgumentError, "Data cannot be empty" if data.empty?
80
+ raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
81
+
82
+ validate_row_consistency(data)
83
+ validate_numeric_types(data)
84
+ validate_finite_values(data) if check_finite
85
+ end
86
+
87
+ # Validation for PCA with specific error messages (same as clustering but without finite checks)
88
+ # @param data [Array] 2D array to validate
89
+ # @raise [ArgumentError] If data is invalid
90
+ def validate_pca(data)
91
+ raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
92
+ raise ArgumentError, "Data cannot be empty" if data.empty?
93
+ raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
94
+
95
+ validate_row_consistency(data)
96
+ validate_numeric_types(data)
97
+ end
98
+
99
+ # Get data statistics for warnings/error context
100
+ # @param data [Array] 2D array
101
+ # @return [Hash] Statistics about the data
102
+ def data_statistics(data)
103
+ return { n_samples: 0, n_features: 0, data_range: 0.0 } if data.empty?
104
+
105
+ n_samples = data.size
106
+ n_features = data.first&.size || 0
107
+
108
+ # Calculate data range for warnings
109
+ min_val = Float::INFINITY
110
+ max_val = -Float::INFINITY
111
+
112
+ data.each do |row|
113
+ row.each do |val|
114
+ val_f = val.to_f
115
+ min_val = val_f if val_f < min_val
116
+ max_val = val_f if val_f > max_val
117
+ end
118
+ end
119
+
120
+ data_range = max_val - min_val
121
+
122
+ {
123
+ n_samples: n_samples,
124
+ n_features: n_features,
125
+ data_range: data_range,
126
+ min_value: min_val,
127
+ max_value: max_val
128
+ }
129
+ end
130
+ end
131
+ end
132
+ end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative '../clusterkit'
4
4
  require_relative 'svd'
5
+ require_relative '../data_validator'
5
6
 
6
7
  module ClusterKit
7
8
  module Dimensionality
@@ -30,7 +31,7 @@ module ClusterKit
30
31
 
31
32
  # Perform SVD on centered data
32
33
  # U contains the transformed data, S contains singular values, VT contains components
33
- u, s, vt = ClusterKit.svd(centered_data, @n_components, n_iter: 5)
34
+ u, s, vt = perform_svd(centered_data)
34
35
 
35
36
  # Store the principal components (eigenvectors)
36
37
  @components = vt # Shape: (n_components, n_features)
@@ -76,7 +77,7 @@ module ClusterKit
76
77
  centered_data = center_data(data, @mean)
77
78
 
78
79
  # Perform SVD on centered data
79
- u, s, vt = SVD.randomized_svd(centered_data, @n_components, n_iter: 5)
80
+ u, s, vt = perform_svd(centered_data)
80
81
 
81
82
  # Store the principal components (eigenvectors)
82
83
  @components = vt
@@ -166,17 +167,10 @@ module ClusterKit
166
167
  private
167
168
 
168
169
  def validate_data(data)
169
- raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
170
- raise ArgumentError, "Data cannot be empty" if data.empty?
171
- raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
172
-
173
- # Check all rows have same length
174
- row_length = data.first.length
175
- unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
176
- raise ArgumentError, "All rows must have the same length"
177
- end
170
+ # Use shared validation for common checks
171
+ DataValidator.validate_pca(data)
178
172
 
179
- # Check we have enough samples for n_components
173
+ # PCA-specific validations
180
174
  if data.size < @n_components
181
175
  raise ArgumentError, "n_components (#{@n_components}) cannot be larger than n_samples (#{data.size})"
182
176
  end
@@ -237,6 +231,12 @@ module ClusterKit
237
231
 
238
232
  transformed
239
233
  end
234
+
235
+ # Shared SVD computation for both fit and fit_transform
236
+ # Ensures both methods use identical SVD invocation and parameters
237
+ def perform_svd(centered_data)
238
+ SVD.randomized_svd(centered_data, @n_components, n_iter: 5)
239
+ end
240
240
  end
241
241
 
242
242
  # Module-level convenience method
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative '../clusterkit'
4
+ require_relative '../data_validator'
4
5
 
5
6
  module ClusterKit
6
7
  module Dimensionality
@@ -8,7 +9,7 @@ module ClusterKit
8
9
  # Decomposes a matrix into U, S, V^T components
9
10
  class SVD
10
11
  attr_reader :n_components, :n_iter, :random_seed
11
- attr_reader :u, :s, :vt
12
+ attr_reader :u, :s, :vt, :n_features
12
13
 
13
14
  # Initialize a new SVD instance
14
15
  # @param n_components [Integer] Number of components to compute
@@ -27,7 +28,8 @@ module ClusterKit
27
28
  def fit_transform(data)
28
29
  validate_input(data)
29
30
 
30
- # Store reference to original data for transform detection
31
+ # Store data characteristics for later transform operations
32
+ @n_features = data.first.size
31
33
  @original_data_id = data.object_id
32
34
 
33
35
  # Determine n_components if not set
@@ -77,26 +79,21 @@ module ClusterKit
77
79
 
78
80
  # Transform data using fitted SVD (project onto components)
79
81
  # @param data [Array<Array<Numeric>>] Data to transform
80
- # @return [Array<Array<Float>>] Transformed data (U * S)
82
+ # @return [Array<Array<Float>>] Transformed data projected onto SVD components
81
83
  def transform(data)
82
84
  raise RuntimeError, "Model must be fitted first" unless fitted?
83
- validate_input(data)
84
-
85
- # For SVD, transform typically means projecting onto the components
86
- # This is equivalent to data * V (or data * V^T.T)
87
- # But for dimensionality reduction, we usually want U * S
88
- # which is already computed in fit_transform
85
+ validate_transform_input(data)
89
86
 
90
- # If transforming new data, we'd need to project it
91
- # For now, return U * S for the fitted data
92
87
  if data.object_id == @original_data_id
93
88
  # Same data that was fitted - return U * S
94
89
  @u.map.with_index do |row, i|
95
90
  row.map.with_index { |val, j| val * @s[j] }
96
91
  end
97
92
  else
98
- # New data - would need proper projection
99
- raise NotImplementedError, "Transform for new data not yet implemented"
93
+ # New data - project onto V components: data × V
94
+ # Since we have V^T, we need to transpose it back to V
95
+ # V = V^T^T, so we project: data × V^T^T
96
+ transform_new_data(data)
100
97
  end
101
98
  end
102
99
 
@@ -135,9 +132,43 @@ module ClusterKit
135
132
  private
136
133
 
137
134
  def validate_input(data)
138
- raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
139
- raise ArgumentError, "Input cannot be empty" if data.empty?
140
- raise ArgumentError, "Input must be a 2D array" unless data.first.is_a?(Array)
135
+ DataValidator.validate_standard(data, check_finite: false)
136
+ end
137
+
138
+ def validate_transform_input(data)
139
+ DataValidator.validate_standard(data, check_finite: false)
140
+
141
+ # Check feature count matches training data
142
+ if data.first.size != @n_features
143
+ raise ArgumentError, "New data has #{data.first.size} features, but model was fitted with #{@n_features} features"
144
+ end
145
+ end
146
+
147
+ # Transform new data by projecting onto V components
148
+ # Mathematical operation: new_data × V, where V = V^T^T
149
+ def transform_new_data(data)
150
+ # V^T is stored as @vt (shape: n_components × n_features)
151
+ # We need V (shape: n_features × n_components)
152
+ # V = V^T^T, so we transpose @vt
153
+
154
+ result = []
155
+ data.each do |sample|
156
+ # Project sample onto each component (column of V = row of V^T)
157
+ projected = Array.new(@vt.size, 0.0)
158
+
159
+ @vt.each_with_index do |vt_row, comp_idx|
160
+ # Dot product: sample · vt_row (this is sample · V[:, comp_idx])
161
+ dot_product = 0.0
162
+ sample.each_with_index do |val, feat_idx|
163
+ dot_product += val * vt_row[feat_idx]
164
+ end
165
+ projected[comp_idx] = dot_product
166
+ end
167
+
168
+ result << projected
169
+ end
170
+
171
+ result
141
172
  end
142
173
  end
143
174
  end
@@ -4,6 +4,7 @@ require 'fileutils'
4
4
  require 'json'
5
5
  require_relative '../configuration'
6
6
  require_relative '../silence'
7
+ require_relative '../data_validator'
7
8
 
8
9
  module ClusterKit
9
10
  module Dimensionality
@@ -224,44 +225,10 @@ module ClusterKit
224
225
  end
225
226
 
226
227
  def validate_input(data, check_min_samples: true)
227
- raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
228
- raise ArgumentError, "Input cannot be empty" if data.empty?
228
+ # Use shared validation for common checks
229
+ DataValidator.validate_standard(data)
229
230
 
230
- first_row = data.first
231
- raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
232
-
233
- row_length = first_row.length
234
- min_val = Float::INFINITY
235
- max_val = -Float::INFINITY
236
-
237
- # First validate data structure and types
238
- data.each_with_index do |row, i|
239
- unless row.is_a?(Array)
240
- raise ArgumentError, "Row #{i} is not an array"
241
- end
242
-
243
- if row.length != row_length
244
- raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
245
- end
246
-
247
- row.each_with_index do |val, j|
248
- unless val.is_a?(Numeric)
249
- raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
250
- end
251
-
252
- # Only check for NaN/Infinite on floats
253
- if val.is_a?(Float) && (val.nan? || val.infinite?)
254
- raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
255
- end
256
-
257
- # Track data range
258
- val_f = val.to_f
259
- min_val = val_f if val_f < min_val
260
- max_val = val_f if val_f > max_val
261
- end
262
- end
263
-
264
- # Check for sufficient data points after validating structure (only for fit operations)
231
+ # UMAP-specific validations
265
232
  if check_min_samples && data.size < 10
266
233
  raise ::ClusterKit::InsufficientDataError, <<~MSG
267
234
  UMAP requires at least 10 data points, but only #{data.size} provided.
@@ -274,9 +241,9 @@ module ClusterKit
274
241
  end
275
242
 
276
243
  # Check for extreme data ranges that might cause numerical issues
277
- data_range = max_val - min_val
278
- if data_range > 1000
279
- warn "WARNING: Large data range detected (#{data_range.round(2)}). Consider normalizing your data to prevent numerical instability."
244
+ stats = DataValidator.data_statistics(data)
245
+ if stats[:data_range] > 1000
246
+ warn "WARNING: Large data range detected (#{stats[:data_range].round(2)}). Consider normalizing your data to prevent numerical instability."
280
247
  end
281
248
  end
282
249
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ClusterKit
4
- VERSION = "0.2.0"
4
+ VERSION = "0.2.1"
5
5
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: clusterkit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Petersen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-09-05 00:00:00.000000000 Z
11
+ date: 2025-09-06 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rb_sys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.9'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.9'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: csv
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -52,20 +66,6 @@ dependencies:
52
66
  - - "~>"
53
67
  - !ruby/object:Gem::Version
54
68
  version: '1.2'
55
- - !ruby/object:Gem::Dependency
56
- name: rb_sys
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - "~>"
60
- - !ruby/object:Gem::Version
61
- version: '0.9'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - "~>"
67
- - !ruby/object:Gem::Version
68
- version: '0.9'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -159,6 +159,7 @@ files:
159
159
  - lib/clusterkit/clustering/hdbscan.rb
160
160
  - lib/clusterkit/clusterkit.rb
161
161
  - lib/clusterkit/configuration.rb
162
+ - lib/clusterkit/data_validator.rb
162
163
  - lib/clusterkit/dimensionality.rb
163
164
  - lib/clusterkit/dimensionality/pca.rb
164
165
  - lib/clusterkit/dimensionality/svd.rb