clusterkit 0.1.0 → 0.1.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,68 +1,52 @@
1
- use magnus::{function, prelude::*, Error, Value, RArray, Integer, TryConvert};
1
+ use magnus::{function, prelude::*, Error, Value, RArray, Ruby};
2
2
  use ndarray::{Array1, Array2, ArrayView1, Axis};
3
3
  use rand::prelude::*;
4
+ use rand::rngs::StdRng;
5
+ use rand::SeedableRng;
6
+ use crate::utils::ruby_array_to_ndarray2;
4
7
 
5
8
  mod hdbscan_wrapper;
6
9
 
7
10
  pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
8
11
  let clustering_module = parent.define_module("Clustering")?;
9
-
12
+
10
13
  clustering_module.define_singleton_method(
11
14
  "kmeans_rust",
12
- function!(kmeans, 3),
15
+ function!(kmeans, 4),
13
16
  )?;
14
-
17
+
15
18
  clustering_module.define_singleton_method(
16
19
  "kmeans_predict_rust",
17
20
  function!(kmeans_predict, 2),
18
21
  )?;
19
-
22
+
20
23
  // Initialize HDBSCAN functions
21
24
  hdbscan_wrapper::init(&clustering_module)?;
22
-
25
+
23
26
  Ok(())
24
27
  }
25
28
 
26
29
  /// Perform K-means clustering
27
30
  /// Returns (labels, centroids, inertia)
28
- fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64), Error> {
29
- // Convert Ruby array to ndarray
30
- let rarray: RArray = TryConvert::try_convert(data)?;
31
- let n_samples = rarray.len();
32
-
33
- if n_samples == 0 {
34
- return Err(Error::new(
35
- magnus::exception::arg_error(),
36
- "Data cannot be empty",
37
- ));
38
- }
39
-
40
- // Get dimensions
41
- let first_row: RArray = rarray.entry::<RArray>(0)?;
42
- let n_features = first_row.len();
43
-
31
+ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> Result<(RArray, RArray, f64), Error> {
32
+ let ruby = Ruby::get().unwrap();
33
+
34
+ // Convert Ruby array to ndarray using shared helper
35
+ let data_array = ruby_array_to_ndarray2(data)?;
36
+ let (n_samples, n_features) = data_array.dim();
37
+
44
38
  if k > n_samples {
45
39
  return Err(Error::new(
46
- magnus::exception::arg_error(),
40
+ ruby.exception_arg_error(),
47
41
  format!("k ({}) cannot be larger than number of samples ({})", k, n_samples),
48
42
  ));
49
43
  }
50
-
51
- // Convert to ndarray
52
- let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
53
- for i in 0..n_samples {
54
- let row: RArray = rarray.entry(i as isize)?;
55
- for j in 0..n_features {
56
- let val: f64 = row.entry(j as isize)?;
57
- data_array[[i, j]] = val;
58
- }
59
- }
60
-
44
+
61
45
  // Initialize centroids using K-means++
62
- let mut centroids = kmeans_plusplus(&data_array, k)?;
46
+ let mut centroids = kmeans_plusplus(&data_array, k, random_seed)?;
63
47
  let mut labels = vec![0usize; n_samples];
64
48
  let mut prev_labels = vec![0usize; n_samples];
65
-
49
+
66
50
  // K-means iterations
67
51
  for iteration in 0..max_iter {
68
52
  // Assign points to nearest centroid
@@ -71,7 +55,7 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
71
55
  let point = data_array.row(i);
72
56
  let mut min_dist = f64::INFINITY;
73
57
  let mut best_cluster = 0;
74
-
58
+
75
59
  for (j, centroid) in centroids.axis_iter(Axis(0)).enumerate() {
76
60
  let dist = euclidean_distance(&point, &centroid);
77
61
  if dist < min_dist {
@@ -79,38 +63,38 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
79
63
  best_cluster = j;
80
64
  }
81
65
  }
82
-
66
+
83
67
  if labels[i] != best_cluster {
84
68
  changed = true;
85
69
  }
86
70
  labels[i] = best_cluster;
87
71
  }
88
-
72
+
89
73
  // Check for convergence
90
74
  if !changed && iteration > 0 {
91
75
  break;
92
76
  }
93
-
77
+
94
78
  // Update centroids
95
79
  for j in 0..k {
96
80
  let mut sum = Array1::<f64>::zeros(n_features);
97
81
  let mut count = 0;
98
-
82
+
99
83
  for i in 0..n_samples {
100
84
  if labels[i] == j {
101
85
  sum += &data_array.row(i);
102
86
  count += 1;
103
87
  }
104
88
  }
105
-
89
+
106
90
  if count > 0 {
107
91
  centroids.row_mut(j).assign(&(sum / count as f64));
108
92
  }
109
93
  }
110
-
94
+
111
95
  prev_labels.clone_from(&labels);
112
96
  }
113
-
97
+
114
98
  // Calculate inertia (sum of squared distances to nearest centroid)
115
99
  let mut inertia = 0.0;
116
100
  for i in 0..n_samples {
@@ -118,75 +102,43 @@ fn kmeans(data: Value, k: usize, max_iter: usize) -> Result<(RArray, RArray, f64
118
102
  let centroid = centroids.row(labels[i]);
119
103
  inertia += euclidean_distance(&point, &centroid).powi(2);
120
104
  }
121
-
105
+
122
106
  // Convert results to Ruby arrays
123
- let ruby = magnus::Ruby::get().unwrap();
124
- let labels_array = RArray::new();
107
+ let labels_array = ruby.ary_new();
125
108
  for label in labels {
126
- labels_array.push(Integer::from_value(ruby.eval(&format!("{}", label)).unwrap()).unwrap())?;
109
+ labels_array.push(ruby.integer_from_i64(label as i64))?;
127
110
  }
128
-
129
- let centroids_array = RArray::new();
111
+
112
+ let centroids_array = ruby.ary_new();
130
113
  for i in 0..k {
131
- let row_array = RArray::new();
114
+ let row_array = ruby.ary_new();
132
115
  for j in 0..n_features {
133
116
  row_array.push(centroids[[i, j]])?;
134
117
  }
135
118
  centroids_array.push(row_array)?;
136
119
  }
137
-
120
+
138
121
  Ok((labels_array, centroids_array, inertia))
139
122
  }
140
123
 
141
124
  /// Predict cluster labels for new data given centroids
142
125
  fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
143
- // Convert inputs
144
- let data_array: RArray = TryConvert::try_convert(data)?;
145
- let centroids_array: RArray = TryConvert::try_convert(centroids)?;
146
-
147
- let n_samples = data_array.len();
148
- let k = centroids_array.len();
149
-
150
- if n_samples == 0 {
151
- return Err(Error::new(
152
- magnus::exception::arg_error(),
153
- "Data cannot be empty",
154
- ));
155
- }
156
-
157
- // Get dimensions
158
- let first_row: RArray = data_array.entry::<RArray>(0)?;
159
- let n_features = first_row.len();
160
-
161
- // Convert data to ndarray
162
- let mut data_matrix = Array2::<f64>::zeros((n_samples, n_features));
163
- for i in 0..n_samples {
164
- let row: RArray = data_array.entry(i as isize)?;
165
- for j in 0..n_features {
166
- let val: f64 = row.entry(j as isize)?;
167
- data_matrix[[i, j]] = val;
168
- }
169
- }
170
-
171
- // Convert centroids to ndarray
172
- let mut centroids_matrix = Array2::<f64>::zeros((k, n_features));
173
- for i in 0..k {
174
- let row: RArray = centroids_array.entry(i as isize)?;
175
- for j in 0..n_features {
176
- let val: f64 = row.entry(j as isize)?;
177
- centroids_matrix[[i, j]] = val;
178
- }
179
- }
180
-
126
+ let ruby = Ruby::get().unwrap();
127
+
128
+ // Convert inputs using shared helpers
129
+ let data_matrix = ruby_array_to_ndarray2(data)?;
130
+ let centroids_matrix = ruby_array_to_ndarray2(centroids)?;
131
+
132
+ let (n_samples, _) = data_matrix.dim();
133
+
181
134
  // Predict labels
182
- let ruby = magnus::Ruby::get().unwrap();
183
- let labels_array = RArray::new();
184
-
135
+ let labels_array = ruby.ary_new();
136
+
185
137
  for i in 0..n_samples {
186
138
  let point = data_matrix.row(i);
187
139
  let mut min_dist = f64::INFINITY;
188
140
  let mut best_cluster = 0;
189
-
141
+
190
142
  for (j, centroid) in centroids_matrix.axis_iter(Axis(0)).enumerate() {
191
143
  let dist = euclidean_distance(&point, &centroid);
192
144
  if dist < min_dist {
@@ -194,30 +146,37 @@ fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
194
146
  best_cluster = j;
195
147
  }
196
148
  }
197
-
198
- labels_array.push(Integer::from_value(ruby.eval(&format!("{}", best_cluster)).unwrap()).unwrap())?;
149
+
150
+ labels_array.push(ruby.integer_from_i64(best_cluster as i64))?;
199
151
  }
200
-
152
+
201
153
  Ok(labels_array)
202
154
  }
203
155
 
204
156
  /// K-means++ initialization
205
- fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
157
+ fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Result<Array2<f64>, Error> {
206
158
  let n_samples = data.nrows();
207
159
  let n_features = data.ncols();
208
- let mut rng = thread_rng();
209
-
160
+
161
+ // Use seeded RNG if seed is provided, otherwise use thread_rng
162
+ let mut rng: Box<dyn RngCore> = match random_seed {
163
+ Some(seed) => {
164
+ let seed_u64 = seed as u64;
165
+ Box::new(StdRng::seed_from_u64(seed_u64))
166
+ },
167
+ None => Box::new(thread_rng()),
168
+ };
169
+
210
170
  let mut centroids = Array2::<f64>::zeros((k, n_features));
211
-
171
+
212
172
  // Choose first centroid randomly
213
173
  let first_idx = rng.gen_range(0..n_samples);
214
174
  centroids.row_mut(0).assign(&data.row(first_idx));
215
-
175
+
216
176
  // Choose remaining centroids
217
177
  for i in 1..k {
218
178
  let mut distances = vec![f64::INFINITY; n_samples];
219
-
220
- // Calculate distance to nearest centroid for each point
179
+
221
180
  for j in 0..n_samples {
222
181
  for c in 0..i {
223
182
  let dist = euclidean_distance(&data.row(j), &centroids.row(c));
@@ -226,25 +185,20 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
226
185
  }
227
186
  }
228
187
  }
229
-
230
- // Convert distances to probabilities
188
+
231
189
  let total: f64 = distances.iter().map(|d| d * d).sum();
232
190
  if total == 0.0 {
233
- // All points are identical or we've selected duplicates
234
- // Just use sequential points as centroids
235
191
  if i < n_samples {
236
192
  centroids.row_mut(i).assign(&data.row(i));
237
193
  } else {
238
- // Reuse first point if we run out
239
194
  centroids.row_mut(i).assign(&data.row(0));
240
195
  }
241
196
  continue;
242
197
  }
243
-
244
- // Choose next centroid with probability proportional to squared distance
198
+
245
199
  let mut cumsum = 0.0;
246
200
  let rand_val: f64 = rng.gen::<f64>() * total;
247
-
201
+
248
202
  for j in 0..n_samples {
249
203
  cumsum += distances[j] * distances[j];
250
204
  if cumsum >= rand_val {
@@ -253,7 +207,7 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize) -> Result<Array2<f64>, Error> {
253
207
  }
254
208
  }
255
209
  }
256
-
210
+
257
211
  Ok(centroids)
258
212
  }
259
213
 
@@ -264,4 +218,4 @@ fn euclidean_distance(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
264
218
  .map(|(x, y)| (x - y).powi(2))
265
219
  .sum::<f64>()
266
220
  .sqrt()
267
- }
221
+ }