clusterkit 0.3.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.simplecov +47 -0
- data/CHANGELOG.md +35 -0
- data/CLAUDE.md +226 -0
- data/Cargo.lock +3228 -0
- data/Cargo.toml +8 -0
- data/Gemfile +17 -0
- data/IMPLEMENTATION_NOTES.md +143 -0
- data/LICENSE.txt +21 -0
- data/PYTHON_COMPARISON.md +183 -0
- data/README.md +744 -0
- data/Rakefile +259 -0
- data/docs/KNOWN_ISSUES.md +130 -0
- data/docs/RUST_ERROR_HANDLING.md +164 -0
- data/docs/TEST_FIXTURES.md +170 -0
- data/docs/UMAP_EXPLAINED.md +362 -0
- data/docs/UMAP_TROUBLESHOOTING.md +284 -0
- data/docs/VERBOSE_OUTPUT.md +84 -0
- data/docs/assets/clusterkit-wide.png +0 -0
- data/docs/assets/clusterkit.png +0 -0
- data/docs/assets/visualization.png +0 -0
- data/examples/hdbscan_example.rb +147 -0
- data/examples/optimal_kmeans_example.rb +96 -0
- data/examples/pca_example.rb +114 -0
- data/examples/reproducible_umap.rb +99 -0
- data/examples/verbose_control.rb +43 -0
- data/ext/clusterkit/Cargo.toml +26 -0
- data/ext/clusterkit/extconf.rb +23 -0
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +80 -0
- data/ext/clusterkit/src/clustering.rs +221 -0
- data/ext/clusterkit/src/embedder.rs +349 -0
- data/ext/clusterkit/src/hnsw.rs +579 -0
- data/ext/clusterkit/src/lib.rs +24 -0
- data/ext/clusterkit/src/svd.rs +89 -0
- data/ext/clusterkit/src/tests.rs +16 -0
- data/ext/clusterkit/src/utils.rs +183 -0
- data/lib/clusterkit/3.1/clusterkit.bundle +0 -0
- data/lib/clusterkit/3.2/clusterkit.bundle +0 -0
- data/lib/clusterkit/3.3/clusterkit.bundle +0 -0
- data/lib/clusterkit/3.4/clusterkit.bundle +0 -0
- data/lib/clusterkit/clustering/hdbscan.rb +164 -0
- data/lib/clusterkit/clustering.rb +194 -0
- data/lib/clusterkit/clusterkit.rb +14 -0
- data/lib/clusterkit/configuration.rb +24 -0
- data/lib/clusterkit/data_validator.rb +132 -0
- data/lib/clusterkit/dimensionality/pca.rb +251 -0
- data/lib/clusterkit/dimensionality/svd.rb +175 -0
- data/lib/clusterkit/dimensionality/umap.rb +282 -0
- data/lib/clusterkit/dimensionality.rb +29 -0
- data/lib/clusterkit/hdbscan_api_design.rb +142 -0
- data/lib/clusterkit/hnsw.rb +251 -0
- data/lib/clusterkit/preprocessing.rb +106 -0
- data/lib/clusterkit/silence.rb +42 -0
- data/lib/clusterkit/utils.rb +51 -0
- data/lib/clusterkit/version.rb +5 -0
- data/lib/clusterkit.rb +105 -0
- data/lib/tasks/visualize.rake +641 -0
- metadata +214 -0
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
use magnus::{Error, RArray, RHash, Value, TryConvert, Integer, Module, Object, Ruby};
|
|
2
|
+
use magnus::value::ReprValue;
|
|
3
|
+
use hnsw_rs::prelude::*;
|
|
4
|
+
use annembed::prelude::*;
|
|
5
|
+
use std::fs::File;
|
|
6
|
+
use std::io::{Write, Read};
|
|
7
|
+
use std::cell::RefCell;
|
|
8
|
+
use bincode;
|
|
9
|
+
use serde::{Serialize, Deserialize};
|
|
10
|
+
use crate::utils::ruby_array_to_vec_vec_f32;
|
|
11
|
+
|
|
12
|
+
// Simple struct to serialize UMAP results
|
|
13
|
+
#[derive(Serialize, Deserialize)]
|
|
14
|
+
struct SavedUMAPModel {
|
|
15
|
+
n_components: usize,
|
|
16
|
+
n_neighbors: usize,
|
|
17
|
+
nb_grad_batch: usize,
|
|
18
|
+
nb_sampling_by_edge: usize,
|
|
19
|
+
embeddings: Vec<Vec<f64>>,
|
|
20
|
+
original_data: Vec<Vec<f32>>,
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
24
|
+
let ruby = Ruby::get().unwrap();
|
|
25
|
+
let umap_class = parent.define_class("RustUMAP", ruby.class_object())?;
|
|
26
|
+
|
|
27
|
+
umap_class.define_singleton_method("new", magnus::function!(RustUMAP::new, 1))?;
|
|
28
|
+
umap_class.define_singleton_method("load_model", magnus::function!(RustUMAP::load_model, 1))?;
|
|
29
|
+
umap_class.define_method("fit_transform", magnus::method!(RustUMAP::fit_transform, 1))?;
|
|
30
|
+
umap_class.define_method("save_model", magnus::method!(RustUMAP::save_model, 1))?;
|
|
31
|
+
umap_class.define_method("transform", magnus::method!(RustUMAP::transform, 1))?;
|
|
32
|
+
|
|
33
|
+
Ok(())
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
#[magnus::wrap(class = "ClusterKit::RustUMAP")]
|
|
37
|
+
struct RustUMAP {
|
|
38
|
+
n_components: usize,
|
|
39
|
+
n_neighbors: usize,
|
|
40
|
+
#[allow(dead_code)]
|
|
41
|
+
random_seed: Option<u64>,
|
|
42
|
+
nb_grad_batch: usize,
|
|
43
|
+
nb_sampling_by_edge: usize,
|
|
44
|
+
training_data: RefCell<Option<Vec<Vec<f32>>>>,
|
|
45
|
+
training_embeddings: RefCell<Option<Vec<Vec<f64>>>>,
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
impl RustUMAP {
|
|
49
|
+
fn new(options: RHash) -> Result<Self, Error> {
|
|
50
|
+
let ruby = Ruby::get().unwrap();
|
|
51
|
+
|
|
52
|
+
let n_components = match options.lookup::<_, Value>(ruby.to_symbol("n_components")) {
|
|
53
|
+
Ok(val) => {
|
|
54
|
+
if val.is_nil() {
|
|
55
|
+
2
|
|
56
|
+
} else {
|
|
57
|
+
Integer::try_convert(val)
|
|
58
|
+
.map(|i| i.to_u32().unwrap_or(2) as usize)
|
|
59
|
+
.unwrap_or(2)
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
Err(_) => 2,
|
|
63
|
+
};
|
|
64
|
+
|
|
65
|
+
let n_neighbors = match options.lookup::<_, Value>(ruby.to_symbol("n_neighbors")) {
|
|
66
|
+
Ok(val) => {
|
|
67
|
+
if val.is_nil() {
|
|
68
|
+
15
|
|
69
|
+
} else {
|
|
70
|
+
Integer::try_convert(val)
|
|
71
|
+
.map(|i| i.to_u32().unwrap_or(15) as usize)
|
|
72
|
+
.unwrap_or(15)
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
Err(_) => 15,
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
let random_seed = match options.lookup::<_, Value>(ruby.to_symbol("random_seed")) {
|
|
79
|
+
Ok(val) => {
|
|
80
|
+
if val.is_nil() {
|
|
81
|
+
None
|
|
82
|
+
} else {
|
|
83
|
+
Integer::try_convert(val)
|
|
84
|
+
.map(|i| Some(i.to_u64().unwrap_or(42)))
|
|
85
|
+
.unwrap_or(None)
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
Err(_) => None,
|
|
89
|
+
};
|
|
90
|
+
|
|
91
|
+
let nb_grad_batch = match options.lookup::<_, Value>(ruby.to_symbol("nb_grad_batch")) {
|
|
92
|
+
Ok(val) => {
|
|
93
|
+
if val.is_nil() {
|
|
94
|
+
10
|
|
95
|
+
} else {
|
|
96
|
+
Integer::try_convert(val)
|
|
97
|
+
.map(|i| i.to_u32().unwrap_or(10) as usize)
|
|
98
|
+
.unwrap_or(10)
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
Err(_) => 10,
|
|
102
|
+
};
|
|
103
|
+
|
|
104
|
+
let nb_sampling_by_edge = match options.lookup::<_, Value>(ruby.to_symbol("nb_sampling_by_edge")) {
|
|
105
|
+
Ok(val) => {
|
|
106
|
+
if val.is_nil() {
|
|
107
|
+
8
|
|
108
|
+
} else {
|
|
109
|
+
Integer::try_convert(val)
|
|
110
|
+
.map(|i| i.to_u32().unwrap_or(8) as usize)
|
|
111
|
+
.unwrap_or(8)
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
Err(_) => 8,
|
|
115
|
+
};
|
|
116
|
+
|
|
117
|
+
Ok(RustUMAP {
|
|
118
|
+
n_components,
|
|
119
|
+
n_neighbors,
|
|
120
|
+
random_seed,
|
|
121
|
+
nb_grad_batch,
|
|
122
|
+
nb_sampling_by_edge,
|
|
123
|
+
training_data: RefCell::new(None),
|
|
124
|
+
training_embeddings: RefCell::new(None),
|
|
125
|
+
})
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
fn fit_transform(&self, data: Value) -> Result<RArray, Error> {
|
|
129
|
+
let ruby = Ruby::get().unwrap();
|
|
130
|
+
|
|
131
|
+
// Convert Ruby array to Rust Vec<Vec<f32>> using shared helper
|
|
132
|
+
let data_f32 = ruby_array_to_vec_vec_f32(data)?;
|
|
133
|
+
|
|
134
|
+
// Build HNSW graph
|
|
135
|
+
let ef_c = 50;
|
|
136
|
+
let max_nb_connection = 70;
|
|
137
|
+
let nb_points = data_f32.len();
|
|
138
|
+
let nb_layer = 16.min((nb_points as f32).ln().trunc() as usize);
|
|
139
|
+
|
|
140
|
+
// Create HNSW with or without seed
|
|
141
|
+
let hnsw = match self.random_seed {
|
|
142
|
+
Some(seed) => Hnsw::<f32, DistL2>::new_with_seed(
|
|
143
|
+
max_nb_connection, nb_points, nb_layer, ef_c, DistL2 {}, seed
|
|
144
|
+
),
|
|
145
|
+
None => Hnsw::<f32, DistL2>::new(
|
|
146
|
+
max_nb_connection, nb_points, nb_layer, ef_c, DistL2 {}
|
|
147
|
+
),
|
|
148
|
+
};
|
|
149
|
+
|
|
150
|
+
// Insert data into HNSW
|
|
151
|
+
let data_with_id: Vec<(&Vec<f32>, usize)> = data_f32.iter()
|
|
152
|
+
.enumerate()
|
|
153
|
+
.map(|(i, v)| (v, i))
|
|
154
|
+
.collect();
|
|
155
|
+
|
|
156
|
+
if self.random_seed.is_some() {
|
|
157
|
+
hnsw.serial_insert(&data_with_id);
|
|
158
|
+
} else {
|
|
159
|
+
hnsw.parallel_insert(&data_with_id);
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// Create KGraph from HNSW
|
|
163
|
+
let kgraph: annembed::fromhnsw::kgraph::KGraph<f32> = annembed::fromhnsw::kgraph::kgraph_from_hnsw_all(&hnsw, self.n_neighbors)
|
|
164
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
165
|
+
|
|
166
|
+
// Set up embedding parameters
|
|
167
|
+
let mut embed_params = EmbedderParams::default();
|
|
168
|
+
embed_params.asked_dim = self.n_components;
|
|
169
|
+
embed_params.nb_grad_batch = self.nb_grad_batch;
|
|
170
|
+
embed_params.scale_rho = 1.;
|
|
171
|
+
embed_params.beta = 1.;
|
|
172
|
+
embed_params.b = 1.;
|
|
173
|
+
embed_params.grad_step = 1.;
|
|
174
|
+
embed_params.nb_sampling_by_edge = self.nb_sampling_by_edge;
|
|
175
|
+
embed_params.dmap_init = true;
|
|
176
|
+
embed_params.random_seed = self.random_seed;
|
|
177
|
+
|
|
178
|
+
// Create embedder and perform embedding
|
|
179
|
+
let mut embedder = Embedder::new(&kgraph, embed_params);
|
|
180
|
+
|
|
181
|
+
let embed_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
|
182
|
+
embedder.embed()
|
|
183
|
+
}));
|
|
184
|
+
|
|
185
|
+
let embed_result = match embed_result {
|
|
186
|
+
Ok(Ok(result)) => result,
|
|
187
|
+
Ok(Err(e)) => {
|
|
188
|
+
return Err(Error::new(ruby.exception_runtime_error(),
|
|
189
|
+
format!("Embedding failed: {}", e)));
|
|
190
|
+
}
|
|
191
|
+
Err(panic_info) => {
|
|
192
|
+
let msg = if let Some(s) = panic_info.downcast_ref::<String>() {
|
|
193
|
+
s.clone()
|
|
194
|
+
} else if let Some(s) = panic_info.downcast_ref::<&str>() {
|
|
195
|
+
s.to_string()
|
|
196
|
+
} else {
|
|
197
|
+
"unknown panic".to_string()
|
|
198
|
+
};
|
|
199
|
+
return Err(Error::new(ruby.exception_runtime_error(),
|
|
200
|
+
format!("Embedding panicked: {}", msg)));
|
|
201
|
+
}
|
|
202
|
+
};
|
|
203
|
+
|
|
204
|
+
if embed_result == 0 {
|
|
205
|
+
return Err(Error::new(ruby.exception_runtime_error(), "No points were embedded"));
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
// Get embedded data
|
|
209
|
+
let embedded_array = embedder.get_embedded_reindexed();
|
|
210
|
+
|
|
211
|
+
let mut embeddings = Vec::new();
|
|
212
|
+
for i in 0..embedded_array.nrows() {
|
|
213
|
+
let mut row = Vec::new();
|
|
214
|
+
for j in 0..embedded_array.ncols() {
|
|
215
|
+
row.push(embedded_array[[i, j]] as f64);
|
|
216
|
+
}
|
|
217
|
+
embeddings.push(row);
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// Store the training data and embeddings for future transforms
|
|
221
|
+
*self.training_data.borrow_mut() = Some(data_f32.clone());
|
|
222
|
+
*self.training_embeddings.borrow_mut() = Some(embeddings.clone());
|
|
223
|
+
|
|
224
|
+
// Convert result back to Ruby array
|
|
225
|
+
let result = ruby.ary_new();
|
|
226
|
+
for embedding in &embeddings {
|
|
227
|
+
let row = ruby.ary_new();
|
|
228
|
+
for &val in embedding {
|
|
229
|
+
row.push(val)?;
|
|
230
|
+
}
|
|
231
|
+
result.push(row)?;
|
|
232
|
+
}
|
|
233
|
+
Ok(result)
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
fn save_model(&self, path: String) -> Result<(), Error> {
|
|
237
|
+
let ruby = Ruby::get().unwrap();
|
|
238
|
+
let training_data = self.training_data.borrow();
|
|
239
|
+
let training_embeddings = self.training_embeddings.borrow();
|
|
240
|
+
|
|
241
|
+
let training_data_ref = training_data.as_ref()
|
|
242
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model to save. Run fit_transform first."))?;
|
|
243
|
+
let training_embeddings_ref = training_embeddings.as_ref()
|
|
244
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings to save."))?;
|
|
245
|
+
|
|
246
|
+
let saved_model = SavedUMAPModel {
|
|
247
|
+
n_components: self.n_components,
|
|
248
|
+
n_neighbors: self.n_neighbors,
|
|
249
|
+
nb_grad_batch: self.nb_grad_batch,
|
|
250
|
+
nb_sampling_by_edge: self.nb_sampling_by_edge,
|
|
251
|
+
embeddings: training_embeddings_ref.clone(),
|
|
252
|
+
original_data: training_data_ref.clone(),
|
|
253
|
+
};
|
|
254
|
+
|
|
255
|
+
let serialized = bincode::serialize(&saved_model)
|
|
256
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
257
|
+
|
|
258
|
+
let mut file = File::create(&path)
|
|
259
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
260
|
+
|
|
261
|
+
file.write_all(&serialized)
|
|
262
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
263
|
+
|
|
264
|
+
Ok(())
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
fn load_model(path: String) -> Result<Self, Error> {
|
|
268
|
+
let ruby = Ruby::get().unwrap();
|
|
269
|
+
|
|
270
|
+
let mut file = File::open(&path)
|
|
271
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
272
|
+
|
|
273
|
+
let mut buffer = Vec::new();
|
|
274
|
+
file.read_to_end(&mut buffer)
|
|
275
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
276
|
+
|
|
277
|
+
let saved_model: SavedUMAPModel = bincode::deserialize(&buffer)
|
|
278
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
279
|
+
|
|
280
|
+
Ok(RustUMAP {
|
|
281
|
+
n_components: saved_model.n_components,
|
|
282
|
+
n_neighbors: saved_model.n_neighbors,
|
|
283
|
+
random_seed: None,
|
|
284
|
+
nb_grad_batch: saved_model.nb_grad_batch,
|
|
285
|
+
nb_sampling_by_edge: saved_model.nb_sampling_by_edge,
|
|
286
|
+
training_data: RefCell::new(Some(saved_model.original_data)),
|
|
287
|
+
training_embeddings: RefCell::new(Some(saved_model.embeddings)),
|
|
288
|
+
})
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
fn transform(&self, data: Value) -> Result<RArray, Error> {
|
|
292
|
+
let ruby = Ruby::get().unwrap();
|
|
293
|
+
let training_data = self.training_data.borrow();
|
|
294
|
+
let training_embeddings = self.training_embeddings.borrow();
|
|
295
|
+
|
|
296
|
+
let training_data_ref = training_data.as_ref()
|
|
297
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No model loaded. Load a model or run fit_transform first."))?;
|
|
298
|
+
let training_embeddings_ref = training_embeddings.as_ref()
|
|
299
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "No embeddings available."))?;
|
|
300
|
+
|
|
301
|
+
let new_data = ruby_array_to_vec_vec_f32(data)?;
|
|
302
|
+
|
|
303
|
+
let k = self.n_neighbors.min(training_data_ref.len());
|
|
304
|
+
let result = ruby.ary_new();
|
|
305
|
+
|
|
306
|
+
for new_point in &new_data {
|
|
307
|
+
let mut distances: Vec<(f32, usize)> = Vec::new();
|
|
308
|
+
for (idx, train_point) in training_data_ref.iter().enumerate() {
|
|
309
|
+
let dist = euclidean_distance(new_point, train_point);
|
|
310
|
+
distances.push((dist, idx));
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
|
314
|
+
let k_nearest = &distances[..k];
|
|
315
|
+
|
|
316
|
+
let mut avg_embedding = vec![0.0; self.n_components];
|
|
317
|
+
let mut total_weight = 0.0;
|
|
318
|
+
|
|
319
|
+
for &(dist, idx) in k_nearest {
|
|
320
|
+
let weight = 1.0 / (dist as f64 + 0.001);
|
|
321
|
+
total_weight += weight;
|
|
322
|
+
|
|
323
|
+
for (i, &val) in training_embeddings_ref[idx].iter().enumerate() {
|
|
324
|
+
avg_embedding[i] += val * weight;
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
for val in &mut avg_embedding {
|
|
329
|
+
*val /= total_weight;
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
let row = ruby.ary_new();
|
|
333
|
+
for val in avg_embedding {
|
|
334
|
+
row.push(val)?;
|
|
335
|
+
}
|
|
336
|
+
result.push(row)?;
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
Ok(result)
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
/// Euclidean (L2) distance between `a` and `b`.
///
/// Coordinates are paired positionally; if the slices differ in length, the
/// extra coordinates of the longer one are ignored.
fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
    let squared_sum = a
        .iter()
        .zip(b)
        .map(|(&lhs, &rhs)| {
            let delta = lhs - rhs;
            delta * delta
        })
        .sum::<f32>();

    squared_sum.sqrt()
}
|