clusterkit 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4df777f3b01fea2f411cde6233b2abd5c1ab01903117ee8f377a1ae67bb6510b
4
- data.tar.gz: cbf92ed9e86d14a3959b81f348e69f4eb2fa3bcd92d86149c4872ba9a197f7c6
3
+ metadata.gz: ebad40c2aac3fa3569357eedf740336a3d463ecc5c038d2771d3fd266d414b1e
4
+ data.tar.gz: 0ab2851e0adab583567460e469d4073e755514854e57bca464d5a835c1534bea
5
5
  SHA512:
6
- metadata.gz: bae2d9fac87d5cb27458ae8ace71edffe754216ea5b9af088c98fecfc4489dc3f572ca45e7d926ed658516dac2ab18b6bcf04fc138daf48a01d17569f5698521
7
- data.tar.gz: 560c59116a016c60dae79ed3817158005551c8a5e6a1d7d7b47a67cdb87a0c566d07db7771bcad0507e21111d380b1b4795bd85a5e75d5e00ce1229a181704b9
6
+ metadata.gz: 16412a2db10bf55593c778c7a02813b8db9652e39479d17fa3cd10da1e87298d5d95f4afccd8f5da10bb89bc950b314faeb9a8f0149ecd2dd3c3305121c1b5b2
7
+ data.tar.gz: edf73f2d0ce8f73441c07975a5cd4da31faaece9b216d908a743b9c1bd3f2d8e213d1b85a7e82f2f397c60f8d2fd2c4a0109a5a337963e008ecf069b3fa40266
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # ClusterKit
1
+ <img src="/docs/assets/clusterkit-wide.png" alt="clusterkit" height="80px">
2
2
 
3
3
  A high-performance clustering and dimensionality reduction toolkit for Ruby, powered by best-in-class Rust implementations.
4
4
 
@@ -44,7 +44,7 @@ ClusterKit organizes its functionality into clear modules:
44
44
 
45
45
  - **`ClusterKit::Dimensionality`** - All dimensionality reduction algorithms
46
46
  - `ClusterKit::Dimensionality::UMAP` - UMAP implementation
47
- - `ClusterKit::Dimensionality::PCA` - PCA implementation
47
+ - `ClusterKit::Dimensionality::PCA` - PCA implementation
48
48
  - `ClusterKit::Dimensionality::SVD` - SVD implementation
49
49
  - **`ClusterKit::Clustering`** - All clustering algorithms
50
50
  - `ClusterKit::Clustering::KMeans` - K-means clustering
@@ -96,7 +96,7 @@ data = []
96
96
  3.times do |cluster|
97
97
  # Each cluster has a different center, well-separated
98
98
  center = Array.new(50) { rand * 0.1 + cluster * 2.0 }
99
-
99
+
100
100
  # Add 33 points around each center with controlled noise
101
101
  33.times do
102
102
  point = center.map { |c| c + (rand - 0.5) * 0.3 }
@@ -329,6 +329,223 @@ probabilities = hdbscan.probabilities # Cluster membership probabilities
329
329
  outlier_scores = hdbscan.outlier_scores # Outlier scores for each point
330
330
  ```
331
331
 
332
+ ### HNSW - Fast Nearest Neighbor Search
333
+
334
+ ClusterKit includes HNSW (Hierarchical Navigable Small World) for fast approximate nearest neighbor search. It is useful for recommendation systems and semantic similarity search, and as a building block for other algorithms.
335
+
336
+ Copy and paste this **entire block** into IRB to try HNSW with real embeddings:
337
+
338
+ ```ruby
339
+ require 'clusterkit'
340
+ require 'candle'
341
+
342
+ # Step 1: Initialize the embedding model
343
+ puts "Loading embedding model..."
344
+ embedding_model = Candle::EmbeddingModel.from_pretrained(
345
+ 'sentence-transformers/all-MiniLM-L6-v2',
346
+ device: Candle::Device.best
347
+ )
348
+ puts " ✓ Model loaded: #{embedding_model.model_id}"
349
+
350
+ # Step 2: Create sample documents for semantic search
351
+ documents = [
352
+ "The cat sat on the mat",
353
+ "Dogs are loyal pets that love their owners",
354
+ "Machine learning algorithms can classify text documents",
355
+ "Natural language processing helps computers understand human language",
356
+ "Ruby is a programming language known for its simplicity",
357
+ "Python is popular for data science and machine learning",
358
+ "The weather today is sunny and warm",
359
+ "Climate change affects global weather patterns",
360
+ "Artificial intelligence is transforming many industries",
361
+ "Deep learning models require large amounts of training data",
362
+ "Cats and dogs are common household pets",
363
+ "Software engineering requires problem-solving skills",
364
+ "The ocean contains many different species of fish",
365
+ "Marine biology studies life in aquatic environments",
366
+ "Cooking requires understanding of ingredients and techniques"
367
+ ]
368
+
369
+ puts "\nGenerating embeddings for #{documents.size} documents..."
370
+
371
+ # Step 3: Generate embeddings for all documents
372
+ embeddings = documents.map do |doc|
373
+ embedding_model.embedding(doc).first.to_a
374
+ end
375
+ puts " ✓ Generated embeddings: #{embeddings.first.count} dimensions each"
376
+
377
+ # Step 4: Create HNSW index
378
+ puts "\nBuilding HNSW search index..."
379
+ index = ClusterKit::HNSW.new(
380
+ dim: embeddings.first.count, # 384 dimensions for all-MiniLM-L6-v2
381
+ space: :euclidean,
382
+ m: 16, # Good balance of speed vs accuracy
383
+ ef_construction: 200, # Build quality
384
+ max_elements: documents.size,
385
+ random_seed: 42 # For reproducible results
386
+ )
387
+
388
+ # Step 5: Add all documents to the index
389
+ documents.each_with_index do |doc, i|
390
+ index.add_item(
391
+ embeddings[i],
392
+ label: "doc_#{i}",
393
+ metadata: {
394
+ 'text' => doc,
395
+ 'length' => doc.length,
396
+ 'word_count' => doc.split.size
397
+ }
398
+ )
399
+ end
400
+ puts " ✓ Added #{documents.size} documents to index"
401
+
402
+ # Step 6: Perform semantic searches
403
+ puts "\n" + "="*50
404
+ puts "SEMANTIC SEARCH DEMO"
405
+ puts "="*50
406
+
407
+ queries = [
408
+ "pets and animals",
409
+ "computer programming",
410
+ "weather and environment"
411
+ ]
412
+
413
+ queries.each do |query|
414
+ puts "\nQuery: '#{query}'"
415
+ puts "-" * 30
416
+
417
+ # Generate query embedding
418
+ query_embedding = embedding_model.embedding(query).first.to_a
419
+
420
+ # Search for similar documents
421
+ results = index.search_with_metadata(query_embedding, k: 3)
422
+
423
+ results.each_with_index do |result, i|
424
+ similarity = (1.0 - result[:distance]).round(3) # Convert distance to similarity
425
+ text = result[:metadata]['text']
426
+ puts " #{i+1}. [#{similarity}] #{text}"
427
+ end
428
+ end
429
+
430
+ # Step 7: Demonstrate advanced features
431
+ puts "\n" + "="*50
432
+ puts "ADVANCED FEATURES"
433
+ puts "="*50
434
+
435
+ # Show search quality adjustment
436
+ puts "\nAdjusting search quality (ef parameter):"
437
+ index.set_ef(50) # Lower ef = faster but potentially less accurate
438
+ fast_results = index.search(embeddings[0], k: 3)
439
+ puts " Fast search (ef=50): #{fast_results}"
440
+
441
+ index.set_ef(200) # Higher ef = slower but more accurate
442
+ accurate_results = index.search(embeddings[0], k: 3)
443
+ puts " Accurate search (ef=200): #{accurate_results}"
444
+
445
+ # Show batch operations
446
+ puts "\nBatch search example:"
447
+ query_embeddings = [embeddings[0], embeddings[5], embeddings[10]]
448
+ batch_results = query_embeddings.map { |emb| index.search(emb, k: 2) }
449
+ puts " Found #{batch_results.size} result sets"
450
+
451
+ # Save and load demonstration
452
+ puts "\nSaving and loading index:"
453
+ index.save('demo_index')
454
+ puts " ✓ Index saved to 'demo_index'"
455
+
456
+ loaded_index = ClusterKit::HNSW.load('demo_index')
457
+ test_results = loaded_index.search(embeddings[0], k: 2)
458
+ puts " ✓ Loaded index works: #{test_results}"
459
+
460
+ puts "\n✅ HNSW demo complete!"
461
+ puts "\nTry your own queries by running:"
462
+ puts "query_embedding = embedding_model.embedding('your search query').first.to_a"
463
+ puts "results = index.search_with_metadata(query_embedding, k: 5)"
464
+ ```
465
+
466
+ #### When to Use HNSW
467
+
468
+ HNSW is ideal for:
469
+ - **Recommendation Systems**: Find similar items/users quickly
470
+ - **Semantic Search**: Find documents with similar embeddings
471
+ - **Duplicate Detection**: Identify near-duplicate content
472
+ - **Clustering Support**: As a fast neighbor graph for HDBSCAN
473
+ - **Real-time Applications**: When you need sub-millisecond search times
474
+
475
+ #### Configuration Guidelines
476
+
477
+ ```ruby
478
+ # High recall (>0.95) - Best quality, slower
479
+ ClusterKit::HNSW.new(
480
+ dim: dim,
481
+ m: 32,
482
+ ef_construction: 400
483
+ ).tap { |idx| idx.set_ef(100) }
484
+
485
+ # Balanced (>0.90 recall) - Good quality, fast
486
+ ClusterKit::HNSW.new(
487
+ dim: dim,
488
+ m: 16,
489
+ ef_construction: 200
490
+ ).tap { |idx| idx.set_ef(50) }
491
+
492
+ # Speed optimized (>0.85 recall) - Fastest, acceptable quality
493
+ ClusterKit::HNSW.new(
494
+ dim: dim,
495
+ m: 8,
496
+ ef_construction: 100
497
+ ).tap { |idx| idx.set_ef(20) }
498
+ ```
499
+
500
+ #### Important Notes
501
+
502
+ 1. **Memory Usage**: HNSW keeps the entire index in memory. Estimate: `(num_items * (dim * 4 + m * 16))` bytes (a worked sketch follows these notes)
503
+ 2. **Distance Metrics**: Currently only Euclidean distance is fully supported
504
+ 3. **Loading Behavior**: Due to Rust lifetime constraints, loading an index leaks a small amount of memory (the internal loader object persists until program exit). This is negligible for most applications.
505
+ 4. **Build Time**: Index construction is O(N * log(N)). For large datasets (>1M items), consider building offline
506
+
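+ As a rough, illustrative sketch of the memory estimate from note 1 (example numbers only):
+
+ ```ruby
+ # Approximate in-memory size: num_items * (dim * 4 + m * 16) bytes
+ num_items = 100_000
+ dim       = 384    # e.g. all-MiniLM-L6-v2 embeddings
+ m         = 16
+
+ bytes = num_items * (dim * 4 + m * 16)
+ puts "~#{(bytes / 1_000_000.0).round} MB"   # => ~179 MB
+ ```
+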
507
+ #### Example: Semantic Search System
508
+
509
+ ```ruby
510
+ # Build a simple semantic search system
511
+ documents = load_documents()
512
+ embeddings = generate_embeddings(documents) # Use red-candle or similar
513
+
514
+ # Build search index
515
+ search_index = ClusterKit::HNSW.new(
516
+ dim: embeddings.first.size,
517
+ m: 16,
518
+ ef_construction: 200,
519
+ max_elements: documents.size
520
+ )
521
+
522
+ # Add all documents
523
+ documents.each_with_index do |doc, i|
524
+ search_index.add_item(
525
+ embeddings[i],
526
+ label: i,
527
+ metadata: { title: doc[:title], url: doc[:url] }
528
+ )
529
+ end
530
+
531
+ # Search function
532
+ def search(query, index, k: 10)
533
+ query_embedding = generate_embedding(query)
534
+ results = index.search_with_metadata(query_embedding, k: k)
535
+
536
+ results.map do |result|
537
+ {
538
+ title: result[:metadata]['title'],
539
+ url: result[:metadata]['url'],
540
+ similarity: 1.0 - result[:distance] # Convert distance to similarity
541
+ }
542
+ end
543
+ end
544
+
545
+ # Save for later use
546
+ search_index.save('document_index')
547
+ ```
548
+
332
549
  ### Visualization
333
550
 
334
551
  ClusterKit includes a built-in visualization tool:
@@ -350,6 +567,9 @@ This creates an interactive HTML file with:
350
567
  - Performance metrics
351
568
  - Interactive Plotly.js charts
352
569
 
570
+ <img src="/docs/assets/visualization.png" alt="rake clusterkit:visualize">
571
+
572
+
353
573
  ## Choosing the Right Algorithm
354
574
 
355
575
  ### Dimensionality Reduction
@@ -454,7 +674,7 @@ This error occurs when UMAP cannot find enough neighbors for some points. Soluti
454
674
  ```ruby
455
675
  # Bad: Pure random data with no structure
456
676
  data = Array.new(100) { Array.new(50) { rand } }
457
-
677
+
458
678
  # Good: Data with clusters or patterns (see Quick Start example)
459
679
  # Create clusters with centers and add points around them
460
680
  ```
@@ -521,4 +741,4 @@ If you use ClusterKit in your research, please cite:
521
741
 
522
742
  And please also cite the underlying libraries:
523
743
  - [annembed](https://github.com/jean-pierreBoth/annembed) for dimensionality reduction algorithms
524
- - [hdbscan](https://github.com/petabi/hdbscan) for HDBSCAN clustering
744
+ - [hdbscan](https://github.com/petabi/hdbscan) for HDBSCAN clustering
Binary files added (content not shown): data/docs/assets/clusterkit-wide.png, data/docs/assets/clusterkit.png, data/docs/assets/visualization.png
data/ext/clusterkit/src/hnsw.rs ADDED
@@ -0,0 +1,613 @@
1
+ use magnus::{
2
+ class, exception, function, method, prelude::*,
3
+ Error, Float, Integer, RArray, RHash, RString, Symbol, Value, value, TryConvert, r_hash::ForEach
4
+ };
5
+ use hnsw_rs::prelude::*;
6
+ use hnsw_rs::hnswio::HnswIo;
7
+ // use ndarray::Array1; // Not used currently
8
+ use std::collections::HashMap;
9
+ use std::sync::{Arc, Mutex};
10
+ use serde::{Serialize, Deserialize};
11
+ use std::fs::File;
12
+
13
+ // Store metadata alongside vectors
14
+ #[derive(Clone, Debug, Serialize, Deserialize)]
15
+ struct ItemMetadata {
16
+ label: String,
17
+ metadata: Option<HashMap<String, String>>,
18
+ }
19
+
20
+ // Main HNSW wrapper struct
21
+ #[magnus::wrap(class = "ClusterKit::HNSW", free_immediately, size)]
22
+ pub struct HnswIndex {
23
+ hnsw: Arc<Mutex<Hnsw<'static, f32, DistL2>>>,
24
+ dim: usize,
25
+ space: DistanceType,
26
+ metadata_store: Arc<Mutex<HashMap<usize, ItemMetadata>>>,
27
+ current_id: Arc<Mutex<usize>>,
28
+ label_to_id: Arc<Mutex<HashMap<String, usize>>>,
29
+ ef_search: Arc<Mutex<usize>>,
30
+ }
31
+
32
+ #[derive(Clone, Copy)]
33
+ #[allow(dead_code)] // These variants will be implemented in the future
34
+ enum DistanceType {
35
+ Euclidean,
36
+ Cosine,
37
+ InnerProduct,
38
+ }
39
+
40
+ impl HnswIndex {
41
+ // Initialize a new HNSW index
42
+ pub fn new(kwargs: RHash) -> Result<Self, Error> {
43
+ // Parse arguments
44
+ let dim_opt: Option<Value> = kwargs.delete(Symbol::new("dim"))?;
45
+ let dim_value = dim_opt.ok_or_else(|| Error::new(exception::arg_error(), "dim is required"))?;
46
+ let dim: usize = TryConvert::try_convert(dim_value)
47
+ .map_err(|_| Error::new(exception::arg_error(), "dim must be an integer"))?;
48
+
49
+ // Validate dimension
50
+ if dim == 0 {
51
+ return Err(Error::new(exception::arg_error(), "dim must be a positive integer (got 0)"));
52
+ }
53
+
54
+ let space: String = if let Some(v) = kwargs.delete(Symbol::new("space"))? {
55
+ // Convert Ruby symbol to string properly
56
+ if let Ok(sym) = Symbol::try_convert(v) {
57
+ sym.name()?.to_string()
58
+ } else if let Ok(s) = String::try_convert(v) {
59
+ s
60
+ } else {
61
+ return Err(Error::new(
62
+ exception::type_error(),
63
+ "space must be a string or symbol"
64
+ ));
65
+ }
66
+ } else {
67
+ "euclidean".to_string()
68
+ };
69
+
70
+ let max_elements: usize = if let Some(v) = kwargs.delete(Symbol::new("max_elements"))? {
71
+ TryConvert::try_convert(v).unwrap_or(10_000)
72
+ } else {
73
+ 10_000
74
+ };
75
+
76
+ let m: usize = if let Some(v) = kwargs.delete(Symbol::new("M"))? {
77
+ TryConvert::try_convert(v).unwrap_or(16)
78
+ } else {
79
+ 16
80
+ };
81
+
82
+ let ef_construction: usize = if let Some(v) = kwargs.delete(Symbol::new("ef_construction"))? {
83
+ TryConvert::try_convert(v).unwrap_or(200)
84
+ } else {
85
+ 200
86
+ };
87
+
88
+ let random_seed: Option<u64> = if let Some(v) = kwargs.delete(Symbol::new("random_seed"))? {
89
+ TryConvert::try_convert(v).ok()
90
+ } else {
91
+ None
92
+ };
93
+
94
+ // Validate and convert space parameter
95
+ // For now, only support Euclidean distance
96
+ let distance_type = match space.as_str() {
97
+ "euclidean" => DistanceType::Euclidean,
98
+ "cosine" => {
99
+ return Err(Error::new(
100
+ exception::runtime_error(),
101
+ "Cosine distance is not yet implemented, please use :euclidean"
102
+ ));
103
+ },
104
+ "inner_product" => {
105
+ return Err(Error::new(
106
+ exception::runtime_error(),
107
+ "Inner product distance is not yet implemented, please use :euclidean"
108
+ ));
109
+ },
110
+ _ => return Err(Error::new(
111
+ exception::arg_error(),
112
+ format!("space must be :euclidean, :cosine, or :inner_product (got: {})", space)
113
+ )),
114
+ };
115
+
116
+ // Create HNSW instance with Euclidean distance
117
+ let hnsw = if let Some(seed) = random_seed {
118
+ Hnsw::<f32, DistL2>::new_with_seed(m, max_elements, 16, ef_construction, DistL2, seed)
119
+ } else {
120
+ Hnsw::<f32, DistL2>::new(m, max_elements, 16, ef_construction, DistL2)
121
+ };
122
+
123
+ Ok(Self {
124
+ hnsw: Arc::new(Mutex::new(hnsw)),
125
+ dim,
126
+ space: distance_type,
127
+ metadata_store: Arc::new(Mutex::new(HashMap::new())),
128
+ current_id: Arc::new(Mutex::new(0)),
129
+ label_to_id: Arc::new(Mutex::new(HashMap::new())),
130
+ ef_search: Arc::new(Mutex::new(ef_construction)),
131
+ })
132
+ }
133
+
134
+ // Add a single item to the index
135
+ pub fn add_item(&self, vector: RArray, kwargs: RHash) -> Result<Value, Error> {
136
+ // Parse vector
137
+ let vec_data = parse_vector(vector, self.dim)?;
138
+
139
+ // Get or generate label
140
+ let label: String = if let Some(v) = kwargs.delete(Symbol::new("label"))? {
141
+ TryConvert::try_convert(v).unwrap_or_else(|_| {
142
+ let mut id = self.current_id.lock().unwrap();
143
+ let label = id.to_string();
144
+ *id += 1;
145
+ label
146
+ })
147
+ } else {
148
+ let mut id = self.current_id.lock().unwrap();
149
+ let label = id.to_string();
150
+ *id += 1;
151
+ label
152
+ };
153
+
154
+ // Get metadata if provided
155
+ let metadata: Option<HashMap<String, String>> = if let Some(v) = kwargs.delete(Symbol::new("metadata"))? {
156
+ Some(parse_metadata(v)?)
157
+ } else {
158
+ None
159
+ };
160
+
161
+ // Get internal ID for this item
162
+ let internal_id = {
163
+ let mut label_map = self.label_to_id.lock().unwrap();
164
+ let mut current_id = self.current_id.lock().unwrap();
165
+
166
+ if label_map.contains_key(&label) {
167
+ return Err(Error::new(
168
+ exception::arg_error(),
169
+ format!("Label '{}' already exists in index", label)
170
+ ));
171
+ }
172
+
173
+ let id = *current_id;
174
+ label_map.insert(label.clone(), id);
175
+ *current_id += 1;
176
+ id
177
+ };
178
+
179
+ // Store metadata
180
+ {
181
+ let mut metadata_store = self.metadata_store.lock().unwrap();
182
+ metadata_store.insert(internal_id, ItemMetadata {
183
+ label: label.clone(),
184
+ metadata,
185
+ });
186
+ }
187
+
188
+ // Add to HNSW
189
+ {
190
+ let hnsw = self.hnsw.lock().unwrap();
191
+ hnsw.insert((&vec_data, internal_id));
192
+ }
193
+
194
+ Ok(value::qnil().as_value())
195
+ }
196
+
197
+ // Add multiple items in batch
198
+ pub fn add_batch(&self, vectors: RArray, kwargs: RHash) -> Result<Value, Error> {
199
+ let parallel: bool = if let Some(v) = kwargs.delete(Symbol::new("parallel"))? {
200
+ TryConvert::try_convert(v).unwrap_or(true)
201
+ } else {
202
+ true
203
+ };
204
+
205
+ let labels: Option<RArray> = if let Some(v) = kwargs.delete(Symbol::new("labels"))? {
206
+ TryConvert::try_convert(v).ok()
207
+ } else {
208
+ None
209
+ };
210
+
211
+ // Parse all vectors
212
+ let mut data_points: Vec<(Vec<f32>, usize)> = Vec::new();
213
+ let mut metadata_entries: Vec<(usize, ItemMetadata)> = Vec::new();
214
+
215
+ for (i, vector) in vectors.each().enumerate() {
216
+ let vector: RArray = TryConvert::try_convert(vector?)?;
217
+ let vec_data = parse_vector(vector, self.dim)?;
218
+
219
+ // Get or generate label
220
+ let label = if let Some(ref labels_array) = labels {
221
+ labels_array.entry::<String>(i as isize)?
222
+ } else {
223
+ let mut id = self.current_id.lock().unwrap();
224
+ let label = id.to_string();
225
+ *id += 1;
226
+ label
227
+ };
228
+
229
+ // Get internal ID
230
+ let internal_id = {
231
+ let mut label_map = self.label_to_id.lock().unwrap();
232
+ let mut current_id = self.current_id.lock().unwrap();
233
+
234
+ if label_map.contains_key(&label) {
235
+ return Err(Error::new(
236
+ exception::arg_error(),
237
+ format!("Label '{}' already exists in index", label)
238
+ ));
239
+ }
240
+
241
+ let id = *current_id;
242
+ label_map.insert(label.clone(), id);
243
+ *current_id += 1;
244
+ id
245
+ };
246
+
247
+ data_points.push((vec_data, internal_id));
248
+ metadata_entries.push((internal_id, ItemMetadata {
249
+ label,
250
+ metadata: None,
251
+ }));
252
+ }
253
+
254
+ // Store metadata
255
+ {
256
+ let mut metadata_store = self.metadata_store.lock().unwrap();
257
+ for (id, metadata) in metadata_entries {
258
+ metadata_store.insert(id, metadata);
259
+ }
260
+ }
261
+
262
+ // Insert into HNSW
263
+ {
264
+ let hnsw = self.hnsw.lock().unwrap();
265
+ if parallel {
266
+ let data_refs: Vec<(&Vec<f32>, usize)> = data_points.iter().map(|(v, id)| (v, *id)).collect();
267
+ hnsw.parallel_insert(&data_refs);
268
+ } else {
269
+ for (vec, id) in data_points {
270
+ hnsw.insert((&vec, id));
271
+ }
272
+ }
273
+ }
274
+
275
+ Ok(value::qnil().as_value())
276
+ }
277
+
278
+ // Search for k nearest neighbors
279
+ pub fn search(&self, query: RArray, kwargs: RHash) -> Result<Value, Error> {
280
+ let k: usize = if let Some(v) = kwargs.delete(Symbol::new("k"))? {
281
+ TryConvert::try_convert(v).unwrap_or(10)
282
+ } else {
283
+ 10
284
+ };
285
+
286
+ let include_distances: bool = if let Some(v) = kwargs.delete(Symbol::new("include_distances"))? {
287
+ TryConvert::try_convert(v).unwrap_or(false)
288
+ } else {
289
+ false
290
+ };
291
+
292
+ // Parse query vector
293
+ let query_vec = parse_vector(query, self.dim)?;
294
+
295
+ // Set search ef if provided
296
+ if let Some(v) = kwargs.delete(Symbol::new("ef"))? {
297
+ if let Ok(ef) = TryConvert::try_convert(v) as Result<usize, _> {
298
+ let mut ef_search = self.ef_search.lock().unwrap();
299
+ *ef_search = ef;
300
+ }
301
+ }
302
+
303
+ // Perform search
304
+ let neighbors = {
305
+ let hnsw = self.hnsw.lock().unwrap();
306
+ let ef_search = self.ef_search.lock().unwrap();
307
+ hnsw.search(&query_vec, k, *ef_search)
308
+ };
309
+
310
+ // Convert results
311
+ let metadata_store = self.metadata_store.lock().unwrap();
312
+
313
+ let indices = RArray::new();
314
+ let distances = RArray::new();
315
+
316
+ for neighbor in neighbors {
317
+ if let Some(metadata) = metadata_store.get(&neighbor.d_id) {
318
+ indices.push(RString::new(&metadata.label))?;
319
+ distances.push(Float::from_f64(neighbor.distance as f64))?;
320
+ }
321
+ }
322
+
323
+ if include_distances {
324
+ let result = RArray::new();
325
+ result.push(indices)?;
326
+ result.push(distances)?;
327
+ Ok(result.as_value())
328
+ } else {
329
+ Ok(indices.as_value())
330
+ }
331
+ }
332
+
333
+ // Search with metadata included
334
+ pub fn search_with_metadata(&self, query: RArray, kwargs: RHash) -> Result<Value, Error> {
335
+ let k: usize = if let Some(v) = kwargs.delete(Symbol::new("k"))? {
336
+ TryConvert::try_convert(v).unwrap_or(10)
337
+ } else {
338
+ 10
339
+ };
340
+
341
+ // Parse query vector
342
+ let query_vec = parse_vector(query, self.dim)?;
343
+
344
+ // Perform search
345
+ let neighbors = {
346
+ let hnsw = self.hnsw.lock().unwrap();
347
+ let ef_search = self.ef_search.lock().unwrap();
348
+ hnsw.search(&query_vec, k, *ef_search)
349
+ };
350
+
351
+ // Build results with metadata
352
+ let metadata_store = self.metadata_store.lock().unwrap();
353
+ let results = RArray::new();
354
+
355
+ for neighbor in neighbors {
356
+ if let Some(item_metadata) = metadata_store.get(&neighbor.d_id) {
357
+ let result = RHash::new();
358
+ result.aset(Symbol::new("label"), RString::new(&item_metadata.label))?;
359
+ result.aset(Symbol::new("distance"), Float::from_f64(neighbor.distance as f64))?;
360
+
361
+ let meta_hash = RHash::new();
362
+ if let Some(ref meta) = item_metadata.metadata {
363
+ for (key, value) in meta {
364
+ meta_hash.aset(RString::new(key), RString::new(value))?;
365
+ }
366
+ }
367
+ result.aset(Symbol::new("metadata"), meta_hash)?;
368
+
369
+ results.push(result)?;
370
+ }
371
+ }
372
+
373
+ Ok(results.as_value())
374
+ }
375
+
376
+ // Get current size of the index
377
+ pub fn size(&self) -> Result<usize, Error> {
378
+ let metadata_store = self.metadata_store.lock().unwrap();
379
+ Ok(metadata_store.len())
380
+ }
381
+
382
+ // Check if index is empty
383
+ pub fn empty(&self) -> Result<bool, Error> {
384
+ Ok(self.size()? == 0)
385
+ }
386
+
387
+ // Set the ef parameter for search
388
+ pub fn set_ef(&self, ef: usize) -> Result<Value, Error> {
389
+ let mut ef_search = self.ef_search.lock().unwrap();
390
+ *ef_search = ef;
391
+ Ok(value::qnil().as_value())
392
+ }
393
+
394
+ // Get configuration
395
+ pub fn config(&self) -> Result<RHash, Error> {
396
+ let config = RHash::new();
397
+ config.aset(Symbol::new("dim"), Integer::from_i64(self.dim as i64))?;
398
+
399
+ let space_str = match self.space {
400
+ DistanceType::Euclidean => "euclidean",
401
+ DistanceType::Cosine => "cosine",
402
+ DistanceType::InnerProduct => "inner_product",
403
+ };
404
+ config.aset(Symbol::new("space"), RString::new(space_str))?;
405
+
406
+ let ef_search = self.ef_search.lock().unwrap();
407
+ config.aset(Symbol::new("ef"), Integer::from_i64(*ef_search as i64))?;
408
+ config.aset(Symbol::new("size"), Integer::from_i64(self.size()? as i64))?;
409
+
410
+ Ok(config)
411
+ }
412
+
413
+ // Get statistics about the index
414
+ pub fn stats(&self) -> Result<RHash, Error> {
415
+ let stats = RHash::new();
416
+
417
+ stats.aset(Symbol::new("size"), Integer::from_i64(self.size()? as i64))?;
418
+ stats.aset(Symbol::new("dim"), Integer::from_i64(self.dim as i64))?;
419
+
420
+ let ef_search = self.ef_search.lock().unwrap();
421
+ stats.aset(Symbol::new("ef_search"), Integer::from_i64(*ef_search as i64))?;
422
+
423
+ // TODO: Add more statistics from HNSW structure
424
+
425
+ Ok(stats)
426
+ }
427
+
428
+ // Load index from file (class method)
429
+ pub fn load(path: RString) -> Result<Self, Error> {
430
+ let path_str = path.to_string()?;
431
+
432
+ // Load metadata first to get dimensions and space
433
+ let metadata_path = format!("{}.metadata", path_str);
434
+ let metadata_file = File::open(&metadata_path)
435
+ .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to open metadata file: {}", e)))?;
436
+
437
+ let (
438
+ _metadata_store,
439
+ _label_to_id,
440
+ _current_id,
441
+ _dim,
442
+ _space_str,
443
+ ): (
444
+ HashMap<usize, ItemMetadata>,
445
+ HashMap<String, usize>,
446
+ usize,
447
+ usize,
448
+ String, // Changed from &str to String for deserialization
449
+ ) = bincode::deserialize_from(metadata_file)
450
+ .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to load metadata: {}", e)))?;
451
+
452
+ // Load HNSW structure
453
+ let hnsw_dir = format!("{}_hnsw_data", path_str);
454
+ let hnsw_path = std::path::Path::new(&hnsw_dir);
455
+
456
+ // Create HnswIo and leak it to get 'static lifetime
457
+ // This is a memory leak, but necessary due to hnsw_rs lifetime constraints
458
+ // The memory will never be freed until the program exits
459
+ let hnswio = Box::new(HnswIo::new(hnsw_path, "hnsw"));
460
+ let hnswio_static: &'static mut HnswIo = Box::leak(hnswio);
461
+
462
+ // Now we can load the HNSW with 'static lifetime
463
+ let hnsw: Hnsw<'static, f32, DistL2> = hnswio_static.load_hnsw()
464
+ .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to load HNSW index: {}", e)))?;
465
+
466
+ // Use the loaded metadata
467
+ let metadata_store = _metadata_store;
468
+ let label_to_id = _label_to_id;
469
+ let current_id = _current_id;
470
+ let dim = _dim;
471
+ let space = match _space_str.as_str() {
472
+ "euclidean" => DistanceType::Euclidean,
473
+ "cosine" => DistanceType::Cosine,
474
+ "inner_product" => DistanceType::InnerProduct,
475
+ _ => return Err(Error::new(exception::runtime_error(), "Unknown distance type in saved file")),
476
+ };
477
+
478
+ // Use default ef_construction as ef_search
479
+ let ef_search = 200;
480
+
481
+ Ok(Self {
482
+ hnsw: Arc::new(Mutex::new(hnsw)),
483
+ dim,
484
+ space,
485
+ metadata_store: Arc::new(Mutex::new(metadata_store)),
486
+ current_id: Arc::new(Mutex::new(current_id)),
487
+ label_to_id: Arc::new(Mutex::new(label_to_id)),
488
+ ef_search: Arc::new(Mutex::new(ef_search)),
489
+ })
490
+ }
491
+
492
+ // Save index to file
493
+ pub fn save(&self, path: RString) -> Result<Value, Error> {
494
+ let path_str = path.to_string()?;
495
+
496
+ // Create directory for HNSW structure
497
+ let hnsw_dir = format!("{}_hnsw_data", path_str);
498
+ std::fs::create_dir_all(&hnsw_dir)
499
+ .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to create directory: {}", e)))?;
500
+
501
+ // Save HNSW structure
502
+ {
503
+ let hnsw = self.hnsw.lock().unwrap();
504
+ hnsw.file_dump(&std::path::Path::new(&hnsw_dir), "hnsw")
505
+ .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to save HNSW: {}", e)))?;
506
+ }
507
+
508
+ // Save metadata
509
+ let metadata_path = format!("{}.metadata", path_str);
510
+ {
511
+ let metadata_store = self.metadata_store.lock().unwrap();
512
+ let label_to_id = self.label_to_id.lock().unwrap();
513
+ let current_id = self.current_id.lock().unwrap();
514
+
515
+ let metadata_data = (
516
+ &*metadata_store,
517
+ &*label_to_id,
518
+ *current_id,
519
+ self.dim,
520
+ match self.space {
521
+ DistanceType::Euclidean => "euclidean",
522
+ DistanceType::Cosine => "cosine",
523
+ DistanceType::InnerProduct => "inner_product",
524
+ },
525
+ );
526
+
527
+ let file = File::create(&metadata_path)
528
+ .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to create metadata file: {}", e)))?;
529
+
530
+ bincode::serialize_into(file, &metadata_data)
531
+ .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to save metadata: {}", e)))?;
532
+ }
533
+
534
+ Ok(value::qnil().as_value())
535
+ }
536
+ }
537
+
538
+ // Helper function to parse a Ruby array into a Vec<f32>
539
+ fn parse_vector(array: RArray, expected_dim: usize) -> Result<Vec<f32>, Error> {
540
+ let len = array.len();
541
+ if len != expected_dim {
542
+ return Err(Error::new(
543
+ exception::arg_error(),
544
+ format!("Vector dimension mismatch: expected {}, got {}", expected_dim, len)
545
+ ));
546
+ }
547
+
548
+ let mut vec = Vec::with_capacity(len);
549
+ for item in array.each() {
550
+ let value: f64 = TryConvert::try_convert(item?)
551
+ .map_err(|_| Error::new(exception::type_error(), "Vector elements must be numeric"))?;
552
+ vec.push(value as f32);
553
+ }
554
+
555
+ Ok(vec)
556
+ }
557
+
558
+ // Helper function to parse metadata
559
+ fn parse_metadata(value: Value) -> Result<HashMap<String, String>, Error> {
560
+ let hash: RHash = TryConvert::try_convert(value)
561
+ .map_err(|_| Error::new(exception::type_error(), "Metadata must be a hash"))?;
562
+
563
+ let mut metadata = HashMap::new();
564
+
565
+ hash.foreach(|key: Value, value: Value| {
566
+ // Handle both string and symbol keys
567
+ let key_str = if let Ok(s) = String::try_convert(key) {
568
+ s
569
+ } else if let Ok(sym) = Symbol::try_convert(key) {
570
+ sym.name()?.to_string()
571
+ } else {
572
+ return Err(Error::new(exception::type_error(), "Metadata keys must be strings or symbols"));
573
+ };
574
+
575
+ // Convert value to string (handle various Ruby types)
576
+ let value_str = if let Ok(s) = String::try_convert(value) {
577
+ s
578
+ } else if let Ok(i) = Integer::try_convert(value) {
579
+ i.to_string()
580
+ } else if let Ok(f) = Float::try_convert(value) {
581
+ f.to_f64().to_string()
582
+ } else {
583
+ // Fallback: use Ruby's to_s method
584
+ let to_s_method = value.funcall::<_, _, RString>("to_s", ())?;
585
+ to_s_method.to_string()?
586
+ };
587
+
588
+ metadata.insert(key_str, value_str);
589
+ Ok(ForEach::Continue)
590
+ })?;
591
+
592
+ Ok(metadata)
593
+ }
594
+
595
+ // Initialize the HNSW module
596
+ pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
597
+ let class = parent.define_class("HNSW", class::object())?;
598
+
599
+ class.define_singleton_method("new", function!(HnswIndex::new, 1))?;
600
+ class.define_singleton_method("load", function!(HnswIndex::load, 1))?;
601
+ class.define_method("add_item", method!(HnswIndex::add_item, 2))?;
602
+ class.define_method("add_batch", method!(HnswIndex::add_batch, 2))?;
603
+ class.define_method("search", method!(HnswIndex::search, 2))?;
604
+ class.define_method("search_with_metadata", method!(HnswIndex::search_with_metadata, 2))?;
605
+ class.define_method("size", method!(HnswIndex::size, 0))?;
606
+ class.define_method("empty?", method!(HnswIndex::empty, 0))?;
607
+ class.define_method("set_ef", method!(HnswIndex::set_ef, 1))?;
608
+ class.define_method("config", method!(HnswIndex::config, 0))?;
609
+ class.define_method("stats", method!(HnswIndex::stats, 0))?;
610
+ class.define_method("save", method!(HnswIndex::save, 1))?;
611
+
612
+ Ok(())
613
+ }
data/ext/clusterkit/src/lib.rs CHANGED
@@ -4,6 +4,7 @@ mod embedder;
4
4
  mod svd;
5
5
  mod utils;
6
6
  mod clustering;
7
+ mod hnsw;
7
8
 
8
9
  #[cfg(test)]
9
10
  mod tests;
@@ -17,6 +18,7 @@ fn init() -> Result<(), Error> {
17
18
  svd::init(&module)?;
18
19
  utils::init(&module)?;
19
20
  clustering::init(&module)?;
21
+ hnsw::init(&module)?;
20
22
 
21
23
  Ok(())
22
24
  }
data/lib/clusterkit/hnsw.rb ADDED
@@ -0,0 +1,251 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ClusterKit
4
+ # HNSW (Hierarchical Navigable Small World) index for fast approximate nearest neighbor search
5
+ #
6
+ # @example Basic usage
7
+ # index = ClusterKit::HNSW.new(dim: 128, space: :euclidean)
8
+ # index.add_batch(vectors, labels: labels)
9
+ # neighbors = index.search(query_vector, k: 10)
10
+ #
11
+ # @example With metadata
12
+ # index = ClusterKit::HNSW.new(dim: 768, space: :euclidean)  # :cosine is not yet implemented
13
+ # index.add_item(vector, label: "doc_1", metadata: { title: "Introduction", date: "2024-01-01" })
14
+ # results = index.search_with_metadata(query, k: 5)
15
+ # # => [{ label: "doc_1", distance: 0.23, metadata: { title: "...", date: "..." } }, ...]
16
+ class HNSW
17
+ # Note: The actual HNSW class is defined in Rust (ext/clusterkit/src/hnsw.rs)
18
+ # This Ruby file adds additional convenience methods and documentation.
19
+ # The Rust implementation provides these core methods:
20
+ # - new(kwargs) - constructor
21
+ # - add_item(vector, kwargs) - add single item
22
+ # - add_batch(vectors, kwargs) - add multiple items
23
+ # - search(query, kwargs) - search for neighbors
24
+ # - search_with_metadata(query, kwargs) - search with metadata
25
+ # - size() - get number of items
26
+ # - config() - get configuration
27
+ # - stats() - get statistics
28
+ # - set_ef(ef) - set search quality parameter
29
+ # - save(path) - save to file
30
+
31
+ # Initialize is actually handled by the Rust code
32
+ # This documentation is for reference
33
+ #
34
+ # @param dim [Integer] Dimension of vectors (required)
35
+ # @param space [Symbol] Distance metric: :euclidean, :cosine, or :inner_product (default: :euclidean; only :euclidean is currently implemented)
36
+ # @param max_elements [Integer] Maximum number of elements (default: 10_000)
37
+ # @param m [Integer] Number of bi-directional links (default: 16)
38
+ # @param ef_construction [Integer] Size of dynamic candidate list (default: 200)
39
+ # @param random_seed [Integer, nil] Random seed for reproducible builds (default: nil)
40
+ # @param dynamic_list [Boolean] Allow index to grow dynamically (not yet implemented)
41
+
42
+ # Fit the index with training data (alias for add_batch)
43
+ #
44
+ # @param data [Array<Array>, Numo::NArray] Training vectors
45
+ # @param labels [Array, nil] Optional labels for vectors
46
+ # @return [self]
47
+ def fit(data, labels: nil)
48
+ add_batch(data, labels: labels)
49
+ self
50
+ end
51
+
52
+ # Fit and return transformed data (for compatibility with sklearn-like interface)
53
+ #
54
+ # @param data [Array<Array>, Numo::NArray] Training vectors
55
+ # @return [self]
56
+ def fit_transform(data)
57
+ fit(data)
58
+ self
59
+ end
60
+
61
+ # Add a vector using the << operator
62
+ #
63
+ # @param vector [Array, Numo::NArray] Vector to add
64
+ # @return [self]
65
+ def <<(vector)
66
+ add_item(vector, {})
67
+ self
68
+ end
69
+
70
+ # Alias for search that always includes distances
71
+ #
72
+ # @param query [Array, Numo::NArray] Query vector
73
+ # @param k [Integer] Number of neighbors
74
+ # @param ef [Integer, nil] Search parameter (higher = better quality, slower)
75
+ # @return [Array<Array>] Array of [indices, distances]
76
+ def knn_query(query, k: 10, ef: nil)
77
+ search(query, k: k, ef: ef, include_distances: true)
78
+ end
79
+
80
+ # Batch search for multiple queries
81
+ #
82
+ # @param queries [Array<Array>, Numo::NArray] Multiple query vectors
83
+ # @param k [Integer] Number of neighbors per query
84
+ # @param parallel [Boolean] Process queries in parallel
85
+ # @return [Array<Array>] Results for each query
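+ # @example Illustrative sketch (assumes `query_vectors` holds vectors of the index's dimension)
+ #   results = index.batch_search(query_vectors, k: 5, parallel: false)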
86
+ def batch_search(queries, k: 10, parallel: true)
87
+ queries = ensure_array(queries)
88
+
89
+ if parallel && queries.size > 1
90
+ require 'parallel'
91
+ Parallel.map(queries) { |query| search(query, k: k) }
92
+ else
93
+ queries.map { |query| search(query, k: k) }
94
+ end
95
+ rescue LoadError
96
+ # Parallel gem not available, fall back to sequential
97
+ queries.map { |query| search(query, k: k) }
98
+ end
99
+
100
+ # Range search - find all points within a given radius
101
+ #
102
+ # @param query [Array, Numo::NArray] Query vector
103
+ # @param radius [Float] Search radius
104
+ # @param limit [Integer, nil] Maximum number of results
105
+ # @return [Array<Hash>] Results within radius
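+ # @example Illustrative sketch (`query_vector` is a placeholder of matching dimension)
+ #   nearby = index.range_search(query_vector, radius: 0.5, limit: 20)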
106
+ def range_search(query, radius:, limit: nil)
107
+ # Get a large number of candidates
108
+ k = limit || size
109
+ k = [k, size].min
110
+
111
+ results = search_with_metadata(query, k: k)
112
+
113
+ # Filter by radius
114
+ results.select { |r| r[:distance] <= radius }
115
+ .take(limit || results.size)
116
+ end
117
+
118
+ # Check if index is empty
119
+ # @return [Boolean]
120
+ def empty?
121
+ size == 0
122
+ end
123
+
124
+ # Clear all elements from the index
125
+ #
126
+ # @return [self]
127
+ def clear!
128
+ # Would need to recreate the index
129
+ raise NotImplementedError, "Clear not yet implemented"
130
+ end
131
+
132
+ # Check if a label exists in the index
133
+ #
134
+ # @param label [String, Integer] Label to check
135
+ # @return [Boolean]
136
+ def include?(label)
137
+ # This would need to be implemented in Rust
138
+ # For now, return false
139
+ false
140
+ end
141
+
142
+ # Get recall rate for a test set
143
+ #
144
+ # @param test_queries [Array<Array>] Query vectors
145
+ # @param ground_truth [Array<Array>] True nearest neighbors for each query
146
+ # @param k [Integer] Number of neighbors to evaluate
147
+ # @return [Float] Recall rate (0.0 to 1.0)
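+ # @example Illustrative sketch (`exact_neighbors` is a hypothetical brute-force helper you supply)
+ #   truth = test_queries.map { |q| exact_neighbors(q, k: 10) }  # arrays of true neighbor labels
+ #   index.recall(test_queries, truth, k: 10)                    # => e.g. 0.95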
148
+ def recall(test_queries, ground_truth, k: 10)
149
+ test_queries = ensure_array(test_queries)
150
+
151
+ require 'set'
152
+ total_correct = 0
153
+ total_possible = 0
154
+
155
+ test_queries.each_with_index do |query, i|
156
+ predicted = Set.new(search(query, k: k))
157
+ actual = Set.new(ground_truth[i].take(k))
158
+
159
+ total_correct += (predicted & actual).size
160
+ total_possible += [k, actual.size].min
161
+ end
162
+
163
+ total_possible > 0 ? total_correct.to_f / total_possible : 0.0
164
+ end
165
+
166
+ # Load an index from file
167
+ # Note: This uses Box::leak internally to work around hnsw_rs lifetime constraints
168
+ # This causes a small memory leak - the HnswIo struct won't be freed until program exit
169
+ #
170
+ # @param path [String] File path to load from
171
+ # @return [HNSW] New HNSW instance loaded from file
172
+ # (The actual implementation is in Rust)
173
+
174
+ # Create an index from embeddings produced by UMAP or other dimensionality reduction
175
+ #
176
+ # @param embeddings [Array<Array>, Numo::NArray] Embedding vectors
177
+ # @param kwargs [Hash] Additional options for HNSW initialization
178
+ # @return [HNSW] New HNSW instance
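+ # @example Illustrative sketch with precomputed embedding vectors
+ #   index = ClusterKit::HNSW.from_embedding(embeddings, m: 16, ef_construction: 200)
+ #   index.search(embeddings.first, k: 5)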
179
+ def self.from_embedding(embeddings, **kwargs)
180
+ embeddings = ensure_array(embeddings)
181
+
182
+ dim = embeddings.first.size
183
+ index = new(dim: dim, **kwargs)
184
+ index.fit(embeddings)
185
+ index
186
+ end
187
+
188
+ # Builder pattern for creating HNSW indices
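+ # Illustrative chained usage (a sketch mirroring the methods below; :euclidean is
+ # currently the only supported space):
+ #   index = ClusterKit::HNSW::Builder.new
+ #                                    .dimensions(384)
+ #                                    .space(:euclidean)
+ #                                    .max_elements(50_000)
+ #                                    .m_parameter(16)
+ #                                    .ef_construction(200)
+ #                                    .seed(42)
+ #                                    .build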
189
+ class Builder
190
+ def initialize
191
+ @config = {}
192
+ end
193
+
194
+ def space(type)
195
+ @config[:space] = type
196
+ self
197
+ end
198
+
199
+ def dimensions(dim)
200
+ @config[:dim] = dim
201
+ self
202
+ end
203
+
204
+ def max_elements(n)
205
+ @config[:max_elements] = n
206
+ self
207
+ end
208
+
209
+ def m_parameter(m)
210
+ @config[:m] = m
211
+ self
212
+ end
213
+
214
+ def ef_construction(ef)
215
+ @config[:ef_construction] = ef
216
+ self
217
+ end
218
+
219
+ def seed(seed)
220
+ @config[:random_seed] = seed
221
+ self
222
+ end
223
+
224
+ def build
225
+ HNSW.new(**@config)
226
+ end
227
+ end
228
+
229
+ private
230
+
231
+ # Ensure input is a proper array format
232
+ def ensure_array(data)
233
+ case data
234
+ when Array
235
+ data
236
+ else
237
+ data.respond_to?(:to_a) ? data.to_a : raise(ArgumentError, "Data must be convertible to Array")
238
+ end
239
+ end
240
+
241
+ # Class-level version of ensure_array so class methods (e.g. from_embedding) can use it
242
+ def self.ensure_array(data)
243
+ case data
244
+ when Array
245
+ data
246
+ else
247
+ data.respond_to?(:to_a) ? data.to_a : raise(ArgumentError, "Data must be convertible to Array")
248
+ end
249
+ end
250
+ end
251
+ end
data/lib/clusterkit/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ClusterKit
4
- VERSION = "0.1.0"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/clusterkit.rb CHANGED
@@ -29,6 +29,7 @@ module ClusterKit
29
29
  # Load modules that depend on the extension
30
30
  require_relative "clusterkit/dimensionality"
31
31
  require_relative "clusterkit/clustering"
32
+ require_relative "clusterkit/hnsw"
32
33
 
33
34
  # Make RustUMAP private - it's an implementation detail
34
35
  # Users should use Dimensionality::UMAP instead
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: clusterkit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Petersen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-08-21 00:00:00.000000000 Z
11
+ date: 2025-09-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: csv
@@ -136,6 +136,9 @@ files:
136
136
  - docs/UMAP_EXPLAINED.md
137
137
  - docs/UMAP_TROUBLESHOOTING.md
138
138
  - docs/VERBOSE_OUTPUT.md
139
+ - docs/assets/clusterkit-wide.png
140
+ - docs/assets/clusterkit.png
141
+ - docs/assets/visualization.png
139
142
  - examples/hdbscan_example.rb
140
143
  - examples/optimal_kmeans_example.rb
141
144
  - examples/pca_example.rb
@@ -146,6 +149,7 @@ files:
146
149
  - ext/clusterkit/src/clustering.rs
147
150
  - ext/clusterkit/src/clustering/hdbscan_wrapper.rs
148
151
  - ext/clusterkit/src/embedder.rs
152
+ - ext/clusterkit/src/hnsw.rs
149
153
  - ext/clusterkit/src/lib.rs
150
154
  - ext/clusterkit/src/svd.rs
151
155
  - ext/clusterkit/src/tests.rs
@@ -160,6 +164,7 @@ files:
160
164
  - lib/clusterkit/dimensionality/svd.rb
161
165
  - lib/clusterkit/dimensionality/umap.rb
162
166
  - lib/clusterkit/hdbscan_api_design.rb
167
+ - lib/clusterkit/hnsw.rb
163
168
  - lib/clusterkit/preprocessing.rb
164
169
  - lib/clusterkit/silence.rb
165
170
  - lib/clusterkit/utils.rb