clusterkit 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +225 -5
- data/docs/assets/clusterkit-wide.png +0 -0
- data/docs/assets/clusterkit.png +0 -0
- data/docs/assets/visualization.png +0 -0
- data/ext/clusterkit/src/hnsw.rs +613 -0
- data/ext/clusterkit/src/lib.rs +2 -0
- data/lib/clusterkit/hnsw.rb +251 -0
- data/lib/clusterkit/version.rb +1 -1
- data/lib/clusterkit.rb +1 -0
- metadata +7 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ebad40c2aac3fa3569357eedf740336a3d463ecc5c038d2771d3fd266d414b1e
+  data.tar.gz: 0ab2851e0adab583567460e469d4073e755514854e57bca464d5a835c1534bea
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 16412a2db10bf55593c778c7a02813b8db9652e39479d17fa3cd10da1e87298d5d95f4afccd8f5da10bb89bc950b314faeb9a8f0149ecd2dd3c3305121c1b5b2
+  data.tar.gz: edf73f2d0ce8f73441c07975a5cd4da31faaece9b216d908a743b9c1bd3f2d8e213d1b85a7e82f2f397c60f8d2fd2c4a0109a5a337963e008ecf069b3fa40266
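These digests can be checked locally. A minimal sketch (not part of the gem): it assumes `clusterkit-0.2.0.gem` has been fetched and unpacked in the current directory, e.g. with `gem fetch clusterkit --version 0.2.0` followed by `tar -xf clusterkit-0.2.0.gem` (a `.gem` is a plain tar archive containing `metadata.gz` and `data.tar.gz`):

```ruby
# Verification sketch (not from the gem); compare against the + lines above.
require 'digest'

%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name}  SHA256: #{Digest::SHA256.file(name).hexdigest}"
  puts "#{name}  SHA512: #{Digest::SHA512.file(name).hexdigest}"
end
```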
data/README.md
CHANGED

@@ -1,4 +1,4 @@
-
+<img src="/docs/assets/clusterkit-wide.png" alt="clusterkit" height="80px">
 
 A high-performance clustering and dimensionality reduction toolkit for Ruby, powered by best-in-class Rust implementations.
 

@@ -44,7 +44,7 @@ ClusterKit organizes its functionality into clear modules:
 
 - **`ClusterKit::Dimensionality`** - All dimensionality reduction algorithms
   - `ClusterKit::Dimensionality::UMAP` - UMAP implementation
-  - `ClusterKit::Dimensionality::PCA` - PCA implementation
+  - `ClusterKit::Dimensionality::PCA` - PCA implementation
   - `ClusterKit::Dimensionality::SVD` - SVD implementation
 - **`ClusterKit::Clustering`** - All clustering algorithms
   - `ClusterKit::Clustering::KMeans` - K-means clustering

@@ -96,7 +96,7 @@ data = []
 3.times do |cluster|
   # Each cluster has a different center, well-separated
   center = Array.new(50) { rand * 0.1 + cluster * 2.0 }
-
+
   # Add 33 points around each center with controlled noise
   33.times do
     point = center.map { |c| c + (rand - 0.5) * 0.3 }

@@ -329,6 +329,223 @@ probabilities = hdbscan.probabilities # Cluster membership probabilities
 outlier_scores = hdbscan.outlier_scores # Outlier scores for each point
 ```
 
+### HNSW - Fast Nearest Neighbor Search
+
+ClusterKit includes HNSW (Hierarchical Navigable Small World) for fast approximate nearest neighbor search, useful for building recommendation systems, similarity search, and as a building block for other algorithms.
+
+Copy and paste this **entire block** into IRB to try HNSW with real embeddings:
+
+```ruby
+require 'clusterkit'
+require 'candle'
+
+# Step 1: Initialize the embedding model
+puts "Loading embedding model..."
+embedding_model = Candle::EmbeddingModel.from_pretrained(
+  'sentence-transformers/all-MiniLM-L6-v2',
+  device: Candle::Device.best
+)
+puts "  ✓ Model loaded: #{embedding_model.model_id}"
+
+# Step 2: Create sample documents for semantic search
+documents = [
+  "The cat sat on the mat",
+  "Dogs are loyal pets that love their owners",
+  "Machine learning algorithms can classify text documents",
+  "Natural language processing helps computers understand human language",
+  "Ruby is a programming language known for its simplicity",
+  "Python is popular for data science and machine learning",
+  "The weather today is sunny and warm",
+  "Climate change affects global weather patterns",
+  "Artificial intelligence is transforming many industries",
+  "Deep learning models require large amounts of training data",
+  "Cats and dogs are common household pets",
+  "Software engineering requires problem-solving skills",
+  "The ocean contains many different species of fish",
+  "Marine biology studies life in aquatic environments",
+  "Cooking requires understanding of ingredients and techniques"
+]
+
+puts "\nGenerating embeddings for #{documents.size} documents..."
+
+# Step 3: Generate embeddings for all documents
+embeddings = documents.map do |doc|
+  embedding_model.embedding(doc).first.to_a
+end
+puts "  ✓ Generated embeddings: #{embeddings.first.count} dimensions each"
+
+# Step 4: Create HNSW index
+puts "\nBuilding HNSW search index..."
+index = ClusterKit::HNSW.new(
+  dim: embeddings.first.count,   # 384 dimensions for all-MiniLM-L6-v2
+  space: :euclidean,
+  m: 16,                         # Good balance of speed vs accuracy
+  ef_construction: 200,          # Build quality
+  max_elements: documents.size,
+  random_seed: 42                # For reproducible results
+)
+
+# Step 5: Add all documents to the index
+documents.each_with_index do |doc, i|
+  index.add_item(
+    embeddings[i],
+    label: "doc_#{i}",
+    metadata: {
+      'text' => doc,
+      'length' => doc.length,
+      'word_count' => doc.split.size
+    }
+  )
+end
+puts "  ✓ Added #{documents.size} documents to index"
+
+# Step 6: Perform semantic searches
+puts "\n" + "="*50
+puts "SEMANTIC SEARCH DEMO"
+puts "="*50
+
+queries = [
+  "pets and animals",
+  "computer programming",
+  "weather and environment"
+]
+
+queries.each do |query|
+  puts "\nQuery: '#{query}'"
+  puts "-" * 30
+
+  # Generate query embedding
+  query_embedding = embedding_model.embedding(query).first.to_a
+
+  # Search for similar documents
+  results = index.search_with_metadata(query_embedding, k: 3)
+
+  results.each_with_index do |result, i|
+    similarity = (1.0 - result[:distance]).round(3)  # Convert distance to similarity
+    text = result[:metadata]['text']
+    puts "  #{i+1}. [#{similarity}] #{text}"
+  end
+end
+
+# Step 7: Demonstrate advanced features
+puts "\n" + "="*50
+puts "ADVANCED FEATURES"
+puts "="*50
+
+# Show search quality adjustment
+puts "\nAdjusting search quality (ef parameter):"
+index.set_ef(50)   # Lower ef = faster but potentially less accurate
+fast_results = index.search(embeddings[0], k: 3)
+puts "  Fast search (ef=50): #{fast_results}"
+
+index.set_ef(200)  # Higher ef = slower but more accurate
+accurate_results = index.search(embeddings[0], k: 3)
+puts "  Accurate search (ef=200): #{accurate_results}"
+
+# Show batch operations
+puts "\nBatch search example:"
+query_embeddings = [embeddings[0], embeddings[5], embeddings[10]]
+batch_results = query_embeddings.map { |emb| index.search(emb, k: 2) }
+puts "  Found #{batch_results.size} result sets"
+
+# Save and load demonstration
+puts "\nSaving and loading index:"
+index.save('demo_index')
+puts "  ✓ Index saved to 'demo_index'"
+
+loaded_index = ClusterKit::HNSW.load('demo_index')
+test_results = loaded_index.search(embeddings[0], k: 2)
+puts "  ✓ Loaded index works: #{test_results}"
+
+puts "\n✅ HNSW demo complete!"
+puts "\nTry your own queries by running:"
+puts "query_embedding = embedding_model.embedding('your search query').first.to_a"
+puts "results = index.search_with_metadata(query_embedding, k: 5)"
+```
+
+#### When to Use HNSW
+
+HNSW is ideal for:
+- **Recommendation Systems**: Find similar items/users quickly
+- **Semantic Search**: Find documents with similar embeddings
+- **Duplicate Detection**: Identify near-duplicate content
+- **Clustering Support**: As a fast neighbor graph for HDBSCAN
+- **Real-time Applications**: When you need sub-millisecond search times
+
+#### Configuration Guidelines
+
+```ruby
+# High recall (>0.95) - Best quality, slower
+ClusterKit::HNSW.new(
+  dim: dim,
+  m: 32,
+  ef_construction: 400
+).tap { |idx| idx.set_ef(100) }
+
+# Balanced (>0.90 recall) - Good quality, fast
+ClusterKit::HNSW.new(
+  dim: dim,
+  m: 16,
+  ef_construction: 200
+).tap { |idx| idx.set_ef(50) }
+
+# Speed optimized (>0.85 recall) - Fastest, acceptable quality
+ClusterKit::HNSW.new(
+  dim: dim,
+  m: 8,
+  ef_construction: 100
+).tap { |idx| idx.set_ef(20) }
+```
+
+#### Important Notes
+
+1. **Memory Usage**: HNSW keeps the entire index in memory. Estimate: `(num_items * (dim * 4 + m * 16))` bytes
+2. **Distance Metrics**: Currently only Euclidean distance is fully supported
+3. **Loading Behavior**: Due to Rust lifetime constraints, loading an index creates a small memory leak (the index metadata persists until program exit). This is typically negligible for most applications.
+4. **Build Time**: Index construction is O(N * log(N)). For large datasets (>1M items), consider building offline
+
+#### Example: Semantic Search System
+
+```ruby
+# Build a simple semantic search system
+documents = load_documents()
+embeddings = generate_embeddings(documents)  # Use red-candle or similar
+
+# Build search index
+search_index = ClusterKit::HNSW.new(
+  dim: embeddings.first.size,
+  m: 16,
+  ef_construction: 200,
+  max_elements: documents.size
+)
+
+# Add all documents
+documents.each_with_index do |doc, i|
+  search_index.add_item(
+    embeddings[i],
+    label: i,
+    metadata: { title: doc[:title], url: doc[:url] }
+  )
+end
+
+# Search function
+def search(query, index, k: 10)
+  query_embedding = generate_embedding(query)
+  results = index.search_with_metadata(query_embedding, k: k)
+
+  results.map do |result|
+    {
+      title: result[:metadata]['title'],
+      url: result[:metadata]['url'],
+      similarity: 1.0 - result[:distance]  # Convert distance to similarity
+    }
+  end
+end
+
+# Save for later use
+search_index.save('document_index')
+```
+
 ### Visualization
 
 ClusterKit includes a built-in visualization tool:

@@ -350,6 +567,9 @@ This creates an interactive HTML file with:
 - Performance metrics
 - Interactive Plotly.js charts
 
+<img src="/docs/assets/visualization.png" alt="rake clusterkit:visualize">
+
+
 ## Choosing the Right Algorithm
 
 ### Dimensionality Reduction

@@ -454,7 +674,7 @@ This error occurs when UMAP cannot find enough neighbors for some points. Soluti
 ```ruby
 # Bad: Pure random data with no structure
 data = Array.new(100) { Array.new(50) { rand } }
-
+
 # Good: Data with clusters or patterns (see Quick Start example)
 # Create clusters with centers and add points around them
 ```

@@ -521,4 +741,4 @@ If you use ClusterKit in your research, please cite:
 
 And please also cite the underlying libraries:
 - [annembed](https://github.com/jean-pierreBoth/annembed) for dimensionality reduction algorithms
-- [hdbscan](https://github.com/petabi/hdbscan) for HDBSCAN clustering
+- [hdbscan](https://github.com/petabi/hdbscan) for HDBSCAN clustering
|
Binary file
|
Binary file
|
data/ext/clusterkit/src/hnsw.rs
ADDED

@@ -0,0 +1,613 @@
use magnus::{
    class, exception, function, method, prelude::*,
    Error, Float, Integer, RArray, RHash, RString, Symbol, Value, value, TryConvert, r_hash::ForEach
};
use hnsw_rs::prelude::*;
use hnsw_rs::hnswio::HnswIo;
// use ndarray::Array1; // Not used currently
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use serde::{Serialize, Deserialize};
use std::fs::File;

// Store metadata alongside vectors
#[derive(Clone, Debug, Serialize, Deserialize)]
struct ItemMetadata {
    label: String,
    metadata: Option<HashMap<String, String>>,
}

// Main HNSW wrapper struct
#[magnus::wrap(class = "ClusterKit::HNSW", free_immediately, size)]
pub struct HnswIndex {
    hnsw: Arc<Mutex<Hnsw<'static, f32, DistL2>>>,
    dim: usize,
    space: DistanceType,
    metadata_store: Arc<Mutex<HashMap<usize, ItemMetadata>>>,
    current_id: Arc<Mutex<usize>>,
    label_to_id: Arc<Mutex<HashMap<String, usize>>>,
    ef_search: Arc<Mutex<usize>>,
}

#[derive(Clone, Copy)]
#[allow(dead_code)] // These variants will be implemented in the future
enum DistanceType {
    Euclidean,
    Cosine,
    InnerProduct,
}

impl HnswIndex {
    // Initialize a new HNSW index
    pub fn new(kwargs: RHash) -> Result<Self, Error> {
        // Parse arguments
        let dim_opt: Option<Value> = kwargs.delete(Symbol::new("dim"))?;
        let dim_value = dim_opt.ok_or_else(|| Error::new(exception::arg_error(), "dim is required"))?;
        let dim: usize = TryConvert::try_convert(dim_value)
            .map_err(|_| Error::new(exception::arg_error(), "dim must be an integer"))?;

        // Validate dimension
        if dim == 0 {
            return Err(Error::new(exception::arg_error(), "dim must be a positive integer (got 0)"));
        }

        let space: String = if let Some(v) = kwargs.delete(Symbol::new("space"))? {
            // Convert Ruby symbol to string properly
            if let Ok(sym) = Symbol::try_convert(v) {
                sym.name()?.to_string()
            } else if let Ok(s) = String::try_convert(v) {
                s
            } else {
                return Err(Error::new(
                    exception::type_error(),
                    "space must be a string or symbol"
                ));
            }
        } else {
            "euclidean".to_string()
        };

        let max_elements: usize = if let Some(v) = kwargs.delete(Symbol::new("max_elements"))? {
            TryConvert::try_convert(v).unwrap_or(10_000)
        } else {
            10_000
        };

        let m: usize = if let Some(v) = kwargs.delete(Symbol::new("M"))? {
            TryConvert::try_convert(v).unwrap_or(16)
        } else {
            16
        };

        let ef_construction: usize = if let Some(v) = kwargs.delete(Symbol::new("ef_construction"))? {
            TryConvert::try_convert(v).unwrap_or(200)
        } else {
            200
        };

        let random_seed: Option<u64> = if let Some(v) = kwargs.delete(Symbol::new("random_seed"))? {
            TryConvert::try_convert(v).ok()
        } else {
            None
        };

        // Validate and convert space parameter
        // For now, only support Euclidean distance
        let distance_type = match space.as_str() {
            "euclidean" => DistanceType::Euclidean,
            "cosine" => {
                return Err(Error::new(
                    exception::runtime_error(),
                    "Cosine distance is not yet implemented, please use :euclidean"
                ));
            },
            "inner_product" => {
                return Err(Error::new(
                    exception::runtime_error(),
                    "Inner product distance is not yet implemented, please use :euclidean"
                ));
            },
            _ => return Err(Error::new(
                exception::arg_error(),
                format!("space must be :euclidean, :cosine, or :inner_product (got: {})", space)
            )),
        };

        // Create HNSW instance with Euclidean distance
        let hnsw = if let Some(seed) = random_seed {
            Hnsw::<f32, DistL2>::new_with_seed(m, max_elements, 16, ef_construction, DistL2, seed)
        } else {
            Hnsw::<f32, DistL2>::new(m, max_elements, 16, ef_construction, DistL2)
        };

        Ok(Self {
            hnsw: Arc::new(Mutex::new(hnsw)),
            dim,
            space: distance_type,
            metadata_store: Arc::new(Mutex::new(HashMap::new())),
            current_id: Arc::new(Mutex::new(0)),
            label_to_id: Arc::new(Mutex::new(HashMap::new())),
            ef_search: Arc::new(Mutex::new(ef_construction)),
        })
    }

    // Add a single item to the index
    pub fn add_item(&self, vector: RArray, kwargs: RHash) -> Result<Value, Error> {
        // Parse vector
        let vec_data = parse_vector(vector, self.dim)?;

        // Get or generate label
        let label: String = if let Some(v) = kwargs.delete(Symbol::new("label"))? {
            TryConvert::try_convert(v).unwrap_or_else(|_| {
                let mut id = self.current_id.lock().unwrap();
                let label = id.to_string();
                *id += 1;
                label
            })
        } else {
            let mut id = self.current_id.lock().unwrap();
            let label = id.to_string();
            *id += 1;
            label
        };

        // Get metadata if provided
        let metadata: Option<HashMap<String, String>> = if let Some(v) = kwargs.delete(Symbol::new("metadata"))? {
            Some(parse_metadata(v)?)
        } else {
            None
        };

        // Get internal ID for this item
        let internal_id = {
            let mut label_map = self.label_to_id.lock().unwrap();
            let mut current_id = self.current_id.lock().unwrap();

            if label_map.contains_key(&label) {
                return Err(Error::new(
                    exception::arg_error(),
                    format!("Label '{}' already exists in index", label)
                ));
            }

            let id = *current_id;
            label_map.insert(label.clone(), id);
            *current_id += 1;
            id
        };

        // Store metadata
        {
            let mut metadata_store = self.metadata_store.lock().unwrap();
            metadata_store.insert(internal_id, ItemMetadata {
                label: label.clone(),
                metadata,
            });
        }

        // Add to HNSW
        {
            let hnsw = self.hnsw.lock().unwrap();
            hnsw.insert((&vec_data, internal_id));
        }

        Ok(value::qnil().as_value())
    }

    // Add multiple items in batch
    pub fn add_batch(&self, vectors: RArray, kwargs: RHash) -> Result<Value, Error> {
        let parallel: bool = if let Some(v) = kwargs.delete(Symbol::new("parallel"))? {
            TryConvert::try_convert(v).unwrap_or(true)
        } else {
            true
        };

        let labels: Option<RArray> = if let Some(v) = kwargs.delete(Symbol::new("labels"))? {
            TryConvert::try_convert(v).ok()
        } else {
            None
        };

        // Parse all vectors
        let mut data_points: Vec<(Vec<f32>, usize)> = Vec::new();
        let mut metadata_entries: Vec<(usize, ItemMetadata)> = Vec::new();

        for (i, vector) in vectors.each().enumerate() {
            let vector: RArray = TryConvert::try_convert(vector?)?;
            let vec_data = parse_vector(vector, self.dim)?;

            // Get or generate label
            let label = if let Some(ref labels_array) = labels {
                labels_array.entry::<String>(i as isize)?
            } else {
                let mut id = self.current_id.lock().unwrap();
                let label = id.to_string();
                *id += 1;
                label
            };

            // Get internal ID
            let internal_id = {
                let mut label_map = self.label_to_id.lock().unwrap();
                let mut current_id = self.current_id.lock().unwrap();

                if label_map.contains_key(&label) {
                    return Err(Error::new(
                        exception::arg_error(),
                        format!("Label '{}' already exists in index", label)
                    ));
                }

                let id = *current_id;
                label_map.insert(label.clone(), id);
                *current_id += 1;
                id
            };

            data_points.push((vec_data, internal_id));
            metadata_entries.push((internal_id, ItemMetadata {
                label,
                metadata: None,
            }));
        }

        // Store metadata
        {
            let mut metadata_store = self.metadata_store.lock().unwrap();
            for (id, metadata) in metadata_entries {
                metadata_store.insert(id, metadata);
            }
        }

        // Insert into HNSW
        {
            let hnsw = self.hnsw.lock().unwrap();
            if parallel {
                let data_refs: Vec<(&Vec<f32>, usize)> = data_points.iter().map(|(v, id)| (v, *id)).collect();
                hnsw.parallel_insert(&data_refs);
            } else {
                for (vec, id) in data_points {
                    hnsw.insert((&vec, id));
                }
            }
        }

        Ok(value::qnil().as_value())
    }

    // Search for k nearest neighbors
    pub fn search(&self, query: RArray, kwargs: RHash) -> Result<Value, Error> {
        let k: usize = if let Some(v) = kwargs.delete(Symbol::new("k"))? {
            TryConvert::try_convert(v).unwrap_or(10)
        } else {
            10
        };

        let include_distances: bool = if let Some(v) = kwargs.delete(Symbol::new("include_distances"))? {
            TryConvert::try_convert(v).unwrap_or(false)
        } else {
            false
        };

        // Parse query vector
        let query_vec = parse_vector(query, self.dim)?;

        // Set search ef if provided
        if let Some(v) = kwargs.delete(Symbol::new("ef"))? {
            if let Ok(ef) = TryConvert::try_convert(v) as Result<usize, _> {
                let mut ef_search = self.ef_search.lock().unwrap();
                *ef_search = ef;
            }
        }

        // Perform search
        let neighbors = {
            let hnsw = self.hnsw.lock().unwrap();
            let ef_search = self.ef_search.lock().unwrap();
            hnsw.search(&query_vec, k, *ef_search)
        };

        // Convert results
        let metadata_store = self.metadata_store.lock().unwrap();

        let indices = RArray::new();
        let distances = RArray::new();

        for neighbor in neighbors {
            if let Some(metadata) = metadata_store.get(&neighbor.d_id) {
                indices.push(RString::new(&metadata.label))?;
                distances.push(Float::from_f64(neighbor.distance as f64))?;
            }
        }

        if include_distances {
            let result = RArray::new();
            result.push(indices)?;
            result.push(distances)?;
            Ok(result.as_value())
        } else {
            Ok(indices.as_value())
        }
    }

    // Search with metadata included
    pub fn search_with_metadata(&self, query: RArray, kwargs: RHash) -> Result<Value, Error> {
        let k: usize = if let Some(v) = kwargs.delete(Symbol::new("k"))? {
            TryConvert::try_convert(v).unwrap_or(10)
        } else {
            10
        };

        // Parse query vector
        let query_vec = parse_vector(query, self.dim)?;

        // Perform search
        let neighbors = {
            let hnsw = self.hnsw.lock().unwrap();
            let ef_search = self.ef_search.lock().unwrap();
            hnsw.search(&query_vec, k, *ef_search)
        };

        // Build results with metadata
        let metadata_store = self.metadata_store.lock().unwrap();
        let results = RArray::new();

        for neighbor in neighbors {
            if let Some(item_metadata) = metadata_store.get(&neighbor.d_id) {
                let result = RHash::new();
                result.aset(Symbol::new("label"), RString::new(&item_metadata.label))?;
                result.aset(Symbol::new("distance"), Float::from_f64(neighbor.distance as f64))?;

                let meta_hash = RHash::new();
                if let Some(ref meta) = item_metadata.metadata {
                    for (key, value) in meta {
                        meta_hash.aset(RString::new(key), RString::new(value))?;
                    }
                }
                result.aset(Symbol::new("metadata"), meta_hash)?;

                results.push(result)?;
            }
        }

        Ok(results.as_value())
    }

    // Get current size of the index
    pub fn size(&self) -> Result<usize, Error> {
        let metadata_store = self.metadata_store.lock().unwrap();
        Ok(metadata_store.len())
    }

    // Check if index is empty
    pub fn empty(&self) -> Result<bool, Error> {
        Ok(self.size()? == 0)
    }

    // Set the ef parameter for search
    pub fn set_ef(&self, ef: usize) -> Result<Value, Error> {
        let mut ef_search = self.ef_search.lock().unwrap();
        *ef_search = ef;
        Ok(value::qnil().as_value())
    }

    // Get configuration
    pub fn config(&self) -> Result<RHash, Error> {
        let config = RHash::new();
        config.aset(Symbol::new("dim"), Integer::from_i64(self.dim as i64))?;

        let space_str = match self.space {
            DistanceType::Euclidean => "euclidean",
            DistanceType::Cosine => "cosine",
            DistanceType::InnerProduct => "inner_product",
        };
        config.aset(Symbol::new("space"), RString::new(space_str))?;

        let ef_search = self.ef_search.lock().unwrap();
        config.aset(Symbol::new("ef"), Integer::from_i64(*ef_search as i64))?;
        config.aset(Symbol::new("size"), Integer::from_i64(self.size()? as i64))?;

        Ok(config)
    }

    // Get statistics about the index
    pub fn stats(&self) -> Result<RHash, Error> {
        let stats = RHash::new();

        stats.aset(Symbol::new("size"), Integer::from_i64(self.size()? as i64))?;
        stats.aset(Symbol::new("dim"), Integer::from_i64(self.dim as i64))?;

        let ef_search = self.ef_search.lock().unwrap();
        stats.aset(Symbol::new("ef_search"), Integer::from_i64(*ef_search as i64))?;

        // TODO: Add more statistics from HNSW structure

        Ok(stats)
    }

    // Load index from file (class method)
    pub fn load(path: RString) -> Result<Self, Error> {
        let path_str = path.to_string()?;

        // Load metadata first to get dimensions and space
        let metadata_path = format!("{}.metadata", path_str);
        let metadata_file = File::open(&metadata_path)
            .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to open metadata file: {}", e)))?;

        let (
            _metadata_store,
            _label_to_id,
            _current_id,
            _dim,
            _space_str,
        ): (
            HashMap<usize, ItemMetadata>,
            HashMap<String, usize>,
            usize,
            usize,
            String, // Changed from &str to String for deserialization
        ) = bincode::deserialize_from(metadata_file)
            .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to load metadata: {}", e)))?;

        // Load HNSW structure
        let hnsw_dir = format!("{}_hnsw_data", path_str);
        let hnsw_path = std::path::Path::new(&hnsw_dir);

        // Create HnswIo and leak it to get 'static lifetime
        // This is a memory leak, but necessary due to hnsw_rs lifetime constraints
        // The memory will never be freed until the program exits
        let hnswio = Box::new(HnswIo::new(hnsw_path, "hnsw"));
        let hnswio_static: &'static mut HnswIo = Box::leak(hnswio);

        // Now we can load the HNSW with 'static lifetime
        let hnsw: Hnsw<'static, f32, DistL2> = hnswio_static.load_hnsw()
            .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to load HNSW index: {}", e)))?;

        // Use the loaded metadata
        let metadata_store = _metadata_store;
        let label_to_id = _label_to_id;
        let current_id = _current_id;
        let dim = _dim;
        let space = match _space_str.as_str() {
            "euclidean" => DistanceType::Euclidean,
            "cosine" => DistanceType::Cosine,
            "inner_product" => DistanceType::InnerProduct,
            _ => return Err(Error::new(exception::runtime_error(), "Unknown distance type in saved file")),
        };

        // Use default ef_construction as ef_search
        let ef_search = 200;

        Ok(Self {
            hnsw: Arc::new(Mutex::new(hnsw)),
            dim,
            space,
            metadata_store: Arc::new(Mutex::new(metadata_store)),
            current_id: Arc::new(Mutex::new(current_id)),
            label_to_id: Arc::new(Mutex::new(label_to_id)),
            ef_search: Arc::new(Mutex::new(ef_search)),
        })
    }

    // Save index to file
    pub fn save(&self, path: RString) -> Result<Value, Error> {
        let path_str = path.to_string()?;

        // Create directory for HNSW structure
        let hnsw_dir = format!("{}_hnsw_data", path_str);
        std::fs::create_dir_all(&hnsw_dir)
            .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to create directory: {}", e)))?;

        // Save HNSW structure
        {
            let hnsw = self.hnsw.lock().unwrap();
            hnsw.file_dump(&std::path::Path::new(&hnsw_dir), "hnsw")
                .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to save HNSW: {}", e)))?;
        }

        // Save metadata
        let metadata_path = format!("{}.metadata", path_str);
        {
            let metadata_store = self.metadata_store.lock().unwrap();
            let label_to_id = self.label_to_id.lock().unwrap();
            let current_id = self.current_id.lock().unwrap();

            let metadata_data = (
                &*metadata_store,
                &*label_to_id,
                *current_id,
                self.dim,
                match self.space {
                    DistanceType::Euclidean => "euclidean",
                    DistanceType::Cosine => "cosine",
                    DistanceType::InnerProduct => "inner_product",
                },
            );

            let file = File::create(&metadata_path)
                .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to create metadata file: {}", e)))?;

            bincode::serialize_into(file, &metadata_data)
                .map_err(|e| Error::new(exception::runtime_error(), format!("Failed to save metadata: {}", e)))?;
        }

        Ok(value::qnil().as_value())
    }
}

// Helper function to parse a Ruby array into a Vec<f32>
fn parse_vector(array: RArray, expected_dim: usize) -> Result<Vec<f32>, Error> {
    let len = array.len();
    if len != expected_dim {
        return Err(Error::new(
            exception::arg_error(),
            format!("Vector dimension mismatch: expected {}, got {}", expected_dim, len)
        ));
    }

    let mut vec = Vec::with_capacity(len);
    for item in array.each() {
        let value: f64 = TryConvert::try_convert(item?)
            .map_err(|_| Error::new(exception::type_error(), "Vector elements must be numeric"))?;
        vec.push(value as f32);
    }

    Ok(vec)
}

// Helper function to parse metadata
fn parse_metadata(value: Value) -> Result<HashMap<String, String>, Error> {
    let hash: RHash = TryConvert::try_convert(value)
        .map_err(|_| Error::new(exception::type_error(), "Metadata must be a hash"))?;

    let mut metadata = HashMap::new();

    hash.foreach(|key: Value, value: Value| {
        // Handle both string and symbol keys
        let key_str = if let Ok(s) = String::try_convert(key) {
            s
        } else if let Ok(sym) = Symbol::try_convert(key) {
            sym.name()?.to_string()
        } else {
            return Err(Error::new(exception::type_error(), "Metadata keys must be strings or symbols"));
        };

        // Convert value to string (handle various Ruby types)
        let value_str = if let Ok(s) = String::try_convert(value) {
            s
        } else if let Ok(i) = Integer::try_convert(value) {
            i.to_string()
        } else if let Ok(f) = Float::try_convert(value) {
            f.to_f64().to_string()
        } else {
            // Fallback: use Ruby's to_s method
            let to_s_method = value.funcall::<_, _, RString>("to_s", ())?;
            to_s_method.to_string()?
        };

        metadata.insert(key_str, value_str);
        Ok(ForEach::Continue)
    })?;

    Ok(metadata)
}

// Initialize the HNSW module
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
    let class = parent.define_class("HNSW", class::object())?;

    class.define_singleton_method("new", function!(HnswIndex::new, 1))?;
    class.define_singleton_method("load", function!(HnswIndex::load, 1))?;
    class.define_method("add_item", method!(HnswIndex::add_item, 2))?;
    class.define_method("add_batch", method!(HnswIndex::add_batch, 2))?;
    class.define_method("search", method!(HnswIndex::search, 2))?;
    class.define_method("search_with_metadata", method!(HnswIndex::search_with_metadata, 2))?;
    class.define_method("size", method!(HnswIndex::size, 0))?;
    class.define_method("empty?", method!(HnswIndex::empty, 0))?;
    class.define_method("set_ef", method!(HnswIndex::set_ef, 1))?;
    class.define_method("config", method!(HnswIndex::config, 0))?;
    class.define_method("stats", method!(HnswIndex::stats, 0))?;
    class.define_method("save", method!(HnswIndex::save, 1))?;

    Ok(())
}
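Based on the return-value handling in `search` and `search_with_metadata` above, a hypothetical IRB sketch of the shapes a Ruby caller sees; the vectors and distance values are made up, only the shapes follow from the Rust code:

```ruby
require 'clusterkit'

index = ClusterKit::HNSW.new(dim: 3, space: :euclidean)
index.add_item([0.1, 0.2, 0.3], label: "a", metadata: { 'color' => 'red' })
index.add_item([0.9, 0.8, 0.7], label: "b")

index.search([0.1, 0.2, 0.25], k: 2)
# => ["a", "b"]                        # labels only (strings)

index.search([0.1, 0.2, 0.25], k: 2, include_distances: true)
# => [["a", "b"], [0.05, 1.33]]        # [labels, distances]; distances illustrative

index.search_with_metadata([0.1, 0.2, 0.25], k: 1)
# => [{ label: "a", distance: 0.05, metadata: { "color" => "red" } }]
```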
data/ext/clusterkit/src/lib.rs
CHANGED

@@ -4,6 +4,7 @@ mod embedder;
 mod svd;
 mod utils;
 mod clustering;
+mod hnsw;
 
 #[cfg(test)]
 mod tests;

@@ -17,6 +18,7 @@ fn init() -> Result<(), Error> {
     svd::init(&module)?;
     utils::init(&module)?;
     clustering::init(&module)?;
+    hnsw::init(&module)?;
 
     Ok(())
 }
data/lib/clusterkit/hnsw.rb
ADDED

@@ -0,0 +1,251 @@
# frozen_string_literal: true

module ClusterKit
  # HNSW (Hierarchical Navigable Small World) index for fast approximate nearest neighbor search
  #
  # @example Basic usage
  #   index = ClusterKit::HNSW.new(dim: 128, space: :euclidean)
  #   index.add_batch(vectors, labels: labels)
  #   neighbors = index.search(query_vector, k: 10)
  #
  # @example With metadata
  #   index = ClusterKit::HNSW.new(dim: 768, space: :cosine)
  #   index.add_item(vector, label: "doc_1", metadata: { title: "Introduction", date: "2024-01-01" })
  #   results = index.search_with_metadata(query, k: 5)
  #   # => [{ label: "doc_1", distance: 0.23, metadata: { title: "...", date: "..." } }, ...]
  class HNSW
    # Note: The actual HNSW class is defined in Rust (ext/clusterkit/src/hnsw.rs)
    # This Ruby file adds additional convenience methods and documentation.
    # The Rust implementation provides these core methods:
    # - new(kwargs) - constructor
    # - add_item(vector, kwargs) - add single item
    # - add_batch(vectors, kwargs) - add multiple items
    # - search(query, kwargs) - search for neighbors
    # - search_with_metadata(query, kwargs) - search with metadata
    # - size() - get number of items
    # - config() - get configuration
    # - stats() - get statistics
    # - set_ef(ef) - set search quality parameter
    # - save(path) - save to file

    # Initialize is actually handled by the Rust code
    # This documentation is for reference
    #
    # @param dim [Integer] Dimension of vectors (required)
    # @param space [Symbol] Distance metric: :euclidean, :cosine, or :inner_product (default: :euclidean)
    # @param max_elements [Integer] Maximum number of elements (default: 10_000)
    # @param m [Integer] Number of bi-directional links (default: 16)
    # @param ef_construction [Integer] Size of dynamic candidate list (default: 200)
    # @param random_seed [Integer, nil] Random seed for reproducible builds (default: nil)
    # @param dynamic_list [Boolean] Allow index to grow dynamically (not yet implemented)

    # Fit the index with training data (alias for add_batch)
    #
    # @param data [Array<Array>, Numo::NArray] Training vectors
    # @param labels [Array, nil] Optional labels for vectors
    # @return [self]
    def fit(data, labels: nil)
      add_batch(data, labels: labels)
      self
    end

    # Fit and return transformed data (for compatibility with sklearn-like interface)
    #
    # @param data [Array<Array>, Numo::NArray] Training vectors
    # @return [self]
    def fit_transform(data)
      fit(data)
      self
    end

    # Add a vector using the << operator
    #
    # @param vector [Array, Numo::NArray] Vector to add
    # @return [self]
    def <<(vector)
      add_item(vector, {})
      self
    end

    # Alias for search that always includes distances
    #
    # @param query [Array, Numo::NArray] Query vector
    # @param k [Integer] Number of neighbors
    # @param ef [Integer, nil] Search parameter (higher = better quality, slower)
    # @return [Array<Array>] Array of [indices, distances]
    def knn_query(query, k: 10, ef: nil)
      search(query, k: k, ef: ef, include_distances: true)
    end

    # Batch search for multiple queries
    #
    # @param queries [Array<Array>, Numo::NArray] Multiple query vectors
    # @param k [Integer] Number of neighbors per query
    # @param parallel [Boolean] Process queries in parallel
    # @return [Array<Array>] Results for each query
    def batch_search(queries, k: 10, parallel: true)
      queries = ensure_array(queries)

      if parallel && queries.size > 1
        require 'parallel'
        Parallel.map(queries) { |query| search(query, k: k) }
      else
        queries.map { |query| search(query, k: k) }
      end
    rescue LoadError
      # Parallel gem not available, fall back to sequential
      queries.map { |query| search(query, k: k) }
    end

    # Range search - find all points within a given radius
    #
    # @param query [Array, Numo::NArray] Query vector
    # @param radius [Float] Search radius
    # @param limit [Integer, nil] Maximum number of results
    # @return [Array<Hash>] Results within radius
    def range_search(query, radius:, limit: nil)
      # Get a large number of candidates
      k = limit || size
      k = [k, size].min

      results = search_with_metadata(query, k: k)

      # Filter by radius
      results.select { |r| r[:distance] <= radius }
             .take(limit || results.size)
    end

    # Check if index is empty
    # @return [Boolean]
    def empty?
      size == 0
    end

    # Clear all elements from the index
    #
    # @return [self]
    def clear!
      # Would need to recreate the index
      raise NotImplementedError, "Clear not yet implemented"
    end

    # Check if a label exists in the index
    #
    # @param label [String, Integer] Label to check
    # @return [Boolean]
    def include?(label)
      # This would need to be implemented in Rust
      # For now, return false
      false
    end

    # Get recall rate for a test set
    #
    # @param test_queries [Array<Array>] Query vectors
    # @param ground_truth [Array<Array>] True nearest neighbors for each query
    # @param k [Integer] Number of neighbors to evaluate
    # @return [Float] Recall rate (0.0 to 1.0)
    def recall(test_queries, ground_truth, k: 10)
      test_queries = ensure_array(test_queries)

      require 'set'
      total_correct = 0
      total_possible = 0

      test_queries.each_with_index do |query, i|
        predicted = Set.new(search(query, k: k))
        actual = Set.new(ground_truth[i].take(k))

        total_correct += (predicted & actual).size
        total_possible += [k, actual.size].min
      end

      total_possible > 0 ? total_correct.to_f / total_possible : 0.0
    end

    # Load an index from file
    # Note: This uses Box::leak internally to work around hnsw_rs lifetime constraints
    # This causes a small memory leak - the HnswIo struct won't be freed until program exit
    #
    # @param path [String] File path to load from
    # @return [HNSW] New HNSW instance loaded from file
    # (The actual implementation is in Rust)

    # Create an index from embeddings produced by UMAP or other dimensionality reduction
    #
    # @param embeddings [Array<Array>, Numo::NArray] Embedding vectors
    # @param kwargs [Hash] Additional options for HNSW initialization
    # @return [HNSW] New HNSW instance
    def self.from_embedding(embeddings, **kwargs)
      embeddings = ensure_array(embeddings)

      dim = embeddings.first.size
      index = new(dim: dim, **kwargs)
      index.fit(embeddings)
      index
    end

    # Builder pattern for creating HNSW indices
    class Builder
      def initialize
        @config = {}
      end

      def space(type)
        @config[:space] = type
        self
      end

      def dimensions(dim)
        @config[:dim] = dim
        self
      end

      def max_elements(n)
        @config[:max_elements] = n
        self
      end

      def m_parameter(m)
        @config[:m] = m
        self
      end

      def ef_construction(ef)
        @config[:ef_construction] = ef
        self
      end

      def seed(seed)
        @config[:random_seed] = seed
        self
      end

      def build
        HNSW.new(**@config)
      end
    end

    private

    # Ensure input is a proper array format
    def ensure_array(data)
      case data
      when Array
        data
      else
        data.respond_to?(:to_a) ? data.to_a : raise(ArgumentError, "Data must be convertible to Array")
      end
    end

    # Class method to make it available to class methods
    def self.ensure_array(data)
      case data
      when Array
        data
      else
        data.respond_to?(:to_a) ? data.to_a : raise(ArgumentError, "Data must be convertible to Array")
      end
    end
  end
end
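The `Builder` class above is not covered in the README; a hypothetical sketch of the fluent interface it defines (toy random vectors, parameters chosen arbitrarily):

```ruby
require 'clusterkit'

# Hypothetical usage of ClusterKit::HNSW::Builder as defined above;
# equivalent to ClusterKit::HNSW.new(dim: 384, space: :euclidean, ...).
index = ClusterKit::HNSW::Builder.new
          .dimensions(384)
          .space(:euclidean)
          .max_elements(50_000)
          .ef_construction(200)
          .seed(42)
          .build

vectors = Array.new(1_000) { Array.new(384) { rand } }    # toy data
index.fit(vectors)                                         # alias for add_batch
labels, distances = index.knn_query(vectors.first, k: 5)   # always includes distances
```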
data/lib/clusterkit/version.rb
CHANGED
data/lib/clusterkit.rb
CHANGED

@@ -29,6 +29,7 @@ module ClusterKit
   # Load modules that depend on the extension
   require_relative "clusterkit/dimensionality"
   require_relative "clusterkit/clustering"
+  require_relative "clusterkit/hnsw"
 
   # Make RustUMAP private - it's an implementation detail
   # Users should use Dimensionality::UMAP instead
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: clusterkit
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.2.0
 platform: ruby
 authors:
 - Chris Petersen
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-
+date: 2025-09-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: csv

@@ -136,6 +136,9 @@ files:
 - docs/UMAP_EXPLAINED.md
 - docs/UMAP_TROUBLESHOOTING.md
 - docs/VERBOSE_OUTPUT.md
+- docs/assets/clusterkit-wide.png
+- docs/assets/clusterkit.png
+- docs/assets/visualization.png
 - examples/hdbscan_example.rb
 - examples/optimal_kmeans_example.rb
 - examples/pca_example.rb

@@ -146,6 +149,7 @@ files:
 - ext/clusterkit/src/clustering.rs
 - ext/clusterkit/src/clustering/hdbscan_wrapper.rs
 - ext/clusterkit/src/embedder.rs
+- ext/clusterkit/src/hnsw.rs
 - ext/clusterkit/src/lib.rs
 - ext/clusterkit/src/svd.rs
 - ext/clusterkit/src/tests.rs

@@ -160,6 +164,7 @@ files:
 - lib/clusterkit/dimensionality/svd.rb
 - lib/clusterkit/dimensionality/umap.rb
 - lib/clusterkit/hdbscan_api_design.rb
+- lib/clusterkit/hnsw.rb
 - lib/clusterkit/preprocessing.rb
 - lib/clusterkit/silence.rb
 - lib/clusterkit/utils.rb