clusterkit 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +3236 -0
- data/README.md +227 -7
- data/docs/KNOWN_ISSUES.md +5 -5
- data/docs/RUST_ERROR_HANDLING.md +6 -6
- data/docs/assets/clusterkit-wide.png +0 -0
- data/docs/assets/clusterkit.png +0 -0
- data/docs/assets/visualization.png +0 -0
- data/ext/clusterkit/Cargo.toml +5 -4
- data/ext/clusterkit/extconf.rb +9 -1
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +27 -62
- data/ext/clusterkit/src/clustering.rs +68 -114
- data/ext/clusterkit/src/embedder.rs +48 -131
- data/ext/clusterkit/src/hnsw.rs +579 -0
- data/ext/clusterkit/src/lib.rs +7 -5
- data/ext/clusterkit/src/svd.rs +35 -58
- data/ext/clusterkit/src/utils.rs +159 -9
- data/lib/clusterkit/clustering/hdbscan.rb +4 -17
- data/lib/clusterkit/clustering.rb +4 -23
- data/lib/clusterkit/data_validator.rb +132 -0
- data/lib/clusterkit/dimensionality/pca.rb +12 -12
- data/lib/clusterkit/dimensionality/svd.rb +47 -16
- data/lib/clusterkit/dimensionality/umap.rb +7 -40
- data/lib/clusterkit/hnsw.rb +251 -0
- data/lib/clusterkit/version.rb +1 -1
- data/lib/clusterkit.rb +2 -1
- metadata +40 -20
- data/clusterkit.gemspec +0 -45
data/README.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
<img src="/docs/assets/clusterkit-wide.png" alt="clusterkit" height="80px">
|
|
2
2
|
|
|
3
3
|
A high-performance clustering and dimensionality reduction toolkit for Ruby, powered by best-in-class Rust implementations.
|
|
4
4
|
|
|
@@ -44,7 +44,7 @@ ClusterKit organizes its functionality into clear modules:
|
|
|
44
44
|
|
|
45
45
|
- **`ClusterKit::Dimensionality`** - All dimensionality reduction algorithms
|
|
46
46
|
- `ClusterKit::Dimensionality::UMAP` - UMAP implementation
|
|
47
|
-
- `ClusterKit::Dimensionality::PCA` - PCA implementation
|
|
47
|
+
- `ClusterKit::Dimensionality::PCA` - PCA implementation
|
|
48
48
|
- `ClusterKit::Dimensionality::SVD` - SVD implementation
|
|
49
49
|
- **`ClusterKit::Clustering`** - All clustering algorithms
|
|
50
50
|
- `ClusterKit::Clustering::KMeans` - K-means clustering
|
|
@@ -96,7 +96,7 @@ data = []
|
|
|
96
96
|
3.times do |cluster|
|
|
97
97
|
# Each cluster has a different center, well-separated
|
|
98
98
|
center = Array.new(50) { rand * 0.1 + cluster * 2.0 }
|
|
99
|
-
|
|
99
|
+
|
|
100
100
|
# Add 33 points around each center with controlled noise
|
|
101
101
|
33.times do
|
|
102
102
|
point = center.map { |c| c + (rand - 0.5) * 0.3 }
|
|
@@ -329,6 +329,223 @@ probabilities = hdbscan.probabilities # Cluster membership probabilities
|
|
|
329
329
|
outlier_scores = hdbscan.outlier_scores # Outlier scores for each point
|
|
330
330
|
```
|
|
331
331
|
|
|
332
|
+
### HNSW - Fast Nearest Neighbor Search
|
|
333
|
+
|
|
334
|
+
ClusterKit includes HNSW (Hierarchical Navigable Small World) for fast approximate nearest neighbor search. It is useful for recommendation systems and similarity search, and as a building block for other algorithms.
|
|
335
|
+
|
|
336
|
+
Copy and paste this **entire block** into IRB to try HNSW with real embeddings:
|
|
337
|
+
|
|
338
|
+
```ruby
|
|
339
|
+
require 'clusterkit'
|
|
340
|
+
require 'candle'
|
|
341
|
+
|
|
342
|
+
# Step 1: Initialize the embedding model
|
|
343
|
+
puts "Loading embedding model..."
|
|
344
|
+
embedding_model = Candle::EmbeddingModel.from_pretrained(
|
|
345
|
+
'sentence-transformers/all-MiniLM-L6-v2',
|
|
346
|
+
device: Candle::Device.best
|
|
347
|
+
)
|
|
348
|
+
puts " ✓ Model loaded: #{embedding_model.model_id}"
|
|
349
|
+
|
|
350
|
+
# Step 2: Create sample documents for semantic search
|
|
351
|
+
documents = [
|
|
352
|
+
"The cat sat on the mat",
|
|
353
|
+
"Dogs are loyal pets that love their owners",
|
|
354
|
+
"Machine learning algorithms can classify text documents",
|
|
355
|
+
"Natural language processing helps computers understand human language",
|
|
356
|
+
"Ruby is a programming language known for its simplicity",
|
|
357
|
+
"Python is popular for data science and machine learning",
|
|
358
|
+
"The weather today is sunny and warm",
|
|
359
|
+
"Climate change affects global weather patterns",
|
|
360
|
+
"Artificial intelligence is transforming many industries",
|
|
361
|
+
"Deep learning models require large amounts of training data",
|
|
362
|
+
"Cats and dogs are common household pets",
|
|
363
|
+
"Software engineering requires problem-solving skills",
|
|
364
|
+
"The ocean contains many different species of fish",
|
|
365
|
+
"Marine biology studies life in aquatic environments",
|
|
366
|
+
"Cooking requires understanding of ingredients and techniques"
|
|
367
|
+
]
|
|
368
|
+
|
|
369
|
+
puts "\nGenerating embeddings for #{documents.size} documents..."
|
|
370
|
+
|
|
371
|
+
# Step 3: Generate embeddings for all documents
|
|
372
|
+
embeddings = documents.map do |doc|
|
|
373
|
+
embedding_model.embedding(doc).first.to_a
|
|
374
|
+
end
|
|
375
|
+
puts " ✓ Generated embeddings: #{embeddings.first.count} dimensions each"
|
|
376
|
+
|
|
377
|
+
# Step 4: Create HNSW index
|
|
378
|
+
puts "\nBuilding HNSW search index..."
|
|
379
|
+
index = ClusterKit::HNSW.new(
|
|
380
|
+
dim: embeddings.first.count, # 384 dimensions for all-MiniLM-L6-v2
|
|
381
|
+
space: :euclidean,
|
|
382
|
+
m: 16, # Good balance of speed vs accuracy
|
|
383
|
+
ef_construction: 200, # Build quality
|
|
384
|
+
max_elements: documents.size,
|
|
385
|
+
random_seed: 42 # For reproducible results
|
|
386
|
+
)
|
|
387
|
+
|
|
388
|
+
# Step 5: Add all documents to the index
|
|
389
|
+
documents.each_with_index do |doc, i|
|
|
390
|
+
index.add_item(
|
|
391
|
+
embeddings[i],
|
|
392
|
+
label: "doc_#{i}",
|
|
393
|
+
metadata: {
|
|
394
|
+
'text' => doc,
|
|
395
|
+
'length' => doc.length,
|
|
396
|
+
'word_count' => doc.split.size
|
|
397
|
+
}
|
|
398
|
+
)
|
|
399
|
+
end
|
|
400
|
+
puts " ✓ Added #{documents.size} documents to index"
|
|
401
|
+
|
|
402
|
+
# Step 6: Perform semantic searches
|
|
403
|
+
puts "\n" + "="*50
|
|
404
|
+
puts "SEMANTIC SEARCH DEMO"
|
|
405
|
+
puts "="*50
|
|
406
|
+
|
|
407
|
+
queries = [
|
|
408
|
+
"pets and animals",
|
|
409
|
+
"computer programming",
|
|
410
|
+
"weather and environment"
|
|
411
|
+
]
|
|
412
|
+
|
|
413
|
+
queries.each do |query|
|
|
414
|
+
puts "\nQuery: '#{query}'"
|
|
415
|
+
puts "-" * 30
|
|
416
|
+
|
|
417
|
+
# Generate query embedding
|
|
418
|
+
query_embedding = embedding_model.embedding(query).first.to_a
|
|
419
|
+
|
|
420
|
+
# Search for similar documents
|
|
421
|
+
results = index.search_with_metadata(query_embedding, k: 3)
|
|
422
|
+
|
|
423
|
+
results.each_with_index do |result, i|
|
|
424
|
+
similarity = (1.0 - result[:distance]).round(3) # Convert distance to similarity
|
|
425
|
+
text = result[:metadata]['text']
|
|
426
|
+
puts " #{i+1}. [#{similarity}] #{text}"
|
|
427
|
+
end
|
|
428
|
+
end
|
|
429
|
+
|
|
430
|
+
# Step 7: Demonstrate advanced features
|
|
431
|
+
puts "\n" + "="*50
|
|
432
|
+
puts "ADVANCED FEATURES"
|
|
433
|
+
puts "="*50
|
|
434
|
+
|
|
435
|
+
# Show search quality adjustment
|
|
436
|
+
puts "\nAdjusting search quality (ef parameter):"
|
|
437
|
+
index.set_ef(50) # Lower ef = faster but potentially less accurate
|
|
438
|
+
fast_results = index.search(embeddings[0], k: 3)
|
|
439
|
+
puts " Fast search (ef=50): #{fast_results}"
|
|
440
|
+
|
|
441
|
+
index.set_ef(200) # Higher ef = slower but more accurate
|
|
442
|
+
accurate_results = index.search(embeddings[0], k: 3)
|
|
443
|
+
puts " Accurate search (ef=200): #{accurate_results}"
|
|
444
|
+
|
|
445
|
+
# Show batch operations
|
|
446
|
+
puts "\nBatch search example:"
|
|
447
|
+
query_embeddings = [embeddings[0], embeddings[5], embeddings[10]]
|
|
448
|
+
batch_results = query_embeddings.map { |emb| index.search(emb, k: 2) }
|
|
449
|
+
puts " Found #{batch_results.size} result sets"
|
|
450
|
+
|
|
451
|
+
# Save and load demonstration
|
|
452
|
+
puts "\nSaving and loading index:"
|
|
453
|
+
index.save('demo_index')
|
|
454
|
+
puts " ✓ Index saved to 'demo_index'"
|
|
455
|
+
|
|
456
|
+
loaded_index = ClusterKit::HNSW.load('demo_index')
|
|
457
|
+
test_results = loaded_index.search(embeddings[0], k: 2)
|
|
458
|
+
puts " ✓ Loaded index works: #{test_results}"
|
|
459
|
+
|
|
460
|
+
puts "\n✅ HNSW demo complete!"
|
|
461
|
+
puts "\nTry your own queries by running:"
|
|
462
|
+
puts "query_embedding = embedding_model.embedding('your search query').first.to_a"
|
|
463
|
+
puts "results = index.search_with_metadata(query_embedding, k: 5)"
|
|
464
|
+
```
|
|
465
|
+
|
|
466
|
+
#### When to Use HNSW
|
|
467
|
+
|
|
468
|
+
HNSW is ideal for:
|
|
469
|
+
- **Recommendation Systems**: Find similar items/users quickly
|
|
470
|
+
- **Semantic Search**: Find documents with similar embeddings
|
|
471
|
+
- **Duplicate Detection**: Identify near-duplicate content
|
|
472
|
+
- **Clustering Support**: As a fast neighbor graph for HDBSCAN
|
|
473
|
+
- **Real-time Applications**: When you need sub-millisecond search times
|
|
474
|
+
|
|
475
|
+
#### Configuration Guidelines
|
|
476
|
+
|
|
477
|
+
```ruby
|
|
478
|
+
# High recall (>0.95) - Best quality, slower
|
|
479
|
+
ClusterKit::HNSW.new(
|
|
480
|
+
dim: dim,
|
|
481
|
+
m: 32,
|
|
482
|
+
ef_construction: 400
|
|
483
|
+
).tap { |idx| idx.set_ef(100) }
|
|
484
|
+
|
|
485
|
+
# Balanced (>0.90 recall) - Good quality, fast
|
|
486
|
+
ClusterKit::HNSW.new(
|
|
487
|
+
dim: dim,
|
|
488
|
+
m: 16,
|
|
489
|
+
ef_construction: 200
|
|
490
|
+
).tap { |idx| idx.set_ef(50) }
|
|
491
|
+
|
|
492
|
+
# Speed optimized (>0.85 recall) - Fastest, acceptable quality
|
|
493
|
+
ClusterKit::HNSW.new(
|
|
494
|
+
dim: dim,
|
|
495
|
+
m: 8,
|
|
496
|
+
ef_construction: 100
|
|
497
|
+
).tap { |idx| idx.set_ef(20) }
|
|
498
|
+
```
|
|
499
|
+
|
|
500
|
+
#### Important Notes
|
|
501
|
+
|
|
502
|
+
1. **Memory Usage**: HNSW keeps the entire index in memory. Estimate: `(num_items * (dim * 4 + m * 16))` bytes
|
|
503
|
+
2. **Distance Metrics**: Currently only Euclidean distance is fully supported
|
|
504
|
+
3. **Loading Behavior**: Due to Rust lifetime constraints, loading an index leaks a small amount of memory (the index metadata persists until program exit). This is negligible for most applications.
|
|
505
|
+
4. **Build Time**: Index construction is O(N * log(N)). For large datasets (>1M items), consider building the index offline.
|
|
506
|
+
|
|
507
|
+
#### Example: Semantic Search System
|
|
508
|
+
|
|
509
|
+
```ruby
|
|
510
|
+
# Build a simple semantic search system
|
|
511
|
+
documents = load_documents()
|
|
512
|
+
embeddings = generate_embeddings(documents) # Use red-candle or similar
|
|
513
|
+
|
|
514
|
+
# Build search index
|
|
515
|
+
search_index = ClusterKit::HNSW.new(
|
|
516
|
+
dim: embeddings.first.size,
|
|
517
|
+
m: 16,
|
|
518
|
+
ef_construction: 200,
|
|
519
|
+
max_elements: documents.size
|
|
520
|
+
)
|
|
521
|
+
|
|
522
|
+
# Add all documents
|
|
523
|
+
documents.each_with_index do |doc, i|
|
|
524
|
+
search_index.add_item(
|
|
525
|
+
embeddings[i],
|
|
526
|
+
label: i,
|
|
527
|
+
metadata: { title: doc[:title], url: doc[:url] }
|
|
528
|
+
)
|
|
529
|
+
end
|
|
530
|
+
|
|
531
|
+
# Search function
|
|
532
|
+
def search(query, index, k: 10)
|
|
533
|
+
query_embedding = generate_embedding(query)
|
|
534
|
+
results = index.search_with_metadata(query_embedding, k: k)
|
|
535
|
+
|
|
536
|
+
results.map do |result|
|
|
537
|
+
{
|
|
538
|
+
title: result[:metadata]['title'],
|
|
539
|
+
url: result[:metadata]['url'],
|
|
540
|
+
similarity: 1.0 - result[:distance] # Convert distance to similarity
|
|
541
|
+
}
|
|
542
|
+
end
|
|
543
|
+
end
|
|
544
|
+
|
|
545
|
+
# Save for later use
|
|
546
|
+
search_index.save('document_index')
|
|
547
|
+
```
|
|
548
|
+
|
|
332
549
|
### Visualization
|
|
333
550
|
|
|
334
551
|
ClusterKit includes a built-in visualization tool:
|
|
@@ -350,6 +567,9 @@ This creates an interactive HTML file with:
|
|
|
350
567
|
- Performance metrics
|
|
351
568
|
- Interactive Plotly.js charts
|
|
352
569
|
|
|
570
|
+
<img src="/docs/assets/visualization.png" alt="rake clusterkit:visualize">
|
|
571
|
+
|
|
572
|
+
|
|
353
573
|
## Choosing the Right Algorithm
|
|
354
574
|
|
|
355
575
|
### Dimensionality Reduction
|
|
@@ -454,7 +674,7 @@ This error occurs when UMAP cannot find enough neighbors for some points. Soluti
|
|
|
454
674
|
```ruby
|
|
455
675
|
# Bad: Pure random data with no structure
|
|
456
676
|
data = Array.new(100) { Array.new(50) { rand } }
|
|
457
|
-
|
|
677
|
+
|
|
458
678
|
# Good: Data with clusters or patterns (see Quick Start example)
|
|
459
679
|
# Create clusters with centers and add points around them
|
|
460
680
|
```
|
|
@@ -500,7 +720,7 @@ COVERAGE=true bundle exec rspec
|
|
|
500
720
|
|
|
501
721
|
## Contributing
|
|
502
722
|
|
|
503
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
|
723
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/scientist-labs/clusterkit.
|
|
504
724
|
|
|
505
725
|
## License
|
|
506
726
|
|
|
@@ -515,10 +735,10 @@ If you use ClusterKit in your research, please cite:
|
|
|
515
735
|
author = {Chris Petersen},
|
|
516
736
|
title = {ClusterKit: High-Performance Clustering and Dimensionality Reduction for Ruby},
|
|
517
737
|
year = {2024},
|
|
518
|
-
url = {https://github.com/
|
|
738
|
+
url = {https://github.com/scientist-labs/clusterkit}
|
|
519
739
|
}
|
|
520
740
|
```
|
|
521
741
|
|
|
522
742
|
And please also cite the underlying libraries:
|
|
523
743
|
- [annembed](https://github.com/jean-pierreBoth/annembed) for dimensionality reduction algorithms
|
|
524
|
-
- [hdbscan](https://github.com/petabi/hdbscan) for HDBSCAN clustering
|
|
744
|
+
- [hdbscan](https://github.com/petabi/hdbscan) for HDBSCAN clustering
|
data/docs/KNOWN_ISSUES.md
CHANGED
|
@@ -14,7 +14,7 @@ This gem has three main categories of limitations:
|
|
|
14
14
|
|
|
15
15
|
**Reason**: UMAP needs sufficient data to construct a meaningful manifold approximation. With fewer than 10 points, the algorithm cannot create a reliable graph structure.
|
|
16
16
|
|
|
17
|
-
**Workaround**:
|
|
17
|
+
**Workaround**:
|
|
18
18
|
- Use PCA for datasets with fewer than 10 points
|
|
19
19
|
- The `transform` method can handle smaller datasets once the model is fitted on adequate training data
|
|
20
20
|
|
|
@@ -30,12 +30,12 @@ This gem has three main categories of limitations:
|
|
|
30
30
|
|
|
31
31
|
**Previous Issue**: The box_size assertion would panic and crash the Ruby process.
|
|
32
32
|
|
|
33
|
-
**Current Status**: **FIXED** in `
|
|
33
|
+
**Current Status**: **FIXED** in `scientist-labs/annembed:fix-box-size-panic` branch
|
|
34
34
|
- The `"assertion failed: (*f).abs() <= box_size"` panic has been converted to a catchable error
|
|
35
35
|
- Extreme value ranges are now handled gracefully through normalization
|
|
36
36
|
- NaN/Infinite values are detected and reported with clear error messages
|
|
37
37
|
|
|
38
|
-
**Remaining Uncatchable Errors**:
|
|
38
|
+
**Remaining Uncatchable Errors**:
|
|
39
39
|
- Array bounds violations (accessing out-of-bounds indices)
|
|
40
40
|
- Some `.unwrap()` calls on `None` or `Err` values
|
|
41
41
|
- These are much less common in normal usage
|
|
@@ -98,7 +98,7 @@ def safe_umap_transform(data, options = {})
|
|
|
98
98
|
# Save data to temporary file before processing
|
|
99
99
|
temp_file = "temp_umap_data_#{Time.now.to_i}.json"
|
|
100
100
|
File.write(temp_file, JSON.dump(data))
|
|
101
|
-
|
|
101
|
+
|
|
102
102
|
begin
|
|
103
103
|
umap = ClusterKit::Dimensionality::UMAP.new(**options)
|
|
104
104
|
result = umap.fit_transform(data)
|
|
@@ -127,4 +127,4 @@ def reduce_dimensions(data, n_components: 2)
|
|
|
127
127
|
pca.fit_transform(data)
|
|
128
128
|
end
|
|
129
129
|
end
|
|
130
|
-
```
|
|
130
|
+
```
|
data/docs/RUST_ERROR_HANDLING.md
CHANGED
|
@@ -37,11 +37,11 @@ These use Rust's `assert!` or `panic!` macros and CANNOT be caught. They will cr
|
|
|
37
37
|
|
|
38
38
|
| Error | Source | Location | Trigger Condition |
|
|
39
39
|
|-------|--------|----------|-------------------|
|
|
40
|
-
| ~~Box size assertion~~ | ~~annembed~~ | ~~`set_data_box`~~ | **FIXED in
|
|
40
|
+
| ~~Box size assertion~~ | ~~annembed~~ | ~~`set_data_box`~~ | **FIXED in scientist-labs/annembed:fix-box-size-panic** |
|
|
41
41
|
| Array bounds | Various | Index operations | Accessing out-of-bounds indices |
|
|
42
42
|
| Unwrap failures | Various | `.unwrap()` calls | Unwrapping `None` or `Err` |
|
|
43
43
|
|
|
44
|
-
**Update (2025-08-19):** The box size assertion has been fixed in the `fix-box-size-panic` branch of
|
|
44
|
+
**Update (2025-08-19):** The box size assertion has been fixed in the `fix-box-size-panic` branch of scientist-labs/annembed. It now returns a proper `Result<(), anyhow::Error>` that can be caught and handled gracefully:
|
|
45
45
|
|
|
46
46
|
```rust
|
|
47
47
|
// Previously (would panic):
|
|
@@ -96,13 +96,13 @@ when /isolated point/i
|
|
|
96
96
|
|
|
97
97
|
**Previous Issue:** Would panic and crash the Ruby process
|
|
98
98
|
|
|
99
|
-
**Current Status:** Fixed in `
|
|
100
|
-
- Now returns a catchable `anyhow::Error`
|
|
99
|
+
**Current Status:** Fixed in `scientist-labs/annembed:fix-box-size-panic` branch
|
|
100
|
+
- Now returns a catchable `anyhow::Error`
|
|
101
101
|
- Detects NaN/Infinite values during normalization
|
|
102
102
|
- Handles constant data (max_max = 0) gracefully
|
|
103
103
|
- Extreme value ranges are normalized successfully
|
|
104
104
|
|
|
105
|
-
**User-visible behavior:**
|
|
105
|
+
**User-visible behavior:**
|
|
106
106
|
- Previously: Ruby process would crash with assertion failure
|
|
107
107
|
- Now: Raises a catchable Ruby exception with helpful error message
|
|
108
108
|
|
|
@@ -161,4 +161,4 @@ when /isolated point/i
|
|
|
161
161
|
|
|
162
162
|
The test suite mocks Rust errors to verify our error handling logic works correctly. However, actual panic conditions cannot be tested without crashing the test process.
|
|
163
163
|
|
|
164
|
-
See `spec/clusterkit/error_handling_spec.rb` for error handling tests.
|
|
164
|
+
See `spec/clusterkit/error_handling_spec.rb` for error handling tests.
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
data/ext/clusterkit/Cargo.toml
CHANGED
|
@@ -7,9 +7,9 @@ edition = "2021"
|
|
|
7
7
|
crate-type = ["cdylib"]
|
|
8
8
|
|
|
9
9
|
[dependencies]
|
|
10
|
-
magnus = { version = "0.
|
|
11
|
-
annembed = { git = "https://github.com/
|
|
12
|
-
hnsw_rs = { git = "https://github.com/
|
|
10
|
+
magnus = { version = "0.8", features = ["embed"] }
|
|
11
|
+
annembed = { git = "https://github.com/scientist-labs/annembed", tag = "clusterkit-0.1.1" }
|
|
12
|
+
hnsw_rs = { git = "https://github.com/scientist-labs/hnswlib-rs", tag = "clusterkit-0.1.0" }
|
|
13
13
|
hdbscan = "0.11"
|
|
14
14
|
ndarray = "0.16"
|
|
15
15
|
num-traits = "0.2"
|
|
@@ -22,4 +22,5 @@ rand = "0.8"
|
|
|
22
22
|
default = ["openblas-static"]
|
|
23
23
|
openblas-static = ["annembed/openblas-static"]
|
|
24
24
|
openblas-system = ["annembed/openblas-system"]
|
|
25
|
-
intel-mkl-static = ["annembed/intel-mkl-static"]
|
|
25
|
+
intel-mkl-static = ["annembed/intel-mkl-static"]
|
|
26
|
+
macos-accelerate = ["annembed/macos-accelerate"]
|
data/ext/clusterkit/extconf.rb
CHANGED
|
@@ -1,4 +1,12 @@
|
|
|
1
1
|
require "mkmf"
|
|
2
2
|
require "rb_sys/mkmf"
|
|
3
3
|
|
|
4
|
-
create_rust_makefile("clusterkit/clusterkit")
|
|
4
|
+
create_rust_makefile("clusterkit/clusterkit") do |r|
|
|
5
|
+
if ENV["CLUSTERKIT_FEATURES"]
|
|
6
|
+
r.extra_cargo_args += ["--no-default-features"]
|
|
7
|
+
r.features = ENV["CLUSTERKIT_FEATURES"].split(",")
|
|
8
|
+
elsif RUBY_PLATFORM =~ /darwin/
|
|
9
|
+
r.extra_cargo_args += ["--no-default-features"]
|
|
10
|
+
r.features = ["macos-accelerate"]
|
|
11
|
+
end
|
|
12
|
+
end
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
use magnus::{function, prelude::*, Error, Value,
|
|
1
|
+
use magnus::{function, prelude::*, Error, Value, RHash, Ruby};
|
|
2
2
|
use hdbscan::{Hdbscan, HdbscanHyperParams};
|
|
3
|
+
use crate::utils::ruby_array_to_vec_vec_f64;
|
|
3
4
|
|
|
4
5
|
/// Perform HDBSCAN clustering
|
|
5
6
|
/// Returns a hash with labels and basic statistics
|
|
@@ -9,98 +10,62 @@ pub fn hdbscan_fit(
|
|
|
9
10
|
min_cluster_size: usize,
|
|
10
11
|
metric: String,
|
|
11
12
|
) -> Result<RHash, Error> {
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
magnus::exception::arg_error(),
|
|
19
|
-
"Data cannot be empty",
|
|
20
|
-
));
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
// Get dimensions
|
|
24
|
-
let first_row: RArray = rarray.entry::<RArray>(0)?;
|
|
25
|
-
let n_features = first_row.len();
|
|
26
|
-
|
|
27
|
-
// Convert to Vec<Vec<f64>> format expected by hdbscan crate
|
|
28
|
-
let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
|
|
29
|
-
for i in 0..n_samples {
|
|
30
|
-
let row: RArray = rarray.entry(i as isize)?;
|
|
31
|
-
let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
|
|
32
|
-
for j in 0..n_features {
|
|
33
|
-
let val: f64 = row.entry(j as isize)?;
|
|
34
|
-
row_vec.push(val);
|
|
35
|
-
}
|
|
36
|
-
data_vec.push(row_vec);
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
// Note: hdbscan crate doesn't support custom metrics directly
|
|
40
|
-
// We'll use the default Euclidean distance for now
|
|
13
|
+
let ruby = Ruby::get().unwrap();
|
|
14
|
+
|
|
15
|
+
// Convert Ruby array to Vec<Vec<f64>> using shared helper
|
|
16
|
+
let data_vec = ruby_array_to_vec_vec_f64(data)?;
|
|
17
|
+
let n_samples = data_vec.len();
|
|
18
|
+
|
|
41
19
|
if metric != "euclidean" && metric != "l2" {
|
|
42
20
|
eprintln!("Warning: Current hdbscan version only supports Euclidean distance. Using Euclidean.");
|
|
43
21
|
}
|
|
44
|
-
|
|
22
|
+
|
|
45
23
|
// Adjust parameters to avoid index out of bounds errors
|
|
46
|
-
// The hdbscan crate has issues when min_samples >= n_samples
|
|
47
24
|
let adjusted_min_samples = min_samples.min(n_samples.saturating_sub(1)).max(1);
|
|
48
25
|
let adjusted_min_cluster_size = min_cluster_size.min(n_samples).max(2);
|
|
49
|
-
|
|
26
|
+
|
|
50
27
|
// Create hyperparameters
|
|
51
28
|
let hyper_params = HdbscanHyperParams::builder()
|
|
52
29
|
.min_cluster_size(adjusted_min_cluster_size)
|
|
53
30
|
.min_samples(adjusted_min_samples)
|
|
54
31
|
.build();
|
|
55
|
-
|
|
32
|
+
|
|
56
33
|
// Create HDBSCAN instance and run clustering
|
|
57
34
|
let clusterer = Hdbscan::new(&data_vec, hyper_params);
|
|
58
|
-
|
|
59
|
-
// Run the clustering algorithm - cluster() returns Result<Vec<i32>, HdbscanError>
|
|
35
|
+
|
|
60
36
|
let labels = clusterer.cluster().map_err(|e| {
|
|
61
37
|
Error::new(
|
|
62
|
-
|
|
38
|
+
ruby.exception_runtime_error(),
|
|
63
39
|
format!("HDBSCAN clustering failed: {:?}", e)
|
|
64
40
|
)
|
|
65
41
|
})?;
|
|
66
|
-
|
|
42
|
+
|
|
67
43
|
// Convert results to Ruby types
|
|
68
|
-
let
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
// Convert labels (i32 to Ruby Integer, -1 for noise)
|
|
72
|
-
let labels_array = RArray::new();
|
|
44
|
+
let result = ruby.hash_new();
|
|
45
|
+
|
|
46
|
+
let labels_array = ruby.ary_new();
|
|
73
47
|
for &label in labels.iter() {
|
|
74
|
-
labels_array.push(
|
|
75
|
-
ruby.eval(&format!("{}", label)).unwrap()
|
|
76
|
-
).unwrap())?;
|
|
48
|
+
labels_array.push(ruby.integer_from_i64(label as i64))?;
|
|
77
49
|
}
|
|
78
50
|
result.aset("labels", labels_array)?;
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
// since the basic hdbscan crate doesn't provide these
|
|
82
|
-
// In the future, we could calculate these ourselves or use a more advanced implementation
|
|
83
|
-
|
|
84
|
-
// Create probabilities array (all 1.0 for clustered points, 0.0 for noise)
|
|
85
|
-
let probs_array = RArray::new();
|
|
51
|
+
|
|
52
|
+
let probs_array = ruby.ary_new();
|
|
86
53
|
for &label in labels.iter() {
|
|
87
54
|
let prob = if label == -1 { 0.0 } else { 1.0 };
|
|
88
55
|
probs_array.push(prob)?;
|
|
89
56
|
}
|
|
90
57
|
result.aset("probabilities", probs_array)?;
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
let outlier_array = RArray::new();
|
|
58
|
+
|
|
59
|
+
let outlier_array = ruby.ary_new();
|
|
94
60
|
for &label in labels.iter() {
|
|
95
61
|
let score = if label == -1 { 1.0 } else { 0.0 };
|
|
96
62
|
outlier_array.push(score)?;
|
|
97
63
|
}
|
|
98
64
|
result.aset("outlier_scores", outlier_array)?;
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
let persistence_hash = RHash::new();
|
|
65
|
+
|
|
66
|
+
let persistence_hash = ruby.hash_new();
|
|
102
67
|
result.aset("cluster_persistence", persistence_hash)?;
|
|
103
|
-
|
|
68
|
+
|
|
104
69
|
Ok(result)
|
|
105
70
|
}
|
|
106
71
|
|
|
@@ -110,6 +75,6 @@ pub fn init(clustering_module: &magnus::RModule) -> Result<(), Error> {
|
|
|
110
75
|
"hdbscan_rust",
|
|
111
76
|
function!(hdbscan_fit, 4),
|
|
112
77
|
)?;
|
|
113
|
-
|
|
78
|
+
|
|
114
79
|
Ok(())
|
|
115
|
-
}
|
|
80
|
+
}
|