clusterkit 0.3.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.simplecov +47 -0
- data/CHANGELOG.md +35 -0
- data/CLAUDE.md +226 -0
- data/Cargo.lock +3228 -0
- data/Cargo.toml +8 -0
- data/Gemfile +17 -0
- data/IMPLEMENTATION_NOTES.md +143 -0
- data/LICENSE.txt +21 -0
- data/PYTHON_COMPARISON.md +183 -0
- data/README.md +744 -0
- data/Rakefile +259 -0
- data/docs/KNOWN_ISSUES.md +130 -0
- data/docs/RUST_ERROR_HANDLING.md +164 -0
- data/docs/TEST_FIXTURES.md +170 -0
- data/docs/UMAP_EXPLAINED.md +362 -0
- data/docs/UMAP_TROUBLESHOOTING.md +284 -0
- data/docs/VERBOSE_OUTPUT.md +84 -0
- data/docs/assets/clusterkit-wide.png +0 -0
- data/docs/assets/clusterkit.png +0 -0
- data/docs/assets/visualization.png +0 -0
- data/examples/hdbscan_example.rb +147 -0
- data/examples/optimal_kmeans_example.rb +96 -0
- data/examples/pca_example.rb +114 -0
- data/examples/reproducible_umap.rb +99 -0
- data/examples/verbose_control.rb +43 -0
- data/ext/clusterkit/Cargo.toml +26 -0
- data/ext/clusterkit/extconf.rb +23 -0
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +80 -0
- data/ext/clusterkit/src/clustering.rs +221 -0
- data/ext/clusterkit/src/embedder.rs +349 -0
- data/ext/clusterkit/src/hnsw.rs +579 -0
- data/ext/clusterkit/src/lib.rs +24 -0
- data/ext/clusterkit/src/svd.rs +89 -0
- data/ext/clusterkit/src/tests.rs +16 -0
- data/ext/clusterkit/src/utils.rs +183 -0
- data/lib/clusterkit/3.1/clusterkit.so +0 -0
- data/lib/clusterkit/3.2/clusterkit.so +0 -0
- data/lib/clusterkit/3.3/clusterkit.so +0 -0
- data/lib/clusterkit/3.4/clusterkit.so +0 -0
- data/lib/clusterkit/clustering/hdbscan.rb +164 -0
- data/lib/clusterkit/clustering.rb +194 -0
- data/lib/clusterkit/clusterkit.rb +14 -0
- data/lib/clusterkit/configuration.rb +24 -0
- data/lib/clusterkit/data_validator.rb +132 -0
- data/lib/clusterkit/dimensionality/pca.rb +251 -0
- data/lib/clusterkit/dimensionality/svd.rb +175 -0
- data/lib/clusterkit/dimensionality/umap.rb +282 -0
- data/lib/clusterkit/dimensionality.rb +29 -0
- data/lib/clusterkit/hdbscan_api_design.rb +142 -0
- data/lib/clusterkit/hnsw.rb +251 -0
- data/lib/clusterkit/preprocessing.rb +106 -0
- data/lib/clusterkit/silence.rb +42 -0
- data/lib/clusterkit/utils.rb +51 -0
- data/lib/clusterkit/version.rb +5 -0
- data/lib/clusterkit.rb +105 -0
- data/lib/tasks/visualize.rake +641 -0
- metadata +220 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: b07d5fa4fef10a91504c99bbd4916658d0962e6f5e23d7182b0d772209351ad8
|
|
4
|
+
data.tar.gz: 5bebe7bc4ccf98182b9cdd25f9cc2ca042ae73b594c4ef76928640388408f2f5
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 1beb6f4efb0181221756e7a8229f9013aa431c96e2c4ff166f501c48d36d442b963993fafdc3c610eba00c51524166644bb70a40c1ce1f8e05f6d815387b5d34
|
|
7
|
+
data.tar.gz: f55dfd8181eb12095fca46ba763109f7c483ba22f05aad3ec8fd2ee7ff794716677608b0bb782b5dabf91a292ea9b6ff85868a9df620701ffb574f9803eecf3a
|
data/.rspec
ADDED
data/.simplecov
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
SimpleCov.configure do
|
|
4
|
+
# Add custom groups
|
|
5
|
+
add_group 'Core', 'lib/annembed/embedder'
|
|
6
|
+
add_group 'UMAP', 'lib/annembed/umap'
|
|
7
|
+
add_group 'Utils', 'lib/annembed/utils'
|
|
8
|
+
add_group 'Configuration', 'lib/annembed/config'
|
|
9
|
+
|
|
10
|
+
# Track branches as well as lines
|
|
11
|
+
enable_coverage :branch
|
|
12
|
+
|
|
13
|
+
# Set thresholds (temporarily disabled to diagnose issues)
|
|
14
|
+
# minimum_coverage line: 50, branch: 40
|
|
15
|
+
|
|
16
|
+
# Don't fail the run on every coverage drop (left disabled during development)
|
|
17
|
+
# refuse_coverage_drop
|
|
18
|
+
|
|
19
|
+
# Maximum coverage drop allowed
|
|
20
|
+
maximum_coverage_drop 5
|
|
21
|
+
|
|
22
|
+
# Configure output directory
|
|
23
|
+
coverage_dir 'coverage'
|
|
24
|
+
|
|
25
|
+
# Include every library file in the report, even files the tests never load
|
|
26
|
+
track_files 'lib/**/*.rb'
|
|
27
|
+
|
|
28
|
+
# Custom filters
|
|
29
|
+
add_filter do |source_file|
|
|
30
|
+
# Skip version file
|
|
31
|
+
source_file.filename.include?('version.rb')
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
# Render the coverage report with the HTML formatter
|
|
35
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new([
|
|
36
|
+
SimpleCov::Formatter::HTMLFormatter,
|
|
37
|
+
])
|
|
38
|
+
|
|
39
|
+
# Name this test suite's result set (used to tell runs apart when merging)
|
|
40
|
+
command_name 'RSpec'
|
|
41
|
+
|
|
42
|
+
# Merge results from multiple test runs
|
|
43
|
+
use_merging true
|
|
44
|
+
|
|
45
|
+
# Set result cache timeout (in seconds)
|
|
46
|
+
merge_timeout 3600
|
|
47
|
+
end
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- Clean, scikit-learn-like API for UMAP
|
|
12
|
+
- `fit(data)` - Train the model
|
|
13
|
+
- `transform(data)` - Transform new data
|
|
14
|
+
- `fit_transform(data)` - Train and transform in one step
|
|
15
|
+
- `fitted?` - Check if model is trained
|
|
16
|
+
- `save(path)` - Save trained model
|
|
17
|
+
- `load(path)` - Load trained model
|
|
18
|
+
- Model persistence with save/load functionality
|
|
19
|
+
- Data export/import utilities for caching results
|
|
20
|
+
- Comprehensive test suite for UMAP interface
|
|
21
|
+
- Detailed README with practical examples
|
|
22
|
+
|
|
23
|
+
### Changed
|
|
24
|
+
- Complete API redesign to follow ML library conventions
|
|
25
|
+
- Removed confusing `save_embeddings`/`load_embeddings` methods
|
|
26
|
+
- Separated model operations from data caching concerns
|
|
27
|
+
|
|
28
|
+
### Fixed
|
|
29
|
+
- Intermittent test failures with boundary assertions
|
|
30
|
+
- Data normalization issues with extreme values
|
|
31
|
+
|
|
32
|
+
## [0.1.0] - TBD
|
|
33
|
+
|
|
34
|
+
### Added
|
|
35
|
+
- Initial release with basic embedding functionality
|
data/CLAUDE.md
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# CLAUDE.md - clusterkit Project Guide
|
|
2
|
+
|
|
3
|
+
## Project Vision
|
|
4
|
+
clusterkit brings high-performance dimensionality reduction and embedding algorithms to Ruby by wrapping the annembed Rust crate. This gem is part of the ruby-nlp ecosystem, which aims to provide Ruby developers with native machine learning and NLP capabilities through best-of-breed Rust implementations.
|
|
5
|
+
|
|
6
|
+
## Core Principles
|
|
7
|
+
|
|
8
|
+
### 1. Ruby-First Design
|
|
9
|
+
- Provide an idiomatic Ruby API that feels natural to Ruby developers
|
|
10
|
+
- Follow Ruby naming conventions (snake_case methods, proper use of symbols)
|
|
11
|
+
- Support Ruby's duck typing while maintaining type safety at the Rust boundary
|
|
12
|
+
- Integrate seamlessly with Ruby's data science ecosystem
|
|
13
|
+
|
|
14
|
+
### 2. Performance Without Compromise
|
|
15
|
+
- Leverage Rust's performance for compute-intensive operations
|
|
16
|
+
- Use Magnus for zero-copy data transfer where possible
|
|
17
|
+
- Enable parallelization by default
|
|
18
|
+
- Provide progress feedback for long-running operations
|
|
19
|
+
|
|
20
|
+
### 3. Ecosystem Integration
|
|
21
|
+
- Primary support for Numo::NArray (the NumPy of Ruby)
|
|
22
|
+
- Work well with other ruby-nlp gems (lancelot, red-candle)
|
|
23
|
+
- Support common Ruby data formats and visualization tools
|
|
24
|
+
- Play nice with Jupyter notebooks (iruby)
|
|
25
|
+
|
|
26
|
+
## Technical Guidelines
|
|
27
|
+
|
|
28
|
+
### Magnus Best Practices
|
|
29
|
+
|
|
30
|
+
1. **Memory Management**
|
|
31
|
+
```rust
|
|
32
|
+
// Good: Let Magnus handle Ruby object lifecycle
|
|
33
|
+
let array: RArray = data.try_convert()?;
|
|
34
|
+
|
|
35
|
+
// Avoid: Manual memory management
|
|
36
|
+
// Don't try to manually free Ruby objects
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
2. **Error Handling**
|
|
40
|
+
```rust
|
|
41
|
+
// Always wrap errors properly
|
|
42
|
+
use magnus::Error;
|
|
43
|
+
|
|
44
|
+
fn risky_operation() -> Result<RArray, Error> {
|
|
45
|
+
annembed_call()
|
|
46
|
+
.map_err(|e| Error::new(exception::runtime_error(), e.to_string()))
|
|
47
|
+
}
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
3. **Type Conversions**
|
|
51
|
+
```rust
|
|
52
|
+
// Define clear conversion traits
|
|
53
|
+
impl TryFrom<Value> for EmbedConfig {
|
|
54
|
+
type Error = Error;
|
|
55
|
+
// Robust conversion with good error messages
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Ruby API Design
|
|
60
|
+
|
|
61
|
+
1. **Method Naming**
|
|
62
|
+
- Use Ruby conventions: `fit_transform`, not `fitTransform`
|
|
63
|
+
- Predicates end with `?`: `converged?`, `fitted?`
|
|
64
|
+
- Dangerous methods end with `!`: `normalize!`
|
|
65
|
+
|
|
66
|
+
2. **Parameter Handling**
|
|
67
|
+
```ruby
|
|
68
|
+
# Good: Use keyword arguments with defaults
|
|
69
|
+
def initialize(method: :umap, n_components: 2, **options)
|
|
70
|
+
|
|
71
|
+
# Avoid: Positional arguments for configuration
|
|
72
|
+
def initialize(method, n_components, min_dist, spread, ...)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
3. **Return Values**
|
|
76
|
+
- Return Ruby arrays for small results
|
|
77
|
+
- Return Numo::NArray for large matrices
|
|
78
|
+
- Support multiple return formats via options
|
|
79
|
+
|
|
80
|
+
### Performance Considerations
|
|
81
|
+
|
|
82
|
+
1. **Data Transfer**
|
|
83
|
+
- Minimize copying between Ruby and Rust
|
|
84
|
+
- Use view/slice operations when possible
|
|
85
|
+
- Support streaming for large datasets
|
|
86
|
+
|
|
87
|
+
2. **Threading**
|
|
88
|
+
- Respect Ruby's GVL (Global VM Lock)
|
|
89
|
+
- Release GVL for long-running Rust operations
|
|
90
|
+
- Use Rust's parallelization, not Ruby threads
|
|
91
|
+
|
|
92
|
+
3. **Memory Usage**
|
|
93
|
+
- Provide memory estimates for large operations
|
|
94
|
+
- Support out-of-core processing for huge datasets
|
|
95
|
+
- Clear progress indication for long operations
|
|
96
|
+
|
|
97
|
+
## Code Style Guidelines
|
|
98
|
+
|
|
99
|
+
### Rust Side
|
|
100
|
+
- Follow Rust standard style (rustfmt)
|
|
101
|
+
- Comprehensive error types with context
|
|
102
|
+
- Document all public functions
|
|
103
|
+
- Use type aliases for clarity
|
|
104
|
+
|
|
105
|
+
### Ruby Side
|
|
106
|
+
- Follow Ruby Style Guide
|
|
107
|
+
- Use YARD documentation format
|
|
108
|
+
- Provide type signatures where helpful
|
|
109
|
+
- Include usage examples in docs
|
|
110
|
+
|
|
111
|
+
## Testing Philosophy
|
|
112
|
+
|
|
113
|
+
1. **Comprehensive Coverage**
|
|
114
|
+
- Unit tests for all public methods
|
|
115
|
+
- Integration tests with real datasets
|
|
116
|
+
- Performance benchmarks
|
|
117
|
+
- Memory leak tests
|
|
118
|
+
|
|
119
|
+
2. **Test Data**
|
|
120
|
+
- Use standard ML datasets (Iris, MNIST samples)
|
|
121
|
+
- Generate synthetic data for edge cases
|
|
122
|
+
- Test with various Ruby object types
|
|
123
|
+
|
|
124
|
+
3. **Platform Testing**
|
|
125
|
+
- Test on multiple Ruby versions
|
|
126
|
+
- Test on different operating systems
|
|
127
|
+
- Verify precompiled gem distribution
|
|
128
|
+
|
|
129
|
+
## Documentation Standards
|
|
130
|
+
|
|
131
|
+
1. **README**
|
|
132
|
+
- Clear installation instructions
|
|
133
|
+
- Quick start example that works
|
|
134
|
+
- Link to full documentation
|
|
135
|
+
- Performance comparisons
|
|
136
|
+
|
|
137
|
+
2. **API Documentation**
|
|
138
|
+
- Every public method documented
|
|
139
|
+
- Parameter types and ranges specified
|
|
140
|
+
- Return values clearly described
|
|
141
|
+
- Usage examples for complex methods
|
|
142
|
+
|
|
143
|
+
3. **Tutorials**
|
|
144
|
+
- Jupyter notebook examples
|
|
145
|
+
- Common use case walkthroughs
|
|
146
|
+
- Integration examples with other gems
|
|
147
|
+
|
|
148
|
+
## Common Patterns
|
|
149
|
+
|
|
150
|
+
### Configuration Objects
|
|
151
|
+
```ruby
|
|
152
|
+
# Prefer configuration objects over many parameters
|
|
153
|
+
config = Annembed::Config.new(
|
|
154
|
+
method: :umap,
|
|
155
|
+
n_neighbors: 15,
|
|
156
|
+
min_dist: 0.1
|
|
157
|
+
)
|
|
158
|
+
embedder = Annembed::Embedder.new(config)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Progress Callbacks
|
|
162
|
+
```ruby
|
|
163
|
+
# Support progress monitoring
|
|
164
|
+
embedder.on_progress do |iteration, total|
|
|
165
|
+
puts "Progress: #{iteration}/#{total}"
|
|
166
|
+
end
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Flexible Input/Output
|
|
170
|
+
```ruby
|
|
171
|
+
# Accept multiple input formats
|
|
172
|
+
embedder.fit_transform(data) # Array, NArray, or CSV path
|
|
173
|
+
|
|
174
|
+
# Support different output formats
|
|
175
|
+
result = embedder.transform(data, output: :array) # Ruby Array
|
|
176
|
+
result = embedder.transform(data, output: :narray) # Numo::NArray
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## Development Workflow
|
|
180
|
+
|
|
181
|
+
1. **Branch Strategy**
|
|
182
|
+
- `main` - stable release
|
|
183
|
+
- `develop` - integration branch
|
|
184
|
+
- `feature/*` - new features
|
|
185
|
+
- `fix/*` - bug fixes
|
|
186
|
+
|
|
187
|
+
2. **Release Process**
|
|
188
|
+
- Version bump in version.rb
|
|
189
|
+
- Update CHANGELOG.md
|
|
190
|
+
- Run full test suite
|
|
191
|
+
- Build precompiled gems
|
|
192
|
+
- Tag release
|
|
193
|
+
- Push to RubyGems
|
|
194
|
+
|
|
195
|
+
3. **Continuous Integration**
|
|
196
|
+
- Run tests on each push
|
|
197
|
+
- Build gems for multiple platforms
|
|
198
|
+
- Check documentation building
|
|
199
|
+
- Performance regression tests
|
|
200
|
+
|
|
201
|
+
## Future Considerations
|
|
202
|
+
|
|
203
|
+
1. **GPU Support**
|
|
204
|
+
- Monitor annembed for GPU features
|
|
205
|
+
- Plan bindings if GPU support is added
|
|
206
|
+
- Consider alternative GPU libraries
|
|
207
|
+
|
|
208
|
+
2. **Web Integration**
|
|
209
|
+
- Consider Rails integration
|
|
210
|
+
- WebAssembly compilation?
|
|
211
|
+
- REST API wrapper?
|
|
212
|
+
|
|
213
|
+
3. **Visualization**
|
|
214
|
+
- Built-in plotting helpers?
|
|
215
|
+
- Export to common formats
|
|
216
|
+
- Interactive visualizations?
|
|
217
|
+
|
|
218
|
+
## Getting Help
|
|
219
|
+
|
|
220
|
+
When implementing new features:
|
|
221
|
+
1. Check existing patterns in lancelot and red-candle
|
|
222
|
+
2. Consult annembed documentation
|
|
223
|
+
3. Ask in ruby-nlp discussions
|
|
224
|
+
4. Profile before optimizing
|
|
225
|
+
|
|
226
|
+
Remember: The goal is to make advanced embedding algorithms accessible and performant for Ruby developers while maintaining the simplicity and elegance that makes Ruby special.
|