clusterkit 0.3.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.lock +3228 -0
  7. data/Cargo.toml +8 -0
  8. data/Gemfile +17 -0
  9. data/IMPLEMENTATION_NOTES.md +143 -0
  10. data/LICENSE.txt +21 -0
  11. data/PYTHON_COMPARISON.md +183 -0
  12. data/README.md +744 -0
  13. data/Rakefile +259 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/docs/assets/clusterkit-wide.png +0 -0
  21. data/docs/assets/clusterkit.png +0 -0
  22. data/docs/assets/visualization.png +0 -0
  23. data/examples/hdbscan_example.rb +147 -0
  24. data/examples/optimal_kmeans_example.rb +96 -0
  25. data/examples/pca_example.rb +114 -0
  26. data/examples/reproducible_umap.rb +99 -0
  27. data/examples/verbose_control.rb +43 -0
  28. data/ext/clusterkit/Cargo.toml +26 -0
  29. data/ext/clusterkit/extconf.rb +23 -0
  30. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +80 -0
  31. data/ext/clusterkit/src/clustering.rs +221 -0
  32. data/ext/clusterkit/src/embedder.rs +349 -0
  33. data/ext/clusterkit/src/hnsw.rs +579 -0
  34. data/ext/clusterkit/src/lib.rs +24 -0
  35. data/ext/clusterkit/src/svd.rs +89 -0
  36. data/ext/clusterkit/src/tests.rs +16 -0
  37. data/ext/clusterkit/src/utils.rs +183 -0
  38. data/lib/clusterkit/3.1/clusterkit.so +0 -0
  39. data/lib/clusterkit/3.2/clusterkit.so +0 -0
  40. data/lib/clusterkit/3.3/clusterkit.so +0 -0
  41. data/lib/clusterkit/3.4/clusterkit.so +0 -0
  42. data/lib/clusterkit/clustering/hdbscan.rb +164 -0
  43. data/lib/clusterkit/clustering.rb +194 -0
  44. data/lib/clusterkit/clusterkit.rb +14 -0
  45. data/lib/clusterkit/configuration.rb +24 -0
  46. data/lib/clusterkit/data_validator.rb +132 -0
  47. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  48. data/lib/clusterkit/dimensionality/svd.rb +175 -0
  49. data/lib/clusterkit/dimensionality/umap.rb +282 -0
  50. data/lib/clusterkit/dimensionality.rb +29 -0
  51. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  52. data/lib/clusterkit/hnsw.rb +251 -0
  53. data/lib/clusterkit/preprocessing.rb +106 -0
  54. data/lib/clusterkit/silence.rb +42 -0
  55. data/lib/clusterkit/utils.rb +51 -0
  56. data/lib/clusterkit/version.rb +5 -0
  57. data/lib/clusterkit.rb +105 -0
  58. data/lib/tasks/visualize.rake +641 -0
  59. metadata +220 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: b07d5fa4fef10a91504c99bbd4916658d0962e6f5e23d7182b0d772209351ad8
4
+ data.tar.gz: 5bebe7bc4ccf98182b9cdd25f9cc2ca042ae73b594c4ef76928640388408f2f5
5
+ SHA512:
6
+ metadata.gz: 1beb6f4efb0181221756e7a8229f9013aa431c96e2c4ff166f501c48d36d442b963993fafdc3c610eba00c51524166644bb70a40c1ce1f8e05f6d815387b5d34
7
+ data.tar.gz: f55dfd8181eb12095fca46ba763109f7c483ba22f05aad3ec8fd2ee7ff794716677608b0bb782b5dabf91a292ea9b6ff85868a9df620701ffb574f9803eecf3a
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --require spec_helper
2
+ --format documentation
3
+ --color
data/.simplecov ADDED
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ SimpleCov.configure do
4
+ # Add custom groups
5
+ add_group 'Core', 'lib/annembed/embedder'
6
+ add_group 'UMAP', 'lib/annembed/umap'
7
+ add_group 'Utils', 'lib/annembed/utils'
8
+ add_group 'Configuration', 'lib/annembed/config'
9
+
10
+ # Track branches as well as lines
11
+ enable_coverage :branch
12
+
13
+ # Set thresholds (temporarily disabled to diagnose issues)
14
+ # minimum_coverage line: 50, branch: 40
15
+
16
+ # Don't refuse to run tests if coverage drops (during development)
17
+ # refuse_coverage_drop
18
+
19
+ # Maximum coverage drop allowed
20
+ maximum_coverage_drop 5
21
+
22
+ # Configure output directory
23
+ coverage_dir 'coverage'
24
+
25
+ # Include every library file in the report, even files no test loads
26
+ track_files 'lib/**/*.rb'
27
+
28
+ # Custom filters
29
+ add_filter do |source_file|
30
+ # Skip version file
31
+ source_file.filename.include?('version.rb')
32
+ end
33
+
34
+ # Emit the coverage report through the HTML formatter
35
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new([
36
+ SimpleCov::Formatter::HTMLFormatter,
37
+ ])
38
+
39
+ # Set project name
40
+ command_name 'RSpec'
41
+
42
+ # Merge results from multiple test runs
43
+ use_merging true
44
+
45
+ # Maximum age (in seconds) of earlier results that are still merged
46
+ merge_timeout 3600
47
+ end
data/CHANGELOG.md ADDED
@@ -0,0 +1,35 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ### Added
11
+ - Clean, scikit-learn-like API for UMAP
12
+ - `fit(data)` - Train the model
13
+ - `transform(data)` - Transform new data
14
+ - `fit_transform(data)` - Train and transform in one step
15
+ - `fitted?` - Check if model is trained
16
+ - `save(path)` - Save trained model
17
+ - `load(path)` - Load trained model
18
+ - Model persistence with save/load functionality
19
+ - Data export/import utilities for caching results
20
+ - Comprehensive test suite for UMAP interface
21
+ - Detailed README with practical examples
22
+
23
+ ### Changed
24
+ - Complete API redesign to follow ML library conventions
25
+ - Removed confusing `save_embeddings`/`load_embeddings` methods
26
+ - Separated model operations from data caching concerns
27
+
28
+ ### Fixed
29
+ - Intermittent test failures with boundary assertions
30
+ - Data normalization issues with extreme values
31
+
32
+ ## [0.1.0] - TBD
33
+
34
+ ### Added
35
+ - Initial release with basic embedding functionality
data/CLAUDE.md ADDED
@@ -0,0 +1,226 @@
1
+ # CLAUDE.md - clusterkit Project Guide
2
+
3
+ ## Project Vision
4
+ clusterkit brings high-performance dimensionality reduction and embedding algorithms to Ruby by wrapping the annembed Rust crate. This gem is part of the ruby-nlp ecosystem, which aims to provide Ruby developers with native machine learning and NLP capabilities through best-in-breed Rust implementations.
5
+
6
+ ## Core Principles
7
+
8
+ ### 1. Ruby-First Design
9
+ - Provide an idiomatic Ruby API that feels natural to Ruby developers
10
+ - Follow Ruby naming conventions (snake_case methods, proper use of symbols)
11
+ - Support Ruby's duck typing while maintaining type safety at the Rust boundary
12
+ - Integrate seamlessly with Ruby's data science ecosystem
13
+
14
+ ### 2. Performance Without Compromise
15
+ - Leverage Rust's performance for compute-intensive operations
16
+ - Use Magnus for zero-copy data transfer where possible
17
+ - Enable parallelization by default
18
+ - Provide progress feedback for long-running operations
19
+
20
+ ### 3. Ecosystem Integration
21
+ - Primary support for Numo::NArray (the NumPy of Ruby)
22
+ - Work well with other ruby-nlp gems (lancelot, red-candle)
23
+ - Support common Ruby data formats and visualization tools
24
+ - Play nicely with Jupyter notebooks (iruby)
25
+
26
+ ## Technical Guidelines
27
+
28
+ ### Magnus Best Practices
29
+
30
+ 1. **Memory Management**
31
+ ```rust
32
+ // Good: Let Magnus handle Ruby object lifecycle
33
+ let array: RArray = data.try_convert()?;
34
+
35
+ // Avoid: Manual memory management
36
+ // Don't try to manually free Ruby objects
37
+ ```
38
+
39
+ 2. **Error Handling**
40
+ ```rust
41
+ // Always wrap errors properly
42
+ use magnus::Error;
43
+
44
+ fn risky_operation() -> Result<RArray, Error> {
45
+ annembed_call()
46
+ .map_err(|e| Error::new(exception::runtime_error(), e.to_string()))?
47
+ }
48
+ ```
49
+
50
+ 3. **Type Conversions**
51
+ ```rust
52
+ // Define clear conversion traits
53
+ impl TryFrom<Value> for EmbedConfig {
54
+ type Error = Error;
55
+ // Robust conversion with good error messages
56
+ }
57
+ ```
58
+
59
+ ### Ruby API Design
60
+
61
+ 1. **Method Naming**
62
+ - Use Ruby conventions: `fit_transform`, not `fitTransform`
63
+ - Predicates end with `?`: `converged?`, `fitted?`
64
+ - Dangerous methods end with `!`: `normalize!`
65
+
66
+ 2. **Parameter Handling**
67
+ ```ruby
68
+ # Good: Use keyword arguments with defaults
69
+ def initialize(method: :umap, n_components: 2, **options)
70
+
71
+ # Avoid: Positional arguments for configuration
72
+ def initialize(method, n_components, min_dist, spread, ...)
73
+ ```
74
+
75
+ 3. **Return Values**
76
+ - Return Ruby arrays for small results
77
+ - Return Numo::NArray for large matrices
78
+ - Support multiple return formats via options
79
+
80
+ ### Performance Considerations
81
+
82
+ 1. **Data Transfer**
83
+ - Minimize copying between Ruby and Rust
84
+ - Use view/slice operations when possible
85
+ - Support streaming for large datasets
86
+
87
+ 2. **Threading**
88
+ - Respect Ruby's GVL (Global VM Lock)
89
+ - Release GVL for long-running Rust operations
90
+ - Use Rust's parallelization, not Ruby threads
91
+
92
+ 3. **Memory Usage**
93
+ - Provide memory estimates for large operations
94
+ - Support out-of-core processing for huge datasets
95
+ - Clear progress indication for long operations
96
+
97
+ ## Code Style Guidelines
98
+
99
+ ### Rust Side
100
+ - Follow Rust standard style (rustfmt)
101
+ - Comprehensive error types with context
102
+ - Document all public functions
103
+ - Use type aliases for clarity
104
+
105
+ ### Ruby Side
106
+ - Follow Ruby Style Guide
107
+ - Use YARD documentation format
108
+ - Provide type signatures where helpful
109
+ - Include usage examples in docs
110
+
111
+ ## Testing Philosophy
112
+
113
+ 1. **Comprehensive Coverage**
114
+ - Unit tests for all public methods
115
+ - Integration tests with real datasets
116
+ - Performance benchmarks
117
+ - Memory leak tests
118
+
119
+ 2. **Test Data**
120
+ - Use standard ML datasets (Iris, MNIST samples)
121
+ - Generate synthetic data for edge cases
122
+ - Test with various Ruby object types
123
+
124
+ 3. **Platform Testing**
125
+ - Test on multiple Ruby versions
126
+ - Test on different operating systems
127
+ - Verify precompiled gem distribution
128
+
129
+ ## Documentation Standards
130
+
131
+ 1. **README**
132
+ - Clear installation instructions
133
+ - Quick start example that works
134
+ - Link to full documentation
135
+ - Performance comparisons
136
+
137
+ 2. **API Documentation**
138
+ - Every public method documented
139
+ - Parameter types and ranges specified
140
+ - Return values clearly described
141
+ - Usage examples for complex methods
142
+
143
+ 3. **Tutorials**
144
+ - Jupyter notebook examples
145
+ - Common use case walkthroughs
146
+ - Integration examples with other gems
147
+
148
+ ## Common Patterns
149
+
150
+ ### Configuration Objects
151
+ ```ruby
152
+ # Prefer configuration objects over many parameters
153
+ config = Annembed::Config.new(
154
+ method: :umap,
155
+ n_neighbors: 15,
156
+ min_dist: 0.1
157
+ )
158
+ embedder = Annembed::Embedder.new(config)
159
+ ```
160
+
161
+ ### Progress Callbacks
162
+ ```ruby
163
+ # Support progress monitoring
164
+ embedder.on_progress do |iteration, total|
165
+ puts "Progress: #{iteration}/#{total}"
166
+ end
167
+ ```
168
+
169
+ ### Flexible Input/Output
170
+ ```ruby
171
+ # Accept multiple input formats
172
+ embedder.fit_transform(data) # Array, NArray, or CSV path
173
+
174
+ # Support different output formats
175
+ result = embedder.transform(data, output: :array) # Ruby Array
176
+ result = embedder.transform(data, output: :narray) # Numo::NArray
177
+ ```
178
+
179
+ ## Development Workflow
180
+
181
+ 1. **Branch Strategy**
182
+ - `main` - stable release
183
+ - `develop` - integration branch
184
+ - `feature/*` - new features
185
+ - `fix/*` - bug fixes
186
+
187
+ 2. **Release Process**
188
+ - Version bump in version.rb
189
+ - Update CHANGELOG.md
190
+ - Run full test suite
191
+ - Build precompiled gems
192
+ - Tag release
193
+ - Push to RubyGems
194
+
195
+ 3. **Continuous Integration**
196
+ - Run tests on each push
197
+ - Build gems for multiple platforms
198
+ - Check documentation building
199
+ - Performance regression tests
200
+
201
+ ## Future Considerations
202
+
203
+ 1. **GPU Support**
204
+ - Monitor annembed for GPU features
205
+ - Plan bindings if GPU support is added
206
+ - Consider alternative GPU libraries
207
+
208
+ 2. **Web Integration**
209
+ - Consider Rails integration
210
+ - WebAssembly compilation?
211
+ - REST API wrapper?
212
+
213
+ 3. **Visualization**
214
+ - Built-in plotting helpers?
215
+ - Export to common formats
216
+ - Interactive visualizations?
217
+
218
+ ## Getting Help
219
+
220
+ When implementing new features:
221
+ 1. Check existing patterns in lancelot and red-candle
222
+ 2. Consult annembed documentation
223
+ 3. Ask in ruby-nlp discussions
224
+ 4. Profile before optimizing
225
+
226
+ Remember: The goal is to make advanced embedding algorithms accessible and performant for Ruby developers while maintaining the simplicity and elegance that makes Ruby special.