clusterkit 0.3.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.lock +3228 -0
  7. data/Cargo.toml +8 -0
  8. data/Gemfile +17 -0
  9. data/IMPLEMENTATION_NOTES.md +143 -0
  10. data/LICENSE.txt +21 -0
  11. data/PYTHON_COMPARISON.md +183 -0
  12. data/README.md +744 -0
  13. data/Rakefile +259 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/docs/assets/clusterkit-wide.png +0 -0
  21. data/docs/assets/clusterkit.png +0 -0
  22. data/docs/assets/visualization.png +0 -0
  23. data/examples/hdbscan_example.rb +147 -0
  24. data/examples/optimal_kmeans_example.rb +96 -0
  25. data/examples/pca_example.rb +114 -0
  26. data/examples/reproducible_umap.rb +99 -0
  27. data/examples/verbose_control.rb +43 -0
  28. data/ext/clusterkit/Cargo.toml +26 -0
  29. data/ext/clusterkit/extconf.rb +23 -0
  30. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +80 -0
  31. data/ext/clusterkit/src/clustering.rs +221 -0
  32. data/ext/clusterkit/src/embedder.rs +349 -0
  33. data/ext/clusterkit/src/hnsw.rs +579 -0
  34. data/ext/clusterkit/src/lib.rs +24 -0
  35. data/ext/clusterkit/src/svd.rs +89 -0
  36. data/ext/clusterkit/src/tests.rs +16 -0
  37. data/ext/clusterkit/src/utils.rs +183 -0
  38. data/lib/clusterkit/3.1/clusterkit.so +0 -0
  39. data/lib/clusterkit/3.2/clusterkit.so +0 -0
  40. data/lib/clusterkit/3.3/clusterkit.so +0 -0
  41. data/lib/clusterkit/3.4/clusterkit.so +0 -0
  42. data/lib/clusterkit/clustering/hdbscan.rb +164 -0
  43. data/lib/clusterkit/clustering.rb +194 -0
  44. data/lib/clusterkit/clusterkit.rb +14 -0
  45. data/lib/clusterkit/configuration.rb +24 -0
  46. data/lib/clusterkit/data_validator.rb +132 -0
  47. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  48. data/lib/clusterkit/dimensionality/svd.rb +175 -0
  49. data/lib/clusterkit/dimensionality/umap.rb +282 -0
  50. data/lib/clusterkit/dimensionality.rb +29 -0
  51. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  52. data/lib/clusterkit/hnsw.rb +251 -0
  53. data/lib/clusterkit/preprocessing.rb +106 -0
  54. data/lib/clusterkit/silence.rb +42 -0
  55. data/lib/clusterkit/utils.rb +51 -0
  56. data/lib/clusterkit/version.rb +5 -0
  57. data/lib/clusterkit.rb +105 -0
  58. data/lib/tasks/visualize.rake +641 -0
  59. metadata +220 -0
data/Rakefile ADDED
@@ -0,0 +1,259 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/extensiontask"
3
+
4
# rspec is only a development dependency. The cross-compile build container
# (rb-sys-dock, via scientist-labs/rust-gem-release) installs the runtime bundle
# only, so requiring rspec there raises LoadError. Without the guard below that
# error would abort `rake` before the native build task could run. When rspec is
# missing we still register a `spec` task, but it just aborts with an explanation.
begin
  require "rspec/core/rake_task"
  RSpec::Core::RakeTask.new(:spec)
rescue LoadError
  desc "run specs (rspec unavailable in this environment)"
  task :spec do
    abort "rspec is a development dependency and is not installed here"
  end
end
16
+
17
# Rust extension definition. The loaded gemspec is handed to ExtensionTask so
# that rake-compiler exposes the `native:<platform>` tasks rb-sys-dock invokes
# for each precompiled leg; without it the cross build fails with
# "Don't know how to build task".
GEMSPEC = Gem::Specification.load("clusterkit.gemspec")

Rake::ExtensionTask.new("clusterkit", GEMSPEC) do |extension|
  extension.lib_dir = "lib/clusterkit"
  extension.source_pattern = "*.{rs,toml}"
  extension.cross_compile = true
  # Union of the precompiled targets this gem ships: both glibc linux arches
  # (assembled by oxidize-rb cross-gem in rb-sys-dock) plus Apple Silicon
  # (built natively on a macOS runner).
  extension.cross_platform = ["x86_64-linux", "aarch64-linux", "arm64-darwin"]
end
30
+
31
# Running bare `rake` compiles the extension and then runs the specs.
task default: %i[compile spec]
32
+
33
# Documentation task. When YARD cannot be loaded, a placeholder `yard` task is
# registered instead; it only prints installation instructions.
begin
  require "yard"
  YARD::Rake::YardocTask.new do |yardoc|
    yardoc.files = ["lib/**/*.rb"]
    yardoc.options = %w[--no-private --readme README.md]
  end
rescue LoadError
  desc "YARD documentation task not available"
  task(:yard) { puts "YARD is not available. Please install it with: gem install yard" }
end
46
+
47
# Benchmarking task: runs the benchmark script in a separate Ruby process.
desc "Run benchmarks"
task(:benchmark) { ruby "test/benchmark/benchmarks.rb" }
52
+
53
# Console task for interactive testing.
#
# Loads the gem from the working tree and starts IRB.
desc "Open an interactive console with the gem loaded"
task :console do
  require "irb"

  # Put the working-tree lib/ on the load path so `require "clusterkit"` also
  # resolves when rake is started without `bundle exec` and the gem is not
  # installed. Harmless when the path is already present.
  lib_dir = File.expand_path("lib", __dir__)
  $LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
  require "clusterkit"

  # Clear rake's own command-line arguments before handing control to IRB so
  # the task name is not passed along as an IRB argument.
  ARGV.clear
  IRB.start
end
61
+
62
# Rust-specific tasks. Each task runs one cargo subcommand with
# ext/clusterkit as the working directory.
namespace :rust do
  # Runs `cargo <subcommand>` from inside ext/clusterkit.
  run_cargo = lambda do |subcommand|
    Dir.chdir("ext/clusterkit") { sh "cargo #{subcommand}" }
  end

  desc "Run cargo fmt"
  task(:fmt) { run_cargo.call("fmt") }

  desc "Run cargo clippy"
  task(:clippy) { run_cargo.call("clippy -- -D warnings") }

  desc "Run cargo test"
  task(:test) { run_cargo.call("test") }
end
85
+
86
# Coverage task.
#
# Sets COVERAGE=true in the environment and then runs the `spec` task.
desc "Run specs with code coverage"
task :coverage do
  ENV["COVERAGE"] = "true"

  spec_task = Rake::Task["spec"]
  # Rake executes a task at most once per process. If `spec` already ran
  # earlier in this invocation (e.g. `rake spec coverage`), a plain `invoke`
  # would be a silent no-op and no coverage run would happen. Re-enable the
  # task so the suite runs again with COVERAGE set.
  spec_task.reenable
  spec_task.invoke
end
92
+
93
# Coverage report task: runs `coverage` first, then opens the HTML report with
# the platform's opener. On platforms other than macOS and Linux it only prints
# the report location.
desc "Open coverage report in browser"
task "coverage:report" => :coverage do
  case RUBY_PLATFORM
  when /darwin/
    sh "open coverage/index.html"
  when /linux/
    sh "xdg-open coverage/index.html"
  else
    puts "Coverage report generated at coverage/index.html"
  end
end
104
+
105
# CI task that runs all checks.
#
# `coverage` runs the spec suite itself (with COVERAGE=true), so `spec` is not
# listed separately. With `spec` listed first, the suite ran without coverage
# enabled and the later `coverage` step did nothing, because Rake does not
# re-run a task that has already been invoked.
desc "Run all CI checks"
task ci: ["rust:fmt", "rust:clippy", "compile", "coverage"]
108
+
109
# Load every custom task file found under lib/tasks.
Dir['lib/tasks/*.rake'].each { |rake_file| load rake_file }
111
+
112
# Test fixture generation
namespace :fixtures do
  # Embeds several fixed sets of sentences with red-candle and writes one JSON
  # file per set into spec/fixtures/embeddings. Exits with status 1 if
  # red-candle cannot be loaded or anything else fails.
  desc "Generate real embedding fixtures for tests using red-candle"
  task :generate_embeddings do
    # Required inside the task so a missing red-candle only affects this task.
    require 'red-candle'
    require 'json'
    require 'fileutils'

    puts "Loading embedding model..."
    # Use a small, fast model for generating test embeddings
    embedder = Candle::EmbeddingModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

    # Make sure the output directory exists
    target_dir = File.join(__dir__, 'spec', 'fixtures', 'embeddings')
    FileUtils.mkdir_p(target_dir)

    # Sentence sets, keyed by fixture name, for the different test scenarios
    sentence_sets = {
      # Basic test set - 15 sentences for general testing
      'basic_15' => [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is transforming how we process data.",
        "Ruby is a dynamic programming language.",
        "Natural language processing enables computers to understand text.",
        "The weather today is sunny and warm.",
        "Coffee is a popular morning beverage.",
        "Books are a gateway to knowledge and imagination.",
        "The ocean waves crash against the shore.",
        "Technology continues to evolve rapidly.",
        "Music has the power to evoke emotions.",
        "The mountain peak was covered in snow.",
        "Cooking is both an art and a science.",
        "Exercise is important for maintaining health.",
        "The stars shine brightly in the night sky.",
        "History teaches us valuable lessons."
      ],

      # Clustered data - 3 distinct topics for testing clustering
      'clusters_30' => [
        # Technology cluster (10 items)
        "Artificial intelligence is revolutionizing industries.",
        "Python is widely used for data science.",
        "Cloud computing provides scalable infrastructure.",
        "Cybersecurity is crucial for protecting data.",
        "Blockchain technology enables decentralized systems.",
        "Quantum computing may solve complex problems.",
        "APIs enable software integration.",
        "DevOps practices improve deployment efficiency.",
        "Microservices architecture promotes modularity.",
        "Machine learning models require training data.",

        # Nature cluster (10 items)
        "The rainforest ecosystem is incredibly diverse.",
        "Mountains are formed by tectonic activity.",
        "Coral reefs support marine biodiversity.",
        "Rivers flow from highlands to the sea.",
        "Deserts have adapted to water scarcity.",
        "Forests produce oxygen and absorb carbon.",
        "The arctic ice is melting due to climate change.",
        "Volcanoes release molten rock from Earth's interior.",
        "Wetlands filter water naturally.",
        "The savanna supports large herbivore populations.",

        # Food/Cooking cluster (10 items)
        "Italian cuisine features pasta and tomatoes.",
        "Sushi is a traditional Japanese dish.",
        "Baking bread requires yeast for fermentation.",
        "Spices add flavor and aroma to dishes.",
        "Vegetarian diets exclude meat products.",
        "Wine is produced through grape fermentation.",
        "Chocolate comes from cacao beans.",
        "Grilling imparts a smoky flavor to food.",
        "Fresh herbs enhance culinary creations.",
        "Fermented foods contain beneficial probiotics."
      ],

      # Small set for minimum viable dataset testing
      'minimal_6' => [
        "Data science involves statistical analysis.",
        "The sunset painted the sky orange.",
        "Coffee beans are roasted before brewing.",
        "Programming requires logical thinking.",
        "Gardens need water and sunlight.",
        "Music festivals bring people together."
      ],

      # Large set for high-dimensional testing
      'large_100' => (1..100).map do |number|
        "This is test sentence number #{number} with some variation in content."
      end
    }

    puts "Generating embeddings for test cases..."
    sentence_sets.each_pair do |set_name, sentences|
      puts " Generating #{set_name} (#{sentences.length} texts)..."

      # Each embedding is a 1x384 tensor, so we need to extract the array
      vectors = sentences.map do |sentence|
        embedder.embedding(sentence).to_a.first.to_a
      end

      # Save as JSON
      json_path = File.join(target_dir, "#{set_name}.json")
      payload = {
        'description' => "Test embeddings for #{set_name}",
        'model' => 'sentence-transformers/all-MiniLM-L6-v2',
        'dimension' => vectors.first.length,
        'count' => vectors.length,
        'embeddings' => vectors
      }
      File.write(json_path, JSON.pretty_generate(payload))

      puts " Saved to #{json_path}"
    end

    puts "\nEmbedding fixtures generated successfully!"
    puts "You can now use these in your specs with:"
    puts ' embeddings = JSON.parse(File.read("spec/fixtures/embeddings/basic_15.json"))["embeddings"]'
  rescue LoadError
    puts "Error: red-candle gem not found."
    puts "Please run: bundle install --with development"
    exit 1
  rescue => e
    puts "Error generating embeddings: #{e.message}"
    puts e.backtrace.first(5)
    exit 1
  end

  # Prints one summary line (file name, count, dimension) for each JSON fixture
  # in spec/fixtures/embeddings, or a hint on how to generate them.
  desc "List available embedding fixtures"
  task :list do
    require 'json'
    target_dir = File.join(__dir__, 'spec', 'fixtures', 'embeddings')

    unless Dir.exist?(target_dir)
      puts "Fixtures directory not found. Run 'rake fixtures:generate_embeddings' to create fixtures."
      next
    end

    fixture_paths = Dir.glob(File.join(target_dir, '*.json'))
    if fixture_paths.empty?
      puts "No embedding fixtures found. Run 'rake fixtures:generate_embeddings' to create them."
      next
    end

    puts "Available embedding fixtures:"
    fixture_paths.each do |path|
      summary = JSON.parse(File.read(path))
      puts " #{File.basename(path)}: #{summary['count']} embeddings, #{summary['dimension']}D"
    end
  end
end
data/docs/KNOWN_ISSUES.md ADDED
@@ -0,0 +1,130 @@
1
+ # Known Issues
2
+
3
+ ## Summary
4
+
5
+ This gem has three main categories of limitations:
6
+
7
+ 1. **Minimum dataset requirements** - UMAP needs at least 10 data points
8
+ 2. **Performance trade-offs** - Reproducibility (with seed) is ~25-35% slower than parallel mode
9
+ 3. **Uncatchable Rust panics** - Some error conditions crash the Ruby process (cannot be caught)
10
+
11
+ ## Minimum Dataset Size Requirement
12
+
13
+ **Limitation**: UMAP requires at least 10 data points to function properly.
14
+
15
+ **Reason**: UMAP needs sufficient data to construct a meaningful manifold approximation. With fewer than 10 points, the algorithm cannot create a reliable graph structure.
16
+
17
+ **Workaround**:
18
+ - Use PCA for datasets with fewer than 10 points
19
+ - The `transform` method can handle smaller datasets once the model is fitted on adequate training data
20
+
21
+ ## Performance vs Reproducibility Trade-off
22
+
23
+ **Design Choice**: When using `random_seed` for reproducibility, UMAP uses serial processing, which is approximately 25-35% slower than parallel processing.
24
+
25
+ **Recommendation**:
26
+ - For production workloads where speed is critical: omit the `random_seed` parameter
27
+ - For research, testing, or when reproducibility is required: provide a `random_seed` value
28
+
29
+ ## Rust Panic Conditions (Mostly Fixed)
30
+
31
+ **Previous Issue**: The box_size assertion would panic and crash the Ruby process.
32
+
33
+ **Current Status**: **FIXED** in `scientist-labs/annembed:fix-box-size-panic` branch
34
+ - The `"assertion failed: (*f).abs() <= box_size"` panic has been converted to a catchable error
35
+ - Extreme value ranges are now handled gracefully through normalization
36
+ - NaN/Infinite values are detected and reported with clear error messages
37
+
38
+ **Remaining Uncatchable Errors**:
39
+ - Array bounds violations (accessing out-of-bounds indices)
40
+ - Some `.unwrap()` calls on `None` or `Err` values
41
+ - These are much less common in normal usage
42
+
43
+ **Best Practices** (still recommended):
44
+ - Normalize your data to a reasonable range (e.g., 0-1) for best performance
45
+ - Remove or handle NaN/Infinite values before processing
46
+ - Use conservative parameters when data quality is uncertain
47
+
48
+ **For more details**: See [RUST_ERROR_HANDLING.md](RUST_ERROR_HANDLING.md) for comprehensive documentation of error handling limitations.
49
+
50
+ ## Best Practices to Avoid Issues
51
+
52
+ ### Data Preprocessing
53
+
54
+ Always preprocess your data before using UMAP:
55
+
56
+ ```ruby
57
+ # 1. Remove NaN and Infinite values
58
+ data.reject! { |row| row.any? { |v| v.nan? || v.infinite? } }
59
+
60
+ # 2. Normalize to [0, 1] range
61
+ data = data.map do |row|
62
+ min, max = row.minmax
63
+ range = max - min
64
+ row.map { |v| range > 0 ? (v - min) / range : 0.5 }
65
+ end
66
+
67
+ # 3. Check for extreme outliers
68
+ data.each do |row|
69
+ row.each do |val|
70
+ if val.abs > 100
71
+ warn "Warning: Extreme value #{val} detected"
72
+ end
73
+ end
74
+ end
75
+ ```
76
+
77
+ ### Safe Parameter Defaults
78
+
79
+ Use conservative parameters when data quality is uncertain:
80
+
81
+ ```ruby
82
+ # Safer configuration
83
+ umap = ClusterKit::Dimensionality::UMAP.new(
84
+ n_components: 2,
85
+ n_neighbors: 5, # Lower is safer (default: 15)
86
+ random_seed: 42, # For reproducibility during debugging
87
+ nb_grad_batch: 10, # Default is usually fine
88
+ nb_sampling_by_edge: 8 # Default is usually fine
89
+ )
90
+ ```
91
+
92
+ ### Error Recovery Strategy
93
+
94
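+ Because a Rust panic terminates the Ruby process before any `rescue` clause can run, save the input data to disk before calling UMAP so that it is still available for debugging after a crash: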
95
+
96
+ ```ruby
97
+ def safe_umap_transform(data, options = {})
98
+ # Save data to temporary file before processing
99
+ temp_file = "temp_umap_data_#{Time.now.to_i}.json"
100
+ File.write(temp_file, JSON.dump(data))
101
+
102
+ begin
103
+ umap = ClusterKit::Dimensionality::UMAP.new(**options)
104
+ result = umap.fit_transform(data)
105
+ File.delete(temp_file) if File.exist?(temp_file)
106
+ result
107
+ rescue => e
108
+ puts "UMAP failed: #{e.message}"
109
+ puts "Data saved to #{temp_file} for debugging"
110
+ raise
111
+ end
112
+ end
113
+ ```
114
+
115
+ ### Alternative for Problematic Data
116
+
117
+ If UMAP consistently fails, use PCA as a fallback:
118
+
119
+ ```ruby
120
+ def reduce_dimensions(data, n_components: 2)
121
+ begin
122
+ umap = ClusterKit::Dimensionality::UMAP.new(n_components: n_components)
123
+ umap.fit_transform(data)
124
+ rescue => e
125
+ warn "UMAP failed, falling back to PCA: #{e.message}"
126
+ pca = ClusterKit::Dimensionality::PCA.new(n_components: n_components)
127
+ pca.fit_transform(data)
128
+ end
129
+ end
130
+ ```
data/docs/RUST_ERROR_HANDLING.md ADDED
@@ -0,0 +1,164 @@
1
+ # Rust Layer Error Handling Documentation
2
+
3
+ ## Overview
4
+
5
+ The ClusterKit gem wraps Rust libraries (annembed and hnsw-rs) which have different error handling mechanisms. Some errors can be caught and handled gracefully, while others cause panics that crash the Ruby process.
6
+
7
+ ## Error Categories
8
+
9
+ ### 1. Catchable Errors (Result<T, E> types)
10
+
11
+ These errors use Rust's `Result` type and can be caught and converted to Ruby exceptions:
12
+
13
+ | Error | Source | Location | Ruby Exception |
14
+ |-------|--------|----------|----------------|
15
+ | Isolated point | annembed | `kgraph_from_hnsw_all` | `ClusterKit::IsolatedPointError` |
16
+ | Graph construction failure | annembed | `kgraph_from_hnsw_all` | `RuntimeError` with message |
17
+ | Embedding failure | annembed | `embedder.embed()` | Generic `RuntimeError` |
18
+
19
+ **Example from annembed:**
20
+ ```rust
21
+ // This can be caught
22
+ return Err(anyhow!(
23
+ "kgraph_from_hnsw_all: graph will not be connected, isolated point at layer {} , pos in layer : {}",
24
+ p_id.0, p_id.1
25
+ ));
26
+ ```
27
+
28
+ **How we handle it in embedder.rs:**
29
+ ```rust
30
+ let kgraph = annembed::fromhnsw::kgraph::kgraph_from_hnsw_all(&hnsw, self.n_neighbors)
31
+ .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
32
+ ```
33
+
34
+ ### 2. Uncatchable Errors (Panics/Assertions)
35
+
36
+ These use Rust's `assert!` or `panic!` macros and CANNOT be caught. They will crash the Ruby process:
37
+
38
+ | Error | Source | Location | Trigger Condition |
39
+ |-------|--------|----------|-------------------|
40
+ | ~~Box size assertion~~ | ~~annembed~~ | ~~`set_data_box`~~ | **FIXED in scientist-labs/annembed:fix-box-size-panic** |
41
+ | Array bounds | Various | Index operations | Accessing out-of-bounds indices |
42
+ | Unwrap failures | Various | `.unwrap()` calls | Unwrapping `None` or `Err` |
43
+
44
+ **Update (2025-08-19):** The box size assertion has been fixed in the `fix-box-size-panic` branch of scientist-labs/annembed. It now returns a proper `Result<(), anyhow::Error>` that can be caught and handled gracefully:
45
+
46
+ ```rust
47
+ // Previously (would panic):
48
+ assert!((*f).abs() <= box_size);
49
+
50
+ // Now (returns catchable error):
51
+ if (*f).is_nan() || (*f).is_infinite() {
52
+ return Err(anyhow!("Data normalization failed..."));
53
+ }
54
+ ```
55
+
56
+ ## Current Mitigation Strategies
57
+
58
+ ### 1. Ruby Layer Validation
59
+
60
+ We validate data before sending to Rust to prevent common panic conditions:
61
+
62
+ - Check for NaN and Infinite values
63
+ - Ensure minimum dataset size (10 points)
64
+ - Validate array dimensions consistency
65
+ - Warn about extreme value ranges
66
+
67
+ ### 2. Parameter Adjustment
68
+
69
+ We automatically adjust parameters to avoid error conditions:
70
+
71
+ ```ruby
72
+ # Automatically reduce n_neighbors if too large for dataset
73
+ adjusted_n_neighbors = [suggested_neighbors, max_neighbors].min
74
+ ```
75
+
76
+ ### 3. Error Message Enhancement
77
+
78
+ When we can catch Rust errors, we provide helpful Ruby-level error messages:
79
+
80
+ ```ruby
81
+ case error_msg
82
+ when /isolated point/i
83
+ raise ::ClusterKit::IsolatedPointError, <<~MSG
84
+ UMAP found isolated points in your data...
85
+ Solutions:
86
+ 1. Reduce n_neighbors...
87
+ 2. Remove outliers...
88
+ MSG
89
+ ```
90
+
91
+ ## Previously Uncatchable Panic Conditions (Now Fixed)
92
+
93
+ ### 1. "assertion failed: (*f).abs() <= box_size" - **FIXED**
94
+
95
+ **Location:** `annembed/src/embedder.rs::set_data_box`
96
+
97
+ **Previous Issue:** Would panic and crash the Ruby process
98
+
99
+ **Current Status:** Fixed in `scientist-labs/annembed:fix-box-size-panic` branch
100
+ - Now returns a catchable `anyhow::Error`
101
+ - Detects NaN/Infinite values during normalization
102
+ - Handles constant data (max_max = 0) gracefully
103
+ - Extreme value ranges are normalized successfully
104
+
105
+ **User-visible behavior:**
106
+ - Previously: Ruby process would crash with assertion failure
107
+ - Now: Raises a catchable Ruby exception with helpful error message
108
+
109
+ ## Recommendations for Users
110
+
111
+ ### To Avoid Crashes:
112
+
113
+ 1. **Always normalize your data:**
114
+ ```ruby
115
+ # Scale to [0, 1] range
116
+ data = data.map do |row|
117
+ min, max = row.minmax
118
+ range = max - min
119
+ row.map { |v| range > 0 ? (v - min) / range : 0.5 }
120
+ end
121
+ ```
122
+
123
+ 2. **Check for extreme values:**
124
+ ```ruby
125
+ data.flatten.each do |val|
126
+ raise "Extreme value detected" if val.abs > 1e6
127
+ end
128
+ ```
129
+
130
+ 3. **Use conservative parameters for uncertain data:**
131
+ ```ruby
132
+ umap = ClusterKit::Dimensionality::UMAP.new(
133
+ n_neighbors: 5, # Lower is safer
134
+ n_components: 2
135
+ )
136
+ ```
137
+
138
+ ## Future Improvements
139
+
140
+ ### Potential Solutions:
141
+
142
+ 1. **Modify annembed to use Result instead of assert:**
143
+ - Would require upstream changes to annembed
144
+ - Convert `assert!` to `if` checks that return `Err`
145
+
146
+ 2. **Add panic catching in Rust layer:**
147
+ - Use `std::panic::catch_unwind` (limited effectiveness)
148
+ - May not work for all panic types
149
+
150
+ 3. **Pre-validation in Rust:**
151
+ - Add more checks before calling annembed functions
152
+ - Predict and prevent panic conditions
153
+
154
+ ### Current Limitations:
155
+
156
+ - Cannot catch Rust panics from Ruby
157
+ - Some numerical instabilities are hard to predict
158
+ - Trade-off between performance and safety checks
159
+
160
+ ## Testing Error Handling
161
+
162
+ The test suite mocks Rust errors to verify our error handling logic works correctly. However, actual panic conditions cannot be tested without crashing the test process.
163
+
164
+ See `spec/clusterkit/error_handling_spec.rb` for error handling tests.