clusterkit 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.toml +8 -0
  7. data/Gemfile +17 -0
  8. data/IMPLEMENTATION_NOTES.md +143 -0
  9. data/LICENSE.txt +21 -0
  10. data/PYTHON_COMPARISON.md +183 -0
  11. data/README.md +499 -0
  12. data/Rakefile +245 -0
  13. data/clusterkit.gemspec +45 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/examples/hdbscan_example.rb +147 -0
  21. data/examples/optimal_kmeans_example.rb +96 -0
  22. data/examples/pca_example.rb +114 -0
  23. data/examples/reproducible_umap.rb +99 -0
  24. data/examples/verbose_control.rb +43 -0
  25. data/ext/clusterkit/Cargo.toml +25 -0
  26. data/ext/clusterkit/extconf.rb +4 -0
  27. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
  28. data/ext/clusterkit/src/clustering.rs +267 -0
  29. data/ext/clusterkit/src/embedder.rs +413 -0
  30. data/ext/clusterkit/src/lib.rs +22 -0
  31. data/ext/clusterkit/src/svd.rs +112 -0
  32. data/ext/clusterkit/src/tests.rs +16 -0
  33. data/ext/clusterkit/src/utils.rs +33 -0
  34. data/lib/clusterkit/clustering/hdbscan.rb +177 -0
  35. data/lib/clusterkit/clustering.rb +213 -0
  36. data/lib/clusterkit/clusterkit.rb +9 -0
  37. data/lib/clusterkit/configuration.rb +24 -0
  38. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  39. data/lib/clusterkit/dimensionality/svd.rb +144 -0
  40. data/lib/clusterkit/dimensionality/umap.rb +311 -0
  41. data/lib/clusterkit/dimensionality.rb +29 -0
  42. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  43. data/lib/clusterkit/preprocessing.rb +106 -0
  44. data/lib/clusterkit/silence.rb +42 -0
  45. data/lib/clusterkit/utils.rb +51 -0
  46. data/lib/clusterkit/version.rb +5 -0
  47. data/lib/clusterkit.rb +93 -0
  48. data/lib/tasks/visualize.rake +641 -0
  49. metadata +194 -0
data/Rakefile ADDED
@@ -0,0 +1,245 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/extensiontask"
3
+ require "rspec/core/rake_task"
4
+
5
+ # RSpec test task
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ # Define the Rust extension
9
+ spec = Gem::Specification.load("clusterkit.gemspec")
10
+ Rake::ExtensionTask.new("clusterkit", spec) do |ext|
11
+ ext.lib_dir = "lib/clusterkit"
12
+ ext.source_pattern = "*.{rs,toml}"
13
+ ext.cross_compile = true
14
+ ext.cross_platform = %w[x86-mingw32 x64-mingw32 x86-linux x86_64-linux x86_64-darwin arm64-darwin]
15
+ end
16
+
17
+ task default: [:compile, :spec]
18
+
19
+ # Documentation task
20
+ begin
21
+ require "yard"
22
+ YARD::Rake::YardocTask.new do |t|
23
+ t.files = ["lib/**/*.rb"]
24
+ t.options = ["--no-private", "--readme", "README.md"]
25
+ end
26
+ rescue LoadError
27
+ desc "YARD documentation task not available"
28
+ task :yard do
29
+ puts "YARD is not available. Please install it with: gem install yard"
30
+ end
31
+ end
32
+
33
+ # Benchmarking task
34
+ desc "Run benchmarks"
35
+ task :benchmark do
36
+ ruby "test/benchmark/benchmarks.rb"
37
+ end
38
+
39
+ # Console task for interactive testing
40
+ desc "Open an interactive console with the gem loaded"
41
+ task :console do
42
+ require "irb"
43
+ require "clusterkit"
44
+ ARGV.clear
45
+ IRB.start
46
+ end
47
+
48
+ # Rust-specific tasks
49
+ namespace :rust do
50
+ desc "Run cargo fmt"
51
+ task :fmt do
52
+ Dir.chdir("ext/clusterkit") do
53
+ sh "cargo fmt"
54
+ end
55
+ end
56
+
57
+ desc "Run cargo clippy"
58
+ task :clippy do
59
+ Dir.chdir("ext/clusterkit") do
60
+ sh "cargo clippy -- -D warnings"
61
+ end
62
+ end
63
+
64
+ desc "Run cargo test"
65
+ task :test do
66
+ Dir.chdir("ext/clusterkit") do
67
+ sh "cargo test"
68
+ end
69
+ end
70
+ end
71
+
72
+ # Coverage task
73
+ desc "Run specs with code coverage"
74
+ task :coverage do
75
+ ENV['COVERAGE'] = 'true'
76
+ Rake::Task["spec"].invoke
77
+ end
78
+
79
+ # Coverage report task
80
+ desc "Open coverage report in browser"
81
+ task :"coverage:report" => :coverage do
82
+ if RUBY_PLATFORM =~ /darwin/
83
+ sh "open coverage/index.html"
84
+ elsif RUBY_PLATFORM =~ /linux/
85
+ sh "xdg-open coverage/index.html"
86
+ else
87
+ puts "Coverage report generated at coverage/index.html"
88
+ end
89
+ end
90
+
91
+ # CI task that runs all checks
92
+ desc "Run all CI checks"
93
+ task ci: ["rust:fmt", "rust:clippy", "compile", "spec", "coverage"]
94
+
95
+ # Load custom rake tasks
96
+ Dir.glob('lib/tasks/*.rake').each { |r| load r }
97
+
98
+ # Test fixture generation
99
+ namespace :fixtures do
100
+ desc "Generate real embedding fixtures for tests using red-candle"
101
+ task :generate_embeddings do
102
+ begin
103
+ require 'red-candle'
104
+ require 'json'
105
+ require 'fileutils'
106
+
107
+ puts "Loading embedding model..."
108
+ # Use a small, fast model for generating test embeddings
109
+ model = Candle::EmbeddingModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
110
+
111
+ # Create fixtures directory
112
+ fixtures_dir = File.join(__dir__, 'spec', 'fixtures', 'embeddings')
113
+ FileUtils.mkdir_p(fixtures_dir)
114
+
115
+ # Generate embeddings for different test scenarios
116
+ test_cases = {
117
+ # Basic test set - 15 sentences for general testing
118
+ 'basic_15' => [
119
+ "The quick brown fox jumps over the lazy dog.",
120
+ "Machine learning is transforming how we process data.",
121
+ "Ruby is a dynamic programming language.",
122
+ "Natural language processing enables computers to understand text.",
123
+ "The weather today is sunny and warm.",
124
+ "Coffee is a popular morning beverage.",
125
+ "Books are a gateway to knowledge and imagination.",
126
+ "The ocean waves crash against the shore.",
127
+ "Technology continues to evolve rapidly.",
128
+ "Music has the power to evoke emotions.",
129
+ "The mountain peak was covered in snow.",
130
+ "Cooking is both an art and a science.",
131
+ "Exercise is important for maintaining health.",
132
+ "The stars shine brightly in the night sky.",
133
+ "History teaches us valuable lessons."
134
+ ],
135
+
136
+ # Clustered data - 3 distinct topics for testing clustering
137
+ 'clusters_30' => [
138
+ # Technology cluster (10 items)
139
+ "Artificial intelligence is revolutionizing industries.",
140
+ "Python is widely used for data science.",
141
+ "Cloud computing provides scalable infrastructure.",
142
+ "Cybersecurity is crucial for protecting data.",
143
+ "Blockchain technology enables decentralized systems.",
144
+ "Quantum computing may solve complex problems.",
145
+ "APIs enable software integration.",
146
+ "DevOps practices improve deployment efficiency.",
147
+ "Microservices architecture promotes modularity.",
148
+ "Machine learning models require training data.",
149
+
150
+ # Nature cluster (10 items)
151
+ "The rainforest ecosystem is incredibly diverse.",
152
+ "Mountains are formed by tectonic activity.",
153
+ "Coral reefs support marine biodiversity.",
154
+ "Rivers flow from highlands to the sea.",
155
+ "Deserts have adapted to water scarcity.",
156
+ "Forests produce oxygen and absorb carbon.",
157
+ "The arctic ice is melting due to climate change.",
158
+ "Volcanoes release molten rock from Earth's interior.",
159
+ "Wetlands filter water naturally.",
160
+ "The savanna supports large herbivore populations.",
161
+
162
+ # Food/Cooking cluster (10 items)
163
+ "Italian cuisine features pasta and tomatoes.",
164
+ "Sushi is a traditional Japanese dish.",
165
+ "Baking bread requires yeast for fermentation.",
166
+ "Spices add flavor and aroma to dishes.",
167
+ "Vegetarian diets exclude meat products.",
168
+ "Wine is produced through grape fermentation.",
169
+ "Chocolate comes from cacao beans.",
170
+ "Grilling imparts a smoky flavor to food.",
171
+ "Fresh herbs enhance culinary creations.",
172
+ "Fermented foods contain beneficial probiotics."
173
+ ],
174
+
175
+ # Small set for minimum viable dataset testing
176
+ 'minimal_6' => [
177
+ "Data science involves statistical analysis.",
178
+ "The sunset painted the sky orange.",
179
+ "Coffee beans are roasted before brewing.",
180
+ "Programming requires logical thinking.",
181
+ "Gardens need water and sunlight.",
182
+ "Music festivals bring people together."
183
+ ],
184
+
185
+ # Large set for high-dimensional testing
186
+ 'large_100' => (1..100).map { |i| "This is test sentence number #{i} with some variation in content." }
187
+ }
188
+
189
+ puts "Generating embeddings for test cases..."
190
+ test_cases.each do |name, texts|
191
+ puts " Generating #{name} (#{texts.length} texts)..."
192
+
193
+ # Generate embeddings
194
+ # Each embedding is a 1x384 tensor, so we need to extract the array
195
+ embeddings_array = texts.map { |text| model.embedding(text).to_a.first.to_a }
196
+
197
+ # Save as JSON
198
+ output_file = File.join(fixtures_dir, "#{name}.json")
199
+ File.write(output_file, JSON.pretty_generate({
200
+ 'description' => "Test embeddings for #{name}",
201
+ 'model' => 'sentence-transformers/all-MiniLM-L6-v2',
202
+ 'dimension' => embeddings_array.first.length,
203
+ 'count' => embeddings_array.length,
204
+ 'embeddings' => embeddings_array
205
+ }))
206
+
207
+ puts " Saved to #{output_file}"
208
+ end
209
+
210
+ puts "\nEmbedding fixtures generated successfully!"
211
+ puts "You can now use these in your specs with:"
212
+ puts ' embeddings = JSON.parse(File.read("spec/fixtures/embeddings/basic_15.json"))["embeddings"]'
213
+
214
+ rescue LoadError
215
+ puts "Error: red-candle gem not found."
216
+ puts "Please run: bundle install --with development"
217
+ exit 1
218
+ rescue => e
219
+ puts "Error generating embeddings: #{e.message}"
220
+ puts e.backtrace.first(5)
221
+ exit 1
222
+ end
223
+ end
224
+
225
+ desc "List available embedding fixtures"
226
+ task :list do
227
+ require 'json'
228
+ fixtures_dir = File.join(__dir__, 'spec', 'fixtures', 'embeddings')
229
+ if Dir.exist?(fixtures_dir)
230
+ files = Dir.glob(File.join(fixtures_dir, '*.json'))
231
+ if files.empty?
232
+ puts "No embedding fixtures found. Run 'rake fixtures:generate_embeddings' to create them."
233
+ else
234
+ puts "Available embedding fixtures:"
235
+ files.each do |file|
236
+ data = JSON.parse(File.read(file))
237
+ basename = File.basename(file)
238
+ puts " #{basename}: #{data['count']} embeddings, #{data['dimension']}D"
239
+ end
240
+ end
241
+ else
242
+ puts "Fixtures directory not found. Run 'rake fixtures:generate_embeddings' to create fixtures."
243
+ end
244
+ end
245
+ end
@@ -0,0 +1,45 @@
1
+ require_relative "lib/clusterkit/version"
2
+
3
+ Gem::Specification.new do |spec|
4
+ spec.name = "clusterkit"
5
+ spec.version = ClusterKit::VERSION
6
+ spec.authors = ["Chris Petersen"]
7
+ spec.email = ["chris@petersen.io"]
8
+
9
+ spec.summary = "High-performance clustering and dimensionality reduction for Ruby"
10
+ spec.description = "A comprehensive clustering toolkit for Ruby, providing UMAP, PCA, K-means, HDBSCAN and more. Built on top of annembed and hdbscan Rust crates for blazing-fast performance."
11
+ spec.homepage = "https://github.com/cpetersen/clusterkit"
12
+ spec.license = "MIT"
13
+ spec.required_ruby_version = ">= 2.7.0"
14
+
15
+ spec.metadata["homepage_uri"] = spec.homepage
16
+ spec.metadata["source_code_uri"] = spec.homepage
17
+ spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
18
+
19
+ # Specify which files should be added to the gem when it is released.
20
+ spec.files = Dir.chdir(__dir__) do
21
+ `git ls-files -z`.split("\x0").reject do |f|
22
+ (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
23
+ end + Dir["ext/**/*.rs", "ext/**/*.toml"]
24
+ end
25
+ spec.bindir = "exe"
26
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
27
+ spec.require_paths = ["lib"]
28
+ spec.extensions = ["ext/clusterkit/extconf.rb"]
29
+
30
+ # Runtime dependencies
31
+ # Numo is optional but recommended for better performance
32
+ # spec.add_dependency "numo-narray", "~> 0.9"
33
+
34
+ # Development dependencies
35
+ spec.add_development_dependency "csv"
36
+ spec.add_development_dependency "rake", "~> 13.0"
37
+ spec.add_development_dependency "rake-compiler", "~> 1.2"
38
+ spec.add_development_dependency "rb_sys", "~> 0.9"
39
+ spec.add_development_dependency "rspec", "~> 3.0"
40
+ spec.add_development_dependency "simplecov", "~> 0.22"
41
+ spec.add_development_dependency "yard", "~> 0.9"
42
+
43
+ # For more information and examples about making a new gem, check out our
44
+ # guide at: https://bundler.io/guides/creating_gem.html
45
+ end
@@ -0,0 +1,130 @@
1
+ # Known Issues
2
+
3
+ ## Summary
4
+
5
+ This gem has three main categories of limitations:
6
+
7
+ 1. **Minimum dataset requirements** - UMAP needs at least 10 data points
8
+ 2. **Performance trade-offs** - Reproducible runs (with a seed) are ~25-35% slower than parallel mode
9
+ 3. **Uncatchable Rust panics** - Some error conditions crash the Ruby process (cannot be caught)
10
+
11
+ ## Minimum Dataset Size Requirement
12
+
13
+ **Limitation**: UMAP requires at least 10 data points to function properly.
14
+
15
+ **Reason**: UMAP needs sufficient data to construct a meaningful manifold approximation. With fewer than 10 points, the algorithm cannot create a reliable graph structure.
16
+
17
+ **Workaround**:
18
+ - Use PCA for datasets with fewer than 10 points
19
+ - The `transform` method can handle smaller datasets once the model is fitted on adequate training data
20
+
21
+ ## Performance vs Reproducibility Trade-off
22
+
23
+ **Design Choice**: When `random_seed` is set for reproducibility, UMAP uses serial processing, which is approximately 25-35% slower than parallel processing.
24
+
25
+ **Recommendation**:
26
+ - For production workloads where speed is critical: omit the `random_seed` parameter
27
+ - For research, testing, or when reproducibility is required: provide a `random_seed` value
28
+
29
+ ## Rust Panic Conditions (Mostly Fixed)
30
+
31
+ **Previous Issue**: The box_size assertion would panic and crash the Ruby process.
32
+
33
+ **Current Status**: **FIXED** in `cpetersen/annembed:fix-box-size-panic` branch
34
+ - The `"assertion failed: (*f).abs() <= box_size"` panic has been converted to a catchable error
35
+ - Extreme value ranges are now handled gracefully through normalization
36
+ - NaN/Infinite values are detected and reported with clear error messages
37
+
38
+ **Remaining Uncatchable Errors**:
39
+ - Array bounds violations (accessing out-of-bounds indices)
40
+ - Some `.unwrap()` calls on `None` or `Err` values
41
+ - These are much less common in normal usage
42
+
43
+ **Best Practices** (still recommended):
44
+ - Normalize your data to a reasonable range (e.g., 0-1) for best performance
45
+ - Remove or handle NaN/Infinite values before processing
46
+ - Use conservative parameters when data quality is uncertain
47
+
48
+ **For more details**: See [RUST_ERROR_HANDLING.md](RUST_ERROR_HANDLING.md) for comprehensive documentation of error handling limitations.
49
+
50
+ ## Best Practices to Avoid Issues
51
+
52
+ ### Data Preprocessing
53
+
54
+ Always preprocess your data before using UMAP:
55
+
56
+ ```ruby
57
+ # 1. Remove NaN and Infinite values
58
+ data.reject! { |row| row.any? { |v| v.nan? || v.infinite? } }
59
+
60
+ # 2. Normalize to [0, 1] range
61
+ data = data.map do |row|
62
+ min, max = row.minmax
63
+ range = max - min
64
+ row.map { |v| range > 0 ? (v - min) / range : 0.5 }
65
+ end
66
+
67
+ # 3. Check for extreme outliers
68
+ data.each do |row|
69
+ row.each do |val|
70
+ if val.abs > 100
71
+ warn "Warning: Extreme value #{val} detected"
72
+ end
73
+ end
74
+ end
75
+ ```
76
+
77
+ ### Safe Parameter Defaults
78
+
79
+ Use conservative parameters when data quality is uncertain:
80
+
81
+ ```ruby
82
+ # Safer configuration
83
+ umap = ClusterKit::Dimensionality::UMAP.new(
84
+ n_components: 2,
85
+ n_neighbors: 5, # Lower is safer (default: 15)
86
+ random_seed: 42, # For reproducibility during debugging
87
+ nb_grad_batch: 10, # Default is usually fine
88
+ nb_sampling_by_edge: 8 # Default is usually fine
89
+ )
90
+ ```
91
+
92
+ ### Error Recovery Strategy
93
+
94
+ Since some errors cannot be caught, implement a recovery strategy:
95
+
96
+ ```ruby
97
+ def safe_umap_transform(data, options = {})
98
+ # Save data to temporary file before processing
99
+ temp_file = "temp_umap_data_#{Time.now.to_i}.json"
100
+ File.write(temp_file, JSON.dump(data))
101
+
102
+ begin
103
+ umap = ClusterKit::Dimensionality::UMAP.new(**options)
104
+ result = umap.fit_transform(data)
105
+ File.delete(temp_file) if File.exist?(temp_file)
106
+ result
107
+ rescue => e
108
+ puts "UMAP failed: #{e.message}"
109
+ puts "Data saved to #{temp_file} for debugging"
110
+ raise
111
+ end
112
+ end
113
+ ```
114
+
115
+ ### Alternative for Problematic Data
116
+
117
+ If UMAP consistently fails, use PCA as a fallback:
118
+
119
+ ```ruby
120
+ def reduce_dimensions(data, n_components: 2)
121
+ begin
122
+ umap = ClusterKit::Dimensionality::UMAP.new(n_components: n_components)
123
+ umap.fit_transform(data)
124
+ rescue => e
125
+ warn "UMAP failed, falling back to PCA: #{e.message}"
126
+ pca = ClusterKit::Dimensionality::PCA.new(n_components: n_components)
127
+ pca.fit_transform(data)
128
+ end
129
+ end
130
+ ```
@@ -0,0 +1,164 @@
1
+ # Rust Layer Error Handling Documentation
2
+
3
+ ## Overview
4
+
5
+ The ClusterKit gem wraps Rust libraries (annembed and hnsw-rs), which have different error handling mechanisms. Some errors can be caught and handled gracefully, while others cause panics that crash the Ruby process.
6
+
7
+ ## Error Categories
8
+
9
+ ### 1. Catchable Errors (Result<T, E> types)
10
+
11
+ These errors use Rust's `Result` type and can be caught and converted to Ruby exceptions:
12
+
13
+ | Error | Source | Location | Ruby Exception |
14
+ |-------|--------|----------|----------------|
15
+ | Isolated point | annembed | `kgraph_from_hnsw_all` | `ClusterKit::IsolatedPointError` |
16
+ | Graph construction failure | annembed | `kgraph_from_hnsw_all` | `RuntimeError` with message |
17
+ | Embedding failure | annembed | `embedder.embed()` | Generic `RuntimeError` |
18
+
19
+ **Example from annembed:**
20
+ ```rust
21
+ // This can be caught
22
+ return Err(anyhow!(
23
+ "kgraph_from_hnsw_all: graph will not be connected, isolated point at layer {} , pos in layer : {}",
24
+ p_id.0, p_id.1
25
+ ));
26
+ ```
27
+
28
+ **How we handle it in embedder.rs:**
29
+ ```rust
30
+ let kgraph = annembed::fromhnsw::kgraph::kgraph_from_hnsw_all(&hnsw, self.n_neighbors)
31
+ .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
32
+ ```
33
+
34
+ ### 2. Uncatchable Errors (Panics/Assertions)
35
+
36
+ These use Rust's `assert!` or `panic!` macros and CANNOT be caught. They will crash the Ruby process:
37
+
38
+ | Error | Source | Location | Trigger Condition |
39
+ |-------|--------|----------|-------------------|
40
+ | ~~Box size assertion~~ | ~~annembed~~ | ~~`set_data_box`~~ | **FIXED in cpetersen/annembed:fix-box-size-panic** |
41
+ | Array bounds | Various | Index operations | Accessing out-of-bounds indices |
42
+ | Unwrap failures | Various | `.unwrap()` calls | Unwrapping `None` or `Err` |
43
+
44
+ **Update (2025-08-19):** The box size assertion has been fixed in the `fix-box-size-panic` branch of cpetersen/annembed. It now returns a proper `Result<(), anyhow::Error>` that can be caught and handled gracefully:
45
+
46
+ ```rust
47
+ // Previously (would panic):
48
+ assert!((*f).abs() <= box_size);
49
+
50
+ // Now (returns catchable error):
51
+ if (*f).is_nan() || (*f).is_infinite() {
52
+ return Err(anyhow!("Data normalization failed..."));
53
+ }
54
+ ```
55
+
56
+ ## Current Mitigation Strategies
57
+
58
+ ### 1. Ruby Layer Validation
59
+
60
+ We validate data before sending to Rust to prevent common panic conditions:
61
+
62
+ - Check for NaN and Infinite values
63
+ - Ensure minimum dataset size (10 points)
64
+ - Validate array dimensions consistency
65
+ - Warn about extreme value ranges
66
+
67
+ ### 2. Parameter Adjustment
68
+
69
+ We automatically adjust parameters to avoid error conditions:
70
+
71
+ ```ruby
72
+ # Automatically reduce n_neighbors if too large for dataset
73
+ adjusted_n_neighbors = [suggested_neighbors, max_neighbors].min
74
+ ```
75
+
76
+ ### 3. Error Message Enhancement
77
+
78
+ When we can catch Rust errors, we provide helpful Ruby-level error messages:
79
+
80
+ ```ruby
81
+ case error_msg
82
+ when /isolated point/i
83
+ raise ::ClusterKit::IsolatedPointError, <<~MSG
84
+ UMAP found isolated points in your data...
85
+ Solutions:
86
+ 1. Reduce n_neighbors...
87
+ 2. Remove outliers...
88
+ MSG
89
+ ```
90
+
91
+ ## Previously Uncatchable Panic Conditions (Now Fixed)
92
+
93
+ ### 1. "assertion failed: (*f).abs() <= box_size" - **FIXED**
94
+
95
+ **Location:** `annembed/src/embedder.rs::set_data_box`
96
+
97
+ **Previous Issue:** Would panic and crash the Ruby process
98
+
99
+ **Current Status:** Fixed in `cpetersen/annembed:fix-box-size-panic` branch
100
+ - Now returns a catchable `anyhow::Error`
101
+ - Detects NaN/Infinite values during normalization
102
+ - Handles constant data (max_max = 0) gracefully
103
+ - Extreme value ranges are normalized successfully
104
+
105
+ **User-visible behavior:**
106
+ - Previously: Ruby process would crash with assertion failure
107
+ - Now: Raises a catchable Ruby exception with helpful error message
108
+
109
+ ## Recommendations for Users
110
+
111
+ ### To Avoid Crashes:
112
+
113
+ 1. **Always normalize your data:**
114
+ ```ruby
115
+ # Scale to [0, 1] range
116
+ data = data.map do |row|
117
+ min, max = row.minmax
118
+ range = max - min
119
+ row.map { |v| range > 0 ? (v - min) / range : 0.5 }
120
+ end
121
+ ```
122
+
123
+ 2. **Check for extreme values:**
124
+ ```ruby
125
+ data.flatten.each do |val|
126
+ raise "Extreme value detected" if val.abs > 1e6
127
+ end
128
+ ```
129
+
130
+ 3. **Use conservative parameters for uncertain data:**
131
+ ```ruby
132
+ umap = ClusterKit::Dimensionality::UMAP.new(
133
+ n_neighbors: 5, # Lower is safer
134
+ n_components: 2
135
+ )
136
+ ```
137
+
138
+ ## Future Improvements
139
+
140
+ ### Potential Solutions:
141
+
142
+ 1. **Modify annembed to use Result instead of assert:**
143
+ - Would require upstream changes to annembed
144
+ - Convert the remaining `assert!` calls to `if` checks that return `Err` (as was already done for the box-size assertion)
145
+
146
+ 2. **Add panic catching in Rust layer:**
147
+ - Use `std::panic::catch_unwind` (limited effectiveness)
148
+ - May not work for all panic types
149
+
150
+ 3. **Pre-validation in Rust:**
151
+ - Add more checks before calling annembed functions
152
+ - Predict and prevent panic conditions
153
+
154
+ ### Current Limitations:
155
+
156
+ - Cannot catch Rust panics from Ruby
157
+ - Some numerical instabilities are hard to predict
158
+ - Trade-off between performance and safety checks
159
+
160
+ ## Testing Error Handling
161
+
162
+ The test suite mocks Rust errors to verify our error handling logic works correctly. However, actual panic conditions cannot be tested without crashing the test process.
163
+
164
+ See `spec/clusterkit/error_handling_spec.rb` for error handling tests.