tokenkit 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.standard.yml +3 -0
  4. data/.yardopts +12 -0
  5. data/CODE_OF_CONDUCT.md +132 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +644 -0
  8. data/Rakefile +18 -0
  9. data/benchmarks/cache_test.rb +63 -0
  10. data/benchmarks/final_comparison.rb +83 -0
  11. data/benchmarks/tokenizer_benchmark.rb +250 -0
  12. data/docs/ARCHITECTURE.md +469 -0
  13. data/docs/PERFORMANCE.md +382 -0
  14. data/docs/README.md +118 -0
  15. data/ext/tokenkit/Cargo.toml +21 -0
  16. data/ext/tokenkit/extconf.rb +4 -0
  17. data/ext/tokenkit/src/config.rs +37 -0
  18. data/ext/tokenkit/src/error.rs +67 -0
  19. data/ext/tokenkit/src/lib.rs +346 -0
  20. data/ext/tokenkit/src/tokenizer/base.rs +41 -0
  21. data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
  22. data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
  23. data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
  24. data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
  25. data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
  26. data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
  27. data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
  28. data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
  29. data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
  30. data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
  31. data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
  32. data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
  33. data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
  34. data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
  35. data/lib/tokenkit/config.rb +74 -0
  36. data/lib/tokenkit/config_builder.rb +209 -0
  37. data/lib/tokenkit/config_compat.rb +52 -0
  38. data/lib/tokenkit/configuration.rb +194 -0
  39. data/lib/tokenkit/regex_converter.rb +58 -0
  40. data/lib/tokenkit/version.rb +5 -0
  41. data/lib/tokenkit.rb +336 -0
  42. data/sig/tokenkit.rbs +4 -0
  43. metadata +172 -0
data/docs/PERFORMANCE.md ADDED
@@ -0,0 +1,382 @@
1
+ # Performance Guide
2
+
3
+ TokenKit is optimized for high-throughput tokenization with minimal memory overhead. This guide covers performance characteristics, optimization techniques, and best practices.
4
+
5
+ ## Performance Benchmarks
6
+
7
+ ### Baseline Performance
8
+
9
+ TokenKit can process ~100,000 documents per second for basic Unicode tokenization on modern hardware (Apple M-series, Intel i7+).
10
+
11
+ | Tokenizer | Operations/sec | Relative Speed | Use Case |
12
+ |-----------|---------------|----------------|----------|
13
+ | Lowercase | 870,000 | 1.0x (baseline) | Case normalization |
14
+ | Whitespace | 850,000 | 1.02x slower | Simple splitting |
15
+ | Unicode | 870,000 | 1.0x | **Recommended default** |
16
+ | Letter | 850,000 | 1.02x slower | Aggressive splitting |
17
+ | Pattern (simple) | 500,000 | 1.74x slower | Custom patterns |
18
+ | URL/Email | 400,000 | 2.17x slower | Web content |
19
+ | EdgeNgram | 388,000 | 2.24x slower | Autocomplete |
20
+ | Ngram | 350,000 | 2.49x slower | Fuzzy matching |
21
+ | CharGroup | 400,000 | 2.17x slower | CSV parsing |
22
+ | PathHierarchy | 300,000 | 2.90x slower | Path navigation |
23
+ | Pattern (complex) | 24,000 | 36x slower | Complex regex |
24
+ | Grapheme | 200,000 | 4.35x slower | Emoji handling |
25
+ | Sentence | 150,000 | 5.80x slower | Sentence splitting |
26
+ | Keyword | 1,000,000 | 1.15x faster (fastest) | No splitting |
27
+
28
+ ### Pattern Preservation Impact
29
+
30
+ Pattern preservation adds overhead that grows with the number and complexity of the patterns:
31
+
32
+ | Configuration | Ops/sec | Impact |
33
+ |--------------|---------|--------|
34
+ | No patterns | 870,000 | Baseline |
35
+ | 1 simple pattern | 600,000 | -31% |
36
+ | 4 patterns | 409,000 | -53% |
37
+ | 10 complex patterns | 150,000 | -83% |
38
+
39
+ ## Optimization Techniques
40
+
41
+ ### 1. Tokenizer Instance Caching (110x speedup)
42
+
43
+ **Problem**: Creating a new tokenizer and compiling regexes on every call.
44
+
45
+ **Solution**: Cache tokenizer instances and invalidate only on configuration changes.
46
+
47
+ ```rust
48
+ // Before: Created fresh tokenizer every time
49
+ fn tokenize(text: String) -> Result<Vec<String>> {
50
+     let tokenizer = from_config(config)?; // Recompiled regexes!
51
+     Ok(tokenizer.tokenize(&text))
52
+ }
53
+
54
+ // After: Cached tokenizer instance
55
+ static DEFAULT_CACHE: Lazy<Mutex<TokenizerCache>> = Lazy::new(|| {
56
+ Mutex::new(TokenizerCache {
57
+ config: TokenizerConfig::default(),
58
+ tokenizer: None, // Created once, reused many times
59
+ })
60
+ });
61
+ ```
62
+
63
+ **Impact**:
64
+ - With preserve patterns: 3,638 → 409,472 ops/sec (110x faster)
65
+ - Without patterns: 500,000 → 870,000 ops/sec (1.74x faster)
66
+
67
+ ### 2. Reduced String Allocations (20-30% improvement)
68
+
69
+ **Problem**: Creating intermediate string copies during pattern preservation.
70
+
71
+ **Solution**: Work with indices, allocate strings only when needed.
72
+
73
+ ```rust
74
+ // Before: Stored strings eagerly
75
+ let mut preserved_spans: Vec<(usize, usize, String)> = Vec::new();
76
+ for mat in pattern.find_iter(text) {
77
+ preserved_spans.push((mat.start(), mat.end(), mat.as_str().to_string()));
78
+ }
79
+
80
+ // After: Store indices, extract strings lazily
81
+ let mut preserved_spans: Vec<(usize, usize)> = Vec::with_capacity(32);
82
+ for mat in pattern.find_iter(text) {
83
+ preserved_spans.push((mat.start(), mat.end()));
84
+ }
85
+ // Extract string only when building final result
86
+ result.push(original_text[start..end].to_string());
87
+ ```
88
+
89
+ ### 3. In-Place Post-Processing
90
+
91
+ **Problem**: Creating new vectors for lowercase and punctuation removal.
92
+
93
+ **Solution**: Modify vectors in-place.
94
+
95
+ ```rust
96
+ // Before: Created new vector
97
+ tokens = tokens.into_iter().map(|t| t.to_lowercase()).collect();
98
+
99
+ // After: Modify in place
100
+ for token in tokens.iter_mut() {
101
+ *token = token.to_lowercase();
102
+ }
103
+ ```
104
+
105
+ ### 4. Pre-Allocated Vectors
106
+
107
+ **Problem**: Dynamic vector growth causes reallocations.
108
+
109
+ **Solution**: Pre-allocate with estimated capacity.
110
+
111
+ ```rust
112
+ // Estimate result size
113
+ let mut result = Vec::with_capacity(tokens.len() + preserved_spans.len());
114
+ ```
115
+
116
+ ### 5. Optimized Sorting
117
+
118
+ **Problem**: Stable sort is slower than necessary.
119
+
120
+ **Solution**: Use `sort_unstable_by` for better performance.
121
+
122
+ ```rust
123
+ // Before
124
+ spans.sort_by(|a, b| a.0.cmp(&b.0));
125
+
126
+ // After
127
+ spans.sort_unstable_by(|a, b| a.0.cmp(&b.0));
128
+ ```
129
+
130
+ ## Running Benchmarks
131
+
132
+ TokenKit includes comprehensive benchmarks to measure performance:
133
+
134
+ ```bash
135
+ # Install benchmark gems
136
+ bundle add benchmark-ips benchmark-memory
137
+
138
+ # Run all benchmarks
139
+ ruby benchmarks/tokenizer_benchmark.rb
140
+
141
+ # Run specific benchmark suites
142
+ ruby benchmarks/tokenizer_benchmark.rb tokenizers # Strategy comparison
143
+ ruby benchmarks/tokenizer_benchmark.rb config # Configuration impact
144
+ ruby benchmarks/tokenizer_benchmark.rb size # Text size scaling
145
+ ruby benchmarks/tokenizer_benchmark.rb memory # Memory usage
146
+ ```
147
+
148
+ ### Creating Custom Benchmarks
149
+
150
+ ```ruby
151
+ require 'benchmark/ips'
152
+ require 'tokenkit'
153
+
154
+ text = "Your sample text here"
155
+
156
+ Benchmark.ips do |x|
157
+ x.config(time: 5, warmup: 2)
158
+
159
+ x.report("Unicode") do
160
+ TokenKit.configure { |c| c.strategy = :unicode }
161
+ TokenKit.tokenize(text)
162
+ end
163
+
164
+ x.report("Pattern") do
165
+ TokenKit.configure { |c| c.strategy = :pattern; c.regex = /\w+/ }
166
+ TokenKit.tokenize(text)
167
+ end
168
+
169
+ x.compare!
170
+ end
171
+ ```
172
+
173
+ ## Performance Best Practices
174
+
175
+ ### 1. Choose the Right Tokenizer
176
+
177
+ - **Default to Unicode**: Best balance of correctness and performance
178
+ - **Use Whitespace**: When you know text is already well-formatted
179
+ - **Avoid Complex Patterns**: Each regex pattern has compilation and matching overhead (see the sketch below)
180
+
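+ A minimal sketch of these tradeoffs, using only the block-based configuration API shown elsewhere in this guide (the sample text and the custom pattern are illustrative, not benchmarked):
+
+ ```ruby
+ require "tokenkit"
+
+ text = "C3PO-like droids cost $9.99!"
+
+ # Recommended default: Unicode word boundaries, fast and correct.
+ TokenKit.configure { |c| c.strategy = :unicode }
+ TokenKit.tokenize(text)
+
+ # Whitespace is marginally cheaper but only splits on spaces,
+ # so punctuation stays attached to tokens.
+ TokenKit.configure { |c| c.strategy = :whitespace }
+ TokenKit.tokenize(text)
+
+ # A custom pattern still works, but pays regex-matching overhead on
+ # every call; prefer a built-in strategy when one fits.
+ TokenKit.configure do |c|
+   c.strategy = :pattern
+   c.regex = /[a-z0-9]+(?:-[a-z0-9]+)*/i
+ end
+ TokenKit.tokenize(text)
+ ```
+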
181
+ ### 2. Minimize Pattern Preservation
182
+
183
+ ```ruby
184
+ # Bad: Many overlapping patterns
185
+ config.preserve_patterns = [
186
+ /\d+/,
187
+ /\d+mg/,
188
+ /\d+ug/,
189
+ /\d+ml/
190
+ ]
191
+
192
+ # Good: Single comprehensive pattern
193
+ config.preserve_patterns = [
194
+ /\d+(mg|ug|ml)/
195
+ ]
196
+ ```
197
+
198
+ ### 3. Reuse Tokenizer Instances
199
+
200
+ ```ruby
201
+ # Good: Configure once, use many times
202
+ TokenKit.configure do |config|
203
+ config.strategy = :unicode
204
+ config.preserve_patterns = [...]
205
+ end
206
+
207
+ documents.each do |doc|
208
+ tokens = TokenKit.tokenize(doc) # Uses cached instance
209
+ end
210
+
211
+ # Avoid: Reconfiguring repeatedly
212
+ documents.each do |doc|
213
+ TokenKit.configure { |c| c.strategy = :unicode } # Invalidates cache!
214
+ tokens = TokenKit.tokenize(doc)
215
+ end
216
+ ```
217
+
218
+ ### 4. Use Instance API for Bulk Processing
219
+
220
+ ```ruby
221
+ # For bulk processing with different configurations
222
+ tokenizer = TokenKit::Tokenizer.new(
223
+ strategy: :unicode,
224
+ preserve_patterns: [...]
225
+ )
226
+
227
+ # Reuse the same instance
228
+ documents.map { |doc| tokenizer.tokenize(doc) }
229
+ ```
230
+
231
+ ### 5. Consider Memory vs Speed Tradeoffs
232
+
233
+ - **N-gram tokenizers**: Generate many tokens, higher memory usage (see the sketch below)
234
+ - **Pattern preservation**: Increases memory for regex storage
235
+ - **Remove punctuation**: Reduces token count, saves memory
236
+
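+ A rough sketch of the first two points. It assumes `min_gram`/`max_gram` are settable on the configuration object, as the `config.max_gram` example in the troubleshooting section suggests, and the token counts are indicative only:
+
+ ```ruby
+ require "tokenkit"
+
+ text = "immunoglobulin"
+
+ # One word in, one token out.
+ TokenKit.configure { |c| c.strategy = :unicode }
+ TokenKit.tokenize(text).size
+
+ # N-gram strategies emit a token for every substring window,
+ # so memory grows quickly as max_gram increases.
+ TokenKit.configure do |c|
+   c.strategy = :ngram
+   c.min_gram = 2
+   c.max_gram = 3
+ end
+ TokenKit.tokenize(text).size  # dozens of 2- and 3-character tokens
+ ```
+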
237
+ ## Thread Safety and Concurrency
238
+
239
+ TokenKit is thread-safe and can be used in concurrent environments:
240
+
241
+ ```ruby
242
+ # Safe: Each thread uses the global cached tokenizer
243
+ threads = 10.times.map do
244
+ Thread.new do
245
+ 100.times do
246
+ TokenKit.tokenize("some text")
247
+ end
248
+ end
249
+ end
250
+ threads.each(&:join)
251
+ ```
252
+
253
+ Performance in concurrent environments:
254
+ - Single-threaded: ~870k ops/sec
255
+ - Multi-threaded (10 threads): ~850k ops/sec (minimal overhead)
256
+
257
+ ## Memory Usage
258
+
259
+ Memory usage varies by tokenizer and options:
260
+
261
+ | Configuration | Memory/Operation | Notes |
262
+ |--------------|------------------|-------|
263
+ | Basic Unicode | ~500 bytes | Minimal overhead |
264
+ | With preserve patterns | ~1-2 KB | Regex storage |
265
+ | EdgeNgram (max=10) | ~2-5 KB | Multiple tokens generated |
266
+ | Ngram (min=2, max=3) | ~3-8 KB | Many substring tokens |
267
+
268
+ ### Memory Profiling
269
+
270
+ ```ruby
271
+ require 'benchmark/memory'
272
+
273
+ Benchmark.memory do |x|
274
+ x.report("Unicode") do
275
+ TokenKit.configure { |c| c.strategy = :unicode }
276
+ 100.times { TokenKit.tokenize("sample text") }
277
+ end
278
+
279
+ x.compare!
280
+ end
281
+ ```
282
+
283
+ ## Compilation Optimizations
284
+
285
+ The Rust extension is compiled with aggressive optimizations:
286
+
287
+ ```toml
288
+ [profile.release]
289
+ lto = true # Link-time optimization
290
+ codegen-units = 1 # Single codegen unit for better optimization
291
+ ```
292
+
293
+ These settings increase compile time but improve runtime performance by ~15-20%.
294
+
295
+ ## Platform-Specific Notes
296
+
297
+ ### macOS (Apple Silicon)
298
+ - Best performance on M1/M2/M3 chips
299
+ - Native ARM64 compilation
300
+ - ~10-15% faster than Intel Macs
301
+
302
+ ### Linux
303
+ - Consistent performance across distributions
304
+ - Ensure Rust toolchain is up-to-date
305
+ - Consider using `jemalloc` for better memory allocation
306
+
307
+ ### Windows
308
+ - Slightly slower file I/O may affect benchmarks
309
+ - Use native Windows paths for PathHierarchy tokenizer
310
+
311
+ ## Troubleshooting Performance Issues
312
+
313
+ ### Slow Tokenization
314
+
315
+ 1. **Check pattern complexity**:
316
+ ```ruby
317
+ puts TokenKit.config.preserve_patterns
318
+ ```
319
+
320
+ 2. **Verify caching is working**:
321
+ ```ruby
322
+ # This should be fast after first call
323
+ 1000.times { TokenKit.tokenize("test") }
324
+ ```
325
+
326
+ 3. **Profile your patterns**:
327
+ ```ruby
328
+ require 'benchmark'
329
+
330
+ patterns = [/pattern1/, /pattern2/, ...]
331
+ text = "your text"
332
+
333
+ patterns.each do |pattern|
334
+ time = Benchmark.realtime do
335
+ 1000.times { pattern.match(text) }
336
+ end
337
+ puts "#{pattern}: #{time}s"
338
+ end
339
+ ```
340
+
341
+ ### High Memory Usage
342
+
343
+ 1. **Reduce n-gram sizes**:
344
+ ```ruby
345
+ config.max_gram = 5 # Instead of 10
346
+ ```
347
+
348
+ 2. **Limit preserve patterns**:
349
+ ```ruby
350
+ # Only essential patterns
351
+ config.preserve_patterns = [/critical_pattern/]
352
+ ```
353
+
354
+ 3. **Use streaming for large documents**:
355
+ ```ruby
356
+ # Process in chunks
357
+ text.each_line do |line|
358
+ tokens = TokenKit.tokenize(line)
359
+ process_tokens(tokens)
360
+ end
361
+ ```
362
+
363
+ ## Future Optimizations
364
+
365
+ Planned performance improvements:
366
+
367
+ 1. **SIMD vectorization** for character scanning
368
+ 2. **Parallel tokenization** for very large texts
369
+ 3. **Lazy pattern compilation** for rarely-used patterns
370
+ 4. **Memory pooling** for reduced allocations
371
+ 5. **Regex set optimization** for multiple patterns
372
+
373
+ ## Summary
374
+
375
+ TokenKit achieves high performance through:
376
+
377
+ - **Intelligent caching**: 110x speedup for pattern-heavy workloads
378
+ - **Minimal allocations**: 20-30% throughput improvement
379
+ - **Optimized algorithms**: Using efficient Rust implementations
380
+ - **Smart defaults**: Unicode tokenizer balances speed and correctness
381
+
382
+ For most use cases, the default Unicode tokenizer with minimal preserve patterns provides the best performance. Configure once at application startup and let TokenKit's caching handle the rest.
data/docs/README.md ADDED
@@ -0,0 +1,118 @@
1
+ # TokenKit Documentation
2
+
3
+ Welcome to the TokenKit documentation! This directory contains in-depth guides for developers and contributors.
4
+
5
+ ## 📚 Documentation Index
6
+
7
+ ### For Users
8
+
9
+ - **[README](../README.md)** - Getting started, usage examples, and API overview
10
+ - **[Performance Guide](PERFORMANCE.md)** - Benchmarks, optimization techniques, and best practices
11
+
12
+ ### For Contributors
13
+
14
+ - **[Architecture Guide](ARCHITECTURE.md)** - Internal design, Ruby-Rust bridge, and implementation details
15
+ - **[Code of Conduct](../CODE_OF_CONDUCT.md)** - Community standards and expectations
16
+
17
+ ### API Reference
18
+
19
+ - **[RubyDoc](https://rubydoc.info/gems/tokenkit)** - Complete API documentation (once published)
20
+ - Generate locally: `bundle exec yard doc` then open `doc/index.html`
21
+
22
+ ## 🚀 Quick Links
23
+
24
+ ### Getting Help
25
+
26
+ - [GitHub Issues](https://github.com/scientist-labs/tokenkit/issues) - Report bugs or request features
27
+ - [GitHub Discussions](https://github.com/scientist-labs/tokenkit/discussions) - Ask questions and share ideas
28
+
29
+ ### Key Features
30
+
31
+ - **13 Tokenization Strategies** - From simple whitespace to complex n-grams
32
+ - **Pattern Preservation** - Maintain domain-specific terms during tokenization (quick example below)
33
+ - **110x Performance Improvement** - Optimized caching and memory management
34
+ - **Thread-Safe** - Designed for concurrent production use
35
+ - **Comprehensive Error Handling** - Clear, actionable error messages
36
+
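+ A quick, illustrative taste of pattern preservation (see the main README for complete usage):
+
+ ```ruby
+ require "tokenkit"
+
+ TokenKit.configure do |config|
+   config.strategy = :unicode
+   config.preserve_patterns = [/anti-\w+/i]  # keep antibody-style names whole
+ end
+
+ # "anti-CD3" survives as a single token instead of being split on the hyphen.
+ TokenKit.tokenize("Treated with anti-CD3 antibodies")
+ ```
+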
37
+ ### Performance Highlights
38
+
39
+ | Metric | Performance | Notes |
40
+ |--------|------------|-------|
41
+ | Basic Unicode | ~870K ops/sec | Baseline performance |
42
+ | With 4 Preserve Patterns | ~410K ops/sec | Was 3.6K before v0.3.0 |
43
+ | Memory Usage | ~500 bytes/op | Minimal overhead |
44
+ | Thread Safety | No degradation | Safe concurrent use |
45
+
46
+ ## 📖 Reading Order
47
+
48
+ For new users:
49
+ 1. [README](../README.md) - Start here
50
+ 2. [Performance Guide](PERFORMANCE.md) - Understand performance characteristics
51
+
52
+ For contributors:
53
+ 1. [Architecture Guide](ARCHITECTURE.md) - Understand the codebase
54
+ 2. [Performance Guide](PERFORMANCE.md) - Optimization techniques
55
+
56
+ ## 🔧 Development Commands
57
+
58
+ ```bash
59
+ # Setup
60
+ bundle install
61
+ bundle exec rake compile
62
+
63
+ # Testing
64
+ bundle exec rspec # Run tests
65
+ COVERAGE=true bundle exec rspec # With coverage report
66
+ bundle exec standardrb # Ruby linting
67
+
68
+ # Documentation
69
+ bundle exec yard doc # Generate API docs
70
+ open doc/index.html # View API docs
71
+
72
+ # Benchmarking
73
+ ruby benchmarks/tokenizer_benchmark.rb # Full benchmark suite
74
+ ruby benchmarks/cache_test.rb # Cache performance
75
+ ruby benchmarks/final_comparison.rb # Before/after comparison
76
+
77
+ # Building
78
+ gem build tokenkit.gemspec # Build gem
79
+ gem install ./tokenkit-*.gem # Install locally
80
+ ```
81
+
82
+ ## 📊 Test Coverage
83
+
84
+ Current coverage (v0.3.0):
85
+ - **Line Coverage**: 94.12%
86
+ - **Branch Coverage**: 87.8%
87
+ - **Total Tests**: 418
88
+
89
+ ## 🏗️ Architecture Overview
90
+
91
+ ```
92
+ TokenKit Architecture
93
+ ├── Ruby Layer (lib/)
94
+ │   ├── Public API (TokenKit module)
95
+ │   ├── Configuration management
96
+ │   └── Instance tokenizers
97
+ ├── Magnus Bridge (FFI)
98
+ │   └── Automatic type conversion
99
+ └── Rust Layer (ext/tokenkit/src/)
100
+     ├── Tokenizer trait
101
+     ├── 13 strategy implementations
102
+     ├── Pattern preservation
103
+     └── Error handling
104
+ ```
105
+
106
+ ## 📝 Version History
107
+
108
+ - **v0.3.0** (Unreleased) - Major performance improvements, proper error handling
109
+ - **v0.2.0** - Added 10 new tokenization strategies
110
+ - **v0.1.0** - Initial release with 3 core strategies
111
+
112
+ ## 📄 License
113
+
114
+ TokenKit is released under the [MIT License](../LICENSE.txt).
115
+
116
+ ---
117
+
118
+ *Last updated: 2025-09-29*
data/ext/tokenkit/Cargo.toml ADDED
@@ -0,0 +1,21 @@
1
+ [package]
2
+ name = "tokenkit"
3
+ version = "0.2.0"
4
+ edition = "2021"
5
+
6
+ [lib]
7
+ crate-type = ["cdylib"]
8
+
9
+ [dependencies]
10
+ magnus = "0.7"
11
+ unicode-segmentation = "1.10"
12
+ regex = "1.10"
13
+ linkify = "0.10"
14
+ serde = { version = "1.0", features = ["derive"] }
15
+ serde_json = "1.0"
16
+ thiserror = "1.0"
17
+ once_cell = "1.19"
18
+
19
+ [profile.release]
20
+ lto = true
21
+ codegen-units = 1
data/ext/tokenkit/extconf.rb ADDED
@@ -0,0 +1,4 @@
1
+ require "mkmf"
2
+ require "rb_sys/mkmf"
3
+
4
+ create_rust_makefile("tokenkit/tokenkit")
data/ext/tokenkit/src/config.rs ADDED
@@ -0,0 +1,37 @@
1
+ use serde::{Deserialize, Serialize};
2
+
3
+ #[derive(Serialize, Deserialize, Clone, Debug)]
4
+ pub struct TokenizerConfig {
5
+ pub strategy: TokenizerStrategy,
6
+ pub lowercase: bool,
7
+ pub remove_punctuation: bool,
8
+ pub preserve_patterns: Vec<String>,
9
+ }
10
+
11
+ #[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
12
+ pub enum TokenizerStrategy {
13
+ Whitespace,
14
+ Unicode,
15
+ Pattern { regex: String },
16
+ Sentence,
17
+ Grapheme { extended: bool },
18
+ Keyword,
19
+ EdgeNgram { min_gram: usize, max_gram: usize },
20
+ Ngram { min_gram: usize, max_gram: usize },
21
+ PathHierarchy { delimiter: String },
22
+ UrlEmail,
23
+ CharGroup { split_on_chars: String },
24
+ Letter,
25
+ Lowercase,
26
+ }
27
+
28
+ impl Default for TokenizerConfig {
29
+ fn default() -> Self {
30
+ Self {
31
+ strategy: TokenizerStrategy::Unicode,
32
+ lowercase: true,
33
+ remove_punctuation: false,
34
+ preserve_patterns: Vec::new(),
35
+ }
36
+ }
37
+ }
data/ext/tokenkit/src/error.rs ADDED
@@ -0,0 +1,67 @@
1
+ use thiserror::Error;
2
+
3
+ #[derive(Error, Debug)]
4
+ pub enum TokenizerError {
5
+ #[error("Invalid configuration: {0}")]
6
+ InvalidConfiguration(String),
7
+
8
+ #[error("Invalid regex pattern '{pattern}': {error}")]
9
+ InvalidRegex {
10
+ pattern: String,
11
+ error: String,
12
+ },
13
+
14
+ #[error("Invalid n-gram configuration: min_gram ({min}) must be > 0 and <= max_gram ({max})")]
15
+ InvalidNgramConfig {
16
+ min: usize,
17
+ max: usize,
18
+ },
19
+
20
+ #[error("Empty delimiter is not allowed for {tokenizer} tokenizer")]
21
+ EmptyDelimiter {
22
+ tokenizer: String,
23
+ },
24
+
25
+ #[error("Unknown tokenizer strategy: {0}")]
26
+ UnknownStrategy(String),
27
+
28
+ #[error("Mutex lock failed: {0}")]
29
+ MutexError(String),
30
+
31
+ #[error("Ruby conversion error: {0}")]
32
+ RubyConversionError(String),
33
+ }
34
+
35
+ impl From<TokenizerError> for magnus::Error {
36
+ fn from(error: TokenizerError) -> Self {
37
+ use magnus::exception;
38
+
39
+ match error {
40
+ TokenizerError::InvalidConfiguration(_) |
41
+ TokenizerError::InvalidNgramConfig { .. } |
42
+ TokenizerError::EmptyDelimiter { .. } |
43
+ TokenizerError::UnknownStrategy(_) => {
44
+ magnus::Error::new(exception::arg_error(), error.to_string())
45
+ }
46
+ TokenizerError::InvalidRegex { .. } => {
47
+ magnus::Error::new(exception::regexp_error(), error.to_string())
48
+ }
49
+ TokenizerError::MutexError(_) => {
50
+ magnus::Error::new(exception::runtime_error(), error.to_string())
51
+ }
52
+ TokenizerError::RubyConversionError(_) => {
53
+ magnus::Error::new(exception::type_error(), error.to_string())
54
+ }
55
+ }
56
+ }
57
+ }
58
+
59
+ // For converting magnus conversion errors to our error type
60
+ impl From<magnus::Error> for TokenizerError {
61
+ fn from(error: magnus::Error) -> Self {
62
+ TokenizerError::RubyConversionError(error.to_string())
63
+ }
64
+ }
65
+
66
+ // Internal result type for Rust functions
67
+ pub type Result<T> = std::result::Result<T, TokenizerError>;