spellkit 0.1.0.pre.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +259 -33
- data/ext/spellkit/Cargo.lock +0 -57
- data/ext/spellkit/Cargo.toml +0 -2
- data/ext/spellkit/src/guards.rs +21 -3
- data/ext/spellkit/src/lib.rs +213 -75
- data/ext/spellkit/src/symspell.rs +115 -30
- data/ext/spellkit/target/debug/build/rb-sys-ead65721880de65e/out/bindings-0.9.117-mri-arm64-darwin24-3.3.0.rs +8902 -0
- data/ext/spellkit/target/debug/incremental/spellkit-07yduakb6espe/s-hbic3f250f-1cel1lt.lock +0 -0
- data/ext/spellkit/target/debug/incremental/spellkit-1d3zzknqc98bj/s-hbic3f250l-011iykk.lock +0 -0
- data/ext/spellkit/target/debug/incremental/spellkit-1pt6om2w642b5/s-hbihepi6zy-1r3p88g.lock +0 -0
- data/ext/spellkit/target/release/build/clang-sys-523e86284ef4dd76/out/common.rs +355 -0
- data/ext/spellkit/target/release/build/clang-sys-523e86284ef4dd76/out/dynamic.rs +276 -0
- data/ext/spellkit/target/release/build/clang-sys-523e86284ef4dd76/out/macros.rs +49 -0
- data/ext/spellkit/target/release/build/rb-sys-7d03ffe964952311/out/bindings-0.9.117-mri-arm64-darwin24-3.3.0.rs +8902 -0
- data/lib/spellkit/version.rb +1 -1
- data/lib/spellkit.rb +176 -31
- metadata +97 -6
- data/LICENSE +0 -21
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6e690fa50208d003679afff3117b6f00664e6938e7feb992eff6a6544fad279e
|
|
4
|
+
data.tar.gz: 17d41414cbd48e093913cfa8765aac063f1cb17283df13414ad801fdf0aa79ae
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '0989bcacde87e9405c99f8674c681be65844a381cac11a00f8f05dd2f1a54312ce9d1799ca6317688255ee5fad454f4670845163b87e6a9541a981fff8685b35'
|
|
7
|
+
data.tar.gz: f12c69803c39ee74083a2090c85aac7bdfe7c39ac81995a95c09ad8c1e70df35cfb8d332de287413accf99aa4b6960e9e0c5f77ccd39b37d4ae092b06d8ea2d4
|
data/README.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
<img src="/docs/assets/spellkit-wide.png" alt="spellkit" height="160px">
|
|
2
2
|
|
|
3
3
|
Fast, safe typo correction for search-term extraction, wrapping the SymSpell algorithm in Rust via Magnus.
|
|
4
4
|
|
|
@@ -9,6 +9,26 @@ SpellKit provides:
|
|
|
9
9
|
- **Sub-millisecond latency** - p95 < 2µs on small dictionaries
|
|
10
10
|
- **Thread-safe** - built with Rust's Arc<RwLock> for safe concurrent access
|
|
11
11
|
|
|
12
|
+
## Why SpellKit?
|
|
13
|
+
|
|
14
|
+
### No Runtime Dependencies
|
|
15
|
+
SpellKit is a Ruby gem with a bundled Rust extension. Just `gem install spellkit` and you're done. No need to install Aspell, Hunspell, or other system packages. This makes deployment simpler and more reliable across different environments.
|
|
16
|
+
|
|
17
|
+
### Fast Performance
|
|
18
|
+
Built on the SymSpell algorithm with Rust, SpellKit delivers:
|
|
19
|
+
- **350,000+ operations/second** for spell checking
|
|
20
|
+
- **3.7x faster** than Aspell for correctness checks
|
|
21
|
+
- **40x faster** than Aspell for generating suggestions
|
|
22
|
+
- **p99 latency < 25µs** even under load
|
|
23
|
+
|
|
24
|
+
See the [Benchmarks](#benchmarks) section for detailed comparisons.
|
|
25
|
+
|
|
26
|
+
### Production Ready
|
|
27
|
+
- Thread-safe concurrent access
|
|
28
|
+
- Hot reload dictionaries without restarts
|
|
29
|
+
- Instance-based API for multi-domain support
|
|
30
|
+
- Comprehensive error handling
|
|
31
|
+
|
|
12
32
|
## Installation
|
|
13
33
|
|
|
14
34
|
Add to your Gemfile:
|
|
@@ -42,13 +62,17 @@ end
|
|
|
42
62
|
# Or load from local file
|
|
43
63
|
# SpellKit.load!(dictionary: "path/to/dictionary.tsv")
|
|
44
64
|
|
|
65
|
+
# Check if a word is spelled correctly
|
|
66
|
+
puts SpellKit.correct?("hello")
|
|
67
|
+
# => true
|
|
68
|
+
|
|
45
69
|
# Get suggestions for a misspelled word
|
|
46
|
-
suggestions = SpellKit.
|
|
70
|
+
suggestions = SpellKit.suggestions("helllo", 5)
|
|
47
71
|
puts suggestions.inspect
|
|
48
72
|
# => [{"term"=>"hello", "distance"=>1, "freq"=>...}]
|
|
49
73
|
|
|
50
74
|
# Correct a typo
|
|
51
|
-
corrected = SpellKit.
|
|
75
|
+
corrected = SpellKit.correct("helllo")
|
|
52
76
|
puts corrected
|
|
53
77
|
# => "hello"
|
|
54
78
|
|
|
@@ -76,16 +100,20 @@ SpellKit.load!(dictionary: "https://example.com/dict.tsv")
|
|
|
76
100
|
# Or from local file
|
|
77
101
|
SpellKit.load!(dictionary: "models/dictionary.tsv", edit_distance: 1)
|
|
78
102
|
|
|
103
|
+
# Check if a word is correct
|
|
104
|
+
SpellKit.correct?("hello")
|
|
105
|
+
# => true
|
|
106
|
+
|
|
79
107
|
# Get suggestions
|
|
80
|
-
SpellKit.
|
|
108
|
+
SpellKit.suggestions("lyssis", 5)
|
|
81
109
|
# => [{"term"=>"lysis", "distance"=>1, "freq"=>2000}, ...]
|
|
82
110
|
|
|
83
111
|
# Correct a typo
|
|
84
|
-
SpellKit.
|
|
112
|
+
SpellKit.correct("helllo")
|
|
85
113
|
# => "hello"
|
|
86
114
|
|
|
87
115
|
# Batch correction
|
|
88
|
-
tokens = %w[
|
|
116
|
+
tokens = %w[helllo wrld ruby]
|
|
89
117
|
SpellKit.correct_tokens(tokens)
|
|
90
118
|
# => ["hello", "world", "ruby"]
|
|
91
119
|
```
|
|
@@ -118,13 +146,13 @@ SpellKit.load!(
|
|
|
118
146
|
protected_patterns: [/^[A-Z]{3,4}\d+$/]
|
|
119
147
|
)
|
|
120
148
|
|
|
121
|
-
#
|
|
122
|
-
SpellKit.
|
|
149
|
+
# Protected terms are automatically respected
|
|
150
|
+
SpellKit.correct("CDK10")
|
|
123
151
|
# => "CDK10" # protected, never changed
|
|
124
152
|
|
|
125
|
-
# Batch correction with
|
|
126
|
-
tokens = %w[
|
|
127
|
-
SpellKit.correct_tokens(tokens
|
|
153
|
+
# Batch correction with protection
|
|
154
|
+
tokens = %w[helllo wrld ABC-123 for CDK10]
|
|
155
|
+
SpellKit.correct_tokens(tokens)
|
|
128
156
|
# => ["hello", "world", "ABC-123", "for", "CDK10"]
|
|
129
157
|
```
|
|
130
158
|
|
|
@@ -147,8 +175,8 @@ legal_checker.load!(
|
|
|
147
175
|
)
|
|
148
176
|
|
|
149
177
|
# Use them independently
|
|
150
|
-
medical_checker.
|
|
151
|
-
legal_checker.
|
|
178
|
+
medical_checker.suggestions("lyssis", 5)
|
|
179
|
+
legal_checker.suggestions("contractt", 5)
|
|
152
180
|
|
|
153
181
|
# Each maintains its own state
|
|
154
182
|
medical_checker.stats # Shows medical dictionary stats
|
|
@@ -169,7 +197,7 @@ SpellKit.configure do |config|
|
|
|
169
197
|
end
|
|
170
198
|
|
|
171
199
|
# This becomes the default instance
|
|
172
|
-
SpellKit.
|
|
200
|
+
SpellKit.suggestions("word", 5) # Uses configured dictionary
|
|
173
201
|
```
|
|
174
202
|
|
|
175
203
|
## Dictionary Format
|
|
@@ -238,10 +266,96 @@ SpellKit.load!(
|
|
|
238
266
|
protected_path: "models/protected.txt", # optional
|
|
239
267
|
protected_patterns: [/^[A-Z]{3,4}\d+$/], # optional
|
|
240
268
|
edit_distance: 1, # 1 (default) or 2
|
|
241
|
-
frequency_threshold: 10.0
|
|
269
|
+
frequency_threshold: 10.0, # default: 10.0 (minimum frequency for corrections)
|
|
270
|
+
|
|
271
|
+
# Skip pattern filters (all default to false)
|
|
272
|
+
skip_urls: true, # Skip URLs (http://, https://, www.)
|
|
273
|
+
skip_emails: true, # Skip email addresses
|
|
274
|
+
skip_hostnames: true, # Skip hostnames (example.com)
|
|
275
|
+
skip_code_patterns: true, # Skip code identifiers (camelCase, snake_case, etc.)
|
|
276
|
+
skip_numbers: true # Skip numeric patterns (versions, IDs, measurements)
|
|
242
277
|
)
|
|
243
278
|
```
|
|
244
279
|
|
|
280
|
+
### Frequency Threshold
|
|
281
|
+
|
|
282
|
+
The `frequency_threshold` parameter controls which corrections are accepted by `correct` and `correct_tokens`:
|
|
283
|
+
|
|
284
|
+
- **For misspelled words** (not in dictionary): Only suggest corrections with frequency ≥ `frequency_threshold`
|
|
285
|
+
- **For dictionary words**: Only suggest alternatives with frequency ≥ `frequency_threshold × original_frequency`
|
|
286
|
+
|
|
287
|
+
This prevents suggesting rare words as corrections for common typos.
|
|
288
|
+
|
|
289
|
+
**Example:**
|
|
290
|
+
```ruby
|
|
291
|
+
# With default threshold (10.0), suggest any correction with freq ≥ 10
|
|
292
|
+
SpellKit.load!(dictionary: "dict.tsv")
|
|
293
|
+
SpellKit.correct("helllo") # => "hello" (if freq ≥ 10)
|
|
294
|
+
|
|
295
|
+
# With high threshold (1000.0), only suggest common corrections
|
|
296
|
+
SpellKit.load!(dictionary: "dict.tsv", frequency_threshold: 1000.0)
|
|
297
|
+
SpellKit.correct("helllo") # => "hello" (if freq ≥ 1000)
|
|
298
|
+
SpellKit.correct("rarword") # => "rarword" (no correction if freq < 1000)
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
### Skip Patterns
|
|
302
|
+
|
|
303
|
+
SpellKit can automatically skip certain patterns to avoid "correcting" technical terms, URLs, and other special content. Inspired by Aspell's filter modes, these patterns are automatically applied when configured.
|
|
304
|
+
|
|
305
|
+
**Available skip patterns:**
|
|
306
|
+
|
|
307
|
+
```ruby
|
|
308
|
+
SpellKit.load!(
|
|
309
|
+
dictionary: "dict.tsv",
|
|
310
|
+
skip_urls: true, # Skip URLs: https://example.com, www.example.com
|
|
311
|
+
skip_emails: true, # Skip emails: user@domain.com, admin+tag@example.com
|
|
312
|
+
skip_hostnames: true, # Skip hostnames: example.com, api.example.com
|
|
313
|
+
skip_code_patterns: true, # Skip code: camelCase, snake_case, PascalCase, dotted.paths
|
|
314
|
+
skip_numbers: true # Skip numbers: 1.2.3, #123, 5kg, 100mb
|
|
315
|
+
)
|
|
316
|
+
|
|
317
|
+
# With skip patterns enabled, technical content is preserved
|
|
318
|
+
SpellKit.correct("https://example.com") # => "https://example.com"
|
|
319
|
+
SpellKit.correct("user@test.com") # => "user@test.com"
|
|
320
|
+
SpellKit.correct("getElementById") # => "getElementById"
|
|
321
|
+
SpellKit.correct("version-1.2.3") # => "version-1.2.3"
|
|
322
|
+
|
|
323
|
+
# Regular typos are still corrected
|
|
324
|
+
SpellKit.correct("helllo") # => "hello"
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
**What each skip pattern matches:**
|
|
328
|
+
|
|
329
|
+
- **`skip_urls`**: `http://`, `https://`, `www.` URLs
|
|
330
|
+
- **`skip_emails`**: Email addresses with standard formats including `+` and `.` in usernames
|
|
331
|
+
- **`skip_hostnames`**: Domain names like `example.com`, `api.example.co.uk`
|
|
332
|
+
- **`skip_code_patterns`**:
|
|
333
|
+
- `camelCase` (starts lowercase)
|
|
334
|
+
- `PascalCase` (starts uppercase, mixed case)
|
|
335
|
+
- `snake_case` and `SCREAMING_SNAKE_CASE`
|
|
336
|
+
- `dotted.paths` like `Array.map` or `config.yml`
|
|
337
|
+
- **`skip_numbers`**:
|
|
338
|
+
- Version numbers: `1.0`, `2.5.3`, `10.15.7.1`
|
|
339
|
+
- Hash/IDs: `#123`, `#4567`
|
|
340
|
+
- Measurements: `5kg`, `2.5m`, `100mb`, `16px`
|
|
341
|
+
- Words starting with digits: `5test`, `123abc`
|
|
342
|
+
|
|
343
|
+
**Combining with protected_patterns:**
|
|
344
|
+
|
|
345
|
+
Skip patterns work alongside your custom `protected_patterns`:
|
|
346
|
+
|
|
347
|
+
```ruby
|
|
348
|
+
SpellKit.load!(
|
|
349
|
+
dictionary: "dict.tsv",
|
|
350
|
+
skip_urls: true, # Built-in URL skipping
|
|
351
|
+
protected_patterns: [/^CUSTOM-\d+$/] # Your custom patterns
|
|
352
|
+
)
|
|
353
|
+
|
|
354
|
+
# Both work together automatically
|
|
355
|
+
SpellKit.correct("https://example.com") # => "https://example.com" (skip_urls)
|
|
356
|
+
SpellKit.correct("CUSTOM-123") # => "CUSTOM-123" (custom pattern)
|
|
357
|
+
```
|
|
358
|
+
|
|
245
359
|
## API Reference
|
|
246
360
|
|
|
247
361
|
### `SpellKit.load!(**options)`
|
|
@@ -254,12 +368,24 @@ Load or reload dictionaries. Thread-safe atomic swap. Accepts URLs (auto-downloa
|
|
|
254
368
|
- `protected_patterns:` (optional) - Array of Regexp or String patterns to protect
|
|
255
369
|
- `edit_distance:` (default: 1) - Maximum edit distance (1 or 2)
|
|
256
370
|
- `frequency_threshold:` (default: 10.0) - Minimum frequency for corrections (for words already in the dictionary, applied as a multiple of the original word's frequency)
|
|
371
|
+
- `skip_urls:` (default: false) - Skip URLs (http://, https://, www.)
|
|
372
|
+
- `skip_emails:` (default: false) - Skip email addresses
|
|
373
|
+
- `skip_hostnames:` (default: false) - Skip hostnames (example.com)
|
|
374
|
+
- `skip_code_patterns:` (default: false) - Skip code identifiers (camelCase, snake_case, etc.)
|
|
375
|
+
- `skip_numbers:` (default: false) - Skip numeric patterns (versions, IDs, measurements)
|
|
257
376
|
|
|
258
377
|
**Examples:**
|
|
259
378
|
```ruby
|
|
260
379
|
# From URL (recommended for getting started)
|
|
261
380
|
SpellKit.load!(dictionary: SpellKit::DEFAULT_DICTIONARY_URL)
|
|
262
381
|
|
|
382
|
+
# With skip patterns for technical content
|
|
383
|
+
SpellKit.load!(
|
|
384
|
+
dictionary: SpellKit::DEFAULT_DICTIONARY_URL,
|
|
385
|
+
skip_urls: true,
|
|
386
|
+
skip_code_patterns: true
|
|
387
|
+
)
|
|
388
|
+
|
|
263
389
|
# From custom URL
|
|
264
390
|
SpellKit.load!(dictionary: "https://example.com/dict.tsv")
|
|
265
391
|
|
|
@@ -267,7 +393,24 @@ SpellKit.load!(dictionary: "https://example.com/dict.tsv")
|
|
|
267
393
|
SpellKit.load!(dictionary: "/path/to/dictionary.tsv")
|
|
268
394
|
```
|
|
269
395
|
|
|
270
|
-
### `SpellKit.
|
|
396
|
+
### `SpellKit.correct?(word)`
|
|
397
|
+
|
|
398
|
+
Check if a word is spelled correctly (exact dictionary match).
|
|
399
|
+
|
|
400
|
+
**Parameters:**
|
|
401
|
+
- `word` (required) - The word to check
|
|
402
|
+
|
|
403
|
+
**Returns:** Boolean - true if word exists in dictionary, false otherwise
|
|
404
|
+
|
|
405
|
+
**Performance:** Very fast O(1) HashMap lookup. Use this instead of `suggestions()` when you only need to check correctness.
|
|
406
|
+
|
|
407
|
+
**Example:**
|
|
408
|
+
```ruby
|
|
409
|
+
SpellKit.correct?("hello") # => true
|
|
410
|
+
SpellKit.correct?("helllo") # => false
|
|
411
|
+
```
|
|
412
|
+
|
|
413
|
+
### `SpellKit.suggestions(word, max = 5)`
|
|
271
414
|
|
|
272
415
|
Get ranked suggestions for a word.
|
|
273
416
|
|
|
@@ -277,16 +420,38 @@ Get ranked suggestions for a word.
|
|
|
277
420
|
|
|
278
421
|
**Returns:** Array of hashes with `"term"`, `"distance"`, and `"freq"` keys
|
|
279
422
|
|
|
280
|
-
|
|
423
|
+
**Example:**
|
|
424
|
+
```ruby
|
|
425
|
+
SpellKit.suggestions("helllo", 5)
|
|
426
|
+
# => [{"term"=>"hello", "distance"=>1, "freq"=>10000}, ...]
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
### `SpellKit.correct(word)`
|
|
281
430
|
|
|
282
|
-
Return corrected word or original if no better match found.
|
|
431
|
+
Return corrected word or original if no better match found. Respects `frequency_threshold` configuration. Protected terms and skip patterns are automatically applied when configured.
|
|
283
432
|
|
|
284
|
-
**
|
|
285
|
-
- `
|
|
433
|
+
**Parameters:**
|
|
434
|
+
- `word` (required) - The word to correct
|
|
435
|
+
|
|
436
|
+
**Behavior:**
|
|
437
|
+
- Returns original word if it exists in dictionary
|
|
438
|
+
- For misspellings, only accepts corrections with frequency ≥ `frequency_threshold`
|
|
439
|
+
- Returns original word if no corrections pass the threshold
|
|
440
|
+
- Automatically respects protected terms and skip patterns configured in `load!`
|
|
441
|
+
|
|
442
|
+
**Example:**
|
|
443
|
+
```ruby
|
|
444
|
+
SpellKit.correct("helllo") # => "hello"
|
|
445
|
+
SpellKit.correct("hello") # => "hello" (already correct)
|
|
446
|
+
SpellKit.correct("CDK10") # => "CDK10" (protected if configured)
|
|
447
|
+
```
|
|
286
448
|
|
|
287
|
-
### `SpellKit.correct_tokens(tokens
|
|
449
|
+
### `SpellKit.correct_tokens(tokens)`
|
|
288
450
|
|
|
289
|
-
Batch correction of an array of tokens.
|
|
451
|
+
Batch correction of an array of tokens. Respects `frequency_threshold` configuration. Protected terms and skip patterns are automatically applied when configured.
|
|
452
|
+
|
|
453
|
+
**Parameters:**
|
|
454
|
+
- `tokens` (required) - Array of words to correct
|
|
290
455
|
|
|
291
456
|
**Returns:** Array of corrected strings
|
|
292
457
|
|
|
@@ -306,7 +471,7 @@ Verify system is properly loaded. Raises error if not.
|
|
|
306
471
|
|
|
307
472
|
## Term Protection
|
|
308
473
|
|
|
309
|
-
|
|
474
|
+
When configured, SpellKit automatically protects specific terms from correction:
|
|
310
475
|
|
|
311
476
|
### Exact Matches
|
|
312
477
|
Terms in `protected_path` file are never corrected, even if similar dictionary words exist. Matching is case-insensitive, but original casing is preserved in output.
|
|
@@ -370,24 +535,85 @@ end
|
|
|
370
535
|
class SearchPreprocessor
|
|
371
536
|
def self.correct_query(text)
|
|
372
537
|
tokens = text.downcase.split(/\s+/)
|
|
373
|
-
SpellKit.correct_tokens(tokens
|
|
538
|
+
SpellKit.correct_tokens(tokens).join(" ")
|
|
374
539
|
end
|
|
375
540
|
end
|
|
376
541
|
```
|
|
377
542
|
|
|
378
543
|
## Performance
|
|
379
544
|
|
|
380
|
-
|
|
545
|
+
### SpellKit Standalone (M1 MacBook Pro, Ruby 3.3.0, 80k dictionary)
|
|
546
|
+
|
|
547
|
+
**Single Word Suggestions:**
|
|
548
|
+
- 3,345 i/s (298.96 μs/i) with max: 1 suggestion
|
|
549
|
+
- 3,198 i/s (312.73 μs/i) with max: 5 suggestions
|
|
550
|
+
- 3,073 i/s (325.45 μs/i) with max: 10 suggestions
|
|
551
|
+
|
|
552
|
+
**Correction Performance:**
|
|
553
|
+
- `correct`: 1,858 i/s (538.17 μs/i)
|
|
554
|
+
- `correct_tokens` (batch): 2,005 i/s (498.76 μs/i)
|
|
555
|
+
|
|
556
|
+
**Protection Performance:**
|
|
557
|
+
- Without protection: 2,926 i/s (341.79 μs/i)
|
|
558
|
+
- With protection: 9,337 i/s (107.10 μs/i) - **3.19x faster!**
|
|
559
|
+
*(Protection checks short-circuit expensive dictionary lookups)*
|
|
560
|
+
|
|
561
|
+
**Latency Distribution (10,000 iterations):**
|
|
562
|
+
- p50: 61μs
|
|
563
|
+
- p95: 66μs
|
|
564
|
+
- p99: 105μs
|
|
565
|
+
- max: 298μs
|
|
566
|
+
|
|
567
|
+
**Raw Throughput:** 16,192 ops/sec
|
|
568
|
+
|
|
569
|
+
### Key Takeaways
|
|
570
|
+
1. **Consistent Performance**: p95 latency of 66μs with 80k dictionary, p99 at 105μs
|
|
571
|
+
2. **Guards are Fast**: Protected term checks improve performance by 3.2x by avoiding dictionary lookups
|
|
572
|
+
3. **High Throughput**: Over 16k operations per second with 80k word dictionary
|
|
573
|
+
4. **Scales Well**: Minimal performance difference between 1 vs 10 suggestions
|
|
574
|
+
|
|
575
|
+
## Benchmarks
|
|
576
|
+
|
|
577
|
+
SpellKit includes comprehensive benchmarks to measure performance and compare with other spell checkers.
|
|
578
|
+
|
|
579
|
+
### Running Benchmarks
|
|
580
|
+
|
|
581
|
+
**Performance Benchmark** - Comprehensive SpellKit performance analysis:
|
|
582
|
+
```bash
|
|
583
|
+
bundle exec ruby benchmark/performance.rb
|
|
584
|
+
```
|
|
585
|
+
|
|
586
|
+
Measures:
|
|
587
|
+
- Single word suggestions with varying result limits
|
|
588
|
+
- Correction performance on mixed datasets
|
|
589
|
+
- Batch correction throughput
|
|
590
|
+
- Guard/protection overhead
|
|
591
|
+
- Latency distribution (p50, p95, p99)
|
|
592
|
+
- Raw throughput (ops/sec)
|
|
593
|
+
|
|
594
|
+
**Aspell Comparison** - Direct comparison with Aspell:
|
|
595
|
+
```bash
|
|
596
|
+
# First install Aspell if needed:
|
|
597
|
+
# macOS: brew install aspell
|
|
598
|
+
# Ubuntu: sudo apt-get install aspell libaspell-dev
|
|
599
|
+
|
|
600
|
+
bundle exec ruby benchmark/comparison_aspell.rb
|
|
601
|
+
```
|
|
602
|
+
|
|
603
|
+
Compares SpellKit with Aspell on:
|
|
604
|
+
- Single word correction performance
|
|
605
|
+
- Spell checking (correctness tests)
|
|
606
|
+
- Latency distribution at scale
|
|
607
|
+
|
|
608
|
+
See [benchmark/README.md](benchmark/README.md) for detailed results and analysis.
|
|
609
|
+
|
|
610
|
+
### Why These Benchmarks?
|
|
381
611
|
|
|
382
|
-
|
|
383
|
-
- **
|
|
384
|
-
- **
|
|
385
|
-
- **Memory**: ~150MB for 1M term dictionary (estimated)
|
|
612
|
+
**SpellKit vs Aspell**: Both provide fuzzy matching and suggestions for misspelled words, but use different algorithms:
|
|
613
|
+
- **SpellKit (SymSpell)**: O(1) lookup complexity, optimized for speed with large dictionaries
|
|
614
|
+
- **Aspell**: Statistical scoring with phonetic similarity, good for natural language
|
|
386
615
|
|
|
387
|
-
|
|
388
|
-
- Load: < 500ms
|
|
389
|
-
- p50: < 30µs, p95: < 100µs
|
|
390
|
-
- Memory: 50-150MB
|
|
616
|
+
The comparison shows SpellKit's performance advantage while solving the same problem.
|
|
391
617
|
|
|
392
618
|
## Building Dictionaries
|
|
393
619
|
|
data/ext/spellkit/Cargo.lock
CHANGED
|
@@ -113,12 +113,6 @@ dependencies = [
|
|
|
113
113
|
"either",
|
|
114
114
|
]
|
|
115
115
|
|
|
116
|
-
[[package]]
|
|
117
|
-
name = "itoa"
|
|
118
|
-
version = "1.0.15"
|
|
119
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
120
|
-
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
|
121
|
-
|
|
122
116
|
[[package]]
|
|
123
117
|
name = "lazy_static"
|
|
124
118
|
version = "1.5.0"
|
|
@@ -275,61 +269,12 @@ version = "1.1.0"
|
|
|
275
269
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
276
270
|
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
|
277
271
|
|
|
278
|
-
[[package]]
|
|
279
|
-
name = "ryu"
|
|
280
|
-
version = "1.0.20"
|
|
281
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
282
|
-
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
|
|
283
|
-
|
|
284
272
|
[[package]]
|
|
285
273
|
name = "seq-macro"
|
|
286
274
|
version = "0.3.6"
|
|
287
275
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
288
276
|
checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"
|
|
289
277
|
|
|
290
|
-
[[package]]
|
|
291
|
-
name = "serde"
|
|
292
|
-
version = "1.0.227"
|
|
293
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
294
|
-
checksum = "80ece43fc6fbed4eb5392ab50c07334d3e577cbf40997ee896fe7af40bba4245"
|
|
295
|
-
dependencies = [
|
|
296
|
-
"serde_core",
|
|
297
|
-
"serde_derive",
|
|
298
|
-
]
|
|
299
|
-
|
|
300
|
-
[[package]]
|
|
301
|
-
name = "serde_core"
|
|
302
|
-
version = "1.0.227"
|
|
303
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
304
|
-
checksum = "7a576275b607a2c86ea29e410193df32bc680303c82f31e275bbfcafe8b33be5"
|
|
305
|
-
dependencies = [
|
|
306
|
-
"serde_derive",
|
|
307
|
-
]
|
|
308
|
-
|
|
309
|
-
[[package]]
|
|
310
|
-
name = "serde_derive"
|
|
311
|
-
version = "1.0.227"
|
|
312
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
313
|
-
checksum = "51e694923b8824cf0e9b382adf0f60d4e05f348f357b38833a3fa5ed7c2ede04"
|
|
314
|
-
dependencies = [
|
|
315
|
-
"proc-macro2",
|
|
316
|
-
"quote",
|
|
317
|
-
"syn",
|
|
318
|
-
]
|
|
319
|
-
|
|
320
|
-
[[package]]
|
|
321
|
-
name = "serde_json"
|
|
322
|
-
version = "1.0.145"
|
|
323
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
324
|
-
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
|
|
325
|
-
dependencies = [
|
|
326
|
-
"itoa",
|
|
327
|
-
"memchr",
|
|
328
|
-
"ryu",
|
|
329
|
-
"serde",
|
|
330
|
-
"serde_core",
|
|
331
|
-
]
|
|
332
|
-
|
|
333
278
|
[[package]]
|
|
334
279
|
name = "shell-words"
|
|
335
280
|
version = "1.1.0"
|
|
@@ -349,8 +294,6 @@ dependencies = [
|
|
|
349
294
|
"hashbrown",
|
|
350
295
|
"magnus",
|
|
351
296
|
"regex",
|
|
352
|
-
"serde",
|
|
353
|
-
"serde_json",
|
|
354
297
|
"unicode-normalization",
|
|
355
298
|
]
|
|
356
299
|
|
data/ext/spellkit/Cargo.toml
CHANGED
data/ext/spellkit/src/guards.rs
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
use hashbrown::HashSet;
|
|
2
|
-
use regex::Regex;
|
|
2
|
+
use regex::{Regex, RegexBuilder};
|
|
3
|
+
use crate::symspell::SymSpell;
|
|
3
4
|
|
|
4
5
|
#[derive(Debug, Clone)]
|
|
5
6
|
pub struct Guards {
|
|
@@ -19,14 +20,31 @@ impl Guards {
|
|
|
19
20
|
for line in content.lines() {
|
|
20
21
|
let trimmed = line.trim();
|
|
21
22
|
if !trimmed.is_empty() && !trimmed.starts_with('#') {
|
|
23
|
+
// Store literal form
|
|
22
24
|
self.protected_set.insert(trimmed.to_string());
|
|
25
|
+
// Store lowercase form
|
|
23
26
|
self.protected_set.insert(trimmed.to_lowercase());
|
|
27
|
+
// Store normalized form (strips whitespace, converts to lowercase)
|
|
28
|
+
// This ensures variants like "newyork" are protected if "New York" is in the list
|
|
29
|
+
let normalized = SymSpell::normalize_word(trimmed);
|
|
30
|
+
self.protected_set.insert(normalized);
|
|
24
31
|
}
|
|
25
32
|
}
|
|
26
33
|
}
|
|
27
34
|
|
|
28
|
-
pub fn
|
|
29
|
-
|
|
35
|
+
pub fn add_pattern_with_flags(
|
|
36
|
+
&mut self,
|
|
37
|
+
pattern: &str,
|
|
38
|
+
case_insensitive: bool,
|
|
39
|
+
multiline: bool,
|
|
40
|
+
extended: bool,
|
|
41
|
+
) -> Result<(), String> {
|
|
42
|
+
match RegexBuilder::new(pattern)
|
|
43
|
+
.case_insensitive(case_insensitive)
|
|
44
|
+
.multi_line(multiline)
|
|
45
|
+
.ignore_whitespace(extended)
|
|
46
|
+
.build()
|
|
47
|
+
{
|
|
30
48
|
Ok(regex) => {
|
|
31
49
|
self.protected_patterns.push(regex);
|
|
32
50
|
Ok(())
|