spellkit 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +55 -33
- data/ext/spellkit/Cargo.toml +1 -1
- data/ext/spellkit/src/lib.rs +10 -14
- data/lib/spellkit/version.rb +1 -1
- data/lib/spellkit.rb +8 -10
- metadata +6 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e36701637cedd91531b05d0b18680e57d26b9aae7e117d6a179f27d30a61e444
|
|
4
|
+
data.tar.gz: 560114d91574153e6b618c509a705bbcd82969e4a437aee777fa2ee7f10babcc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6cc82708564b26e943b4bc394ec41c13aa59c3ff7388027d113001cc8f8f8e83f219795b5547ab2ed318c9ab89733efbfbf3b937cdcf75d70a51836d5c2e4283
|
|
7
|
+
data.tar.gz: a6903fb208c64dc53bd07ee2b183e4a5bfe552d038ee87129e6afe3f32b14197c0d6546139abeada777ff98db3ee2466c86dd158561bf8d9d8beedacc6b7b335
|
data/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<img src="/docs/assets/spellkit-wide.png" alt="spellkit" height="160px">
|
|
2
2
|
|
|
3
|
-
Fast, safe typo correction for search-term extraction
|
|
3
|
+
Fast, safe typo correction for search-term extraction. A Ruby gem with a native Rust implementation of the SymSpell algorithm.
|
|
4
4
|
|
|
5
5
|
SpellKit provides:
|
|
6
6
|
- **Fast correction** using SymSpell with configurable edit distance (1 or 2)
|
|
@@ -9,6 +9,8 @@ SpellKit provides:
|
|
|
9
9
|
- **Sub-millisecond latency** - p95 < 2µs on small dictionaries
|
|
10
10
|
- **Thread-safe** - built with Rust's Arc<RwLock> for safe concurrent access
|
|
11
11
|
|
|
12
|
+
**Why a custom implementation?** Existing Rust SymSpell crates require lowercase dictionary entries, but SpellKit preserves canonical forms (NASA stays NASA, iPhone stays iPhone). We also needed domain-specific guards, hot-reload, and Aspell-style skip patterns - features not available in existing implementations.
|
|
13
|
+
|
|
12
14
|
## Why SpellKit?
|
|
13
15
|
|
|
14
16
|
### No Runtime Dependencies
|
|
@@ -146,13 +148,13 @@ SpellKit.load!(
|
|
|
146
148
|
protected_patterns: [/^[A-Z]{3,4}\d+$/]
|
|
147
149
|
)
|
|
148
150
|
|
|
149
|
-
#
|
|
150
|
-
SpellKit.correct("CDK10"
|
|
151
|
+
# Protected terms are automatically respected
|
|
152
|
+
SpellKit.correct("CDK10")
|
|
151
153
|
# => "CDK10" # protected, never changed
|
|
152
154
|
|
|
153
|
-
# Batch correction with
|
|
155
|
+
# Batch correction with protection
|
|
154
156
|
tokens = %w[helllo wrld ABC-123 for CDK10]
|
|
155
|
-
SpellKit.correct_tokens(tokens
|
|
157
|
+
SpellKit.correct_tokens(tokens)
|
|
156
158
|
# => ["hello", "world", "ABC-123", "for", "CDK10"]
|
|
157
159
|
```
|
|
158
160
|
|
|
@@ -300,7 +302,7 @@ SpellKit.correct("rarword") # => "rarword" (no correction if freq < 1000)
|
|
|
300
302
|
|
|
301
303
|
### Skip Patterns
|
|
302
304
|
|
|
303
|
-
SpellKit can automatically skip certain patterns to avoid "correcting" technical terms, URLs, and other special content. Inspired by Aspell's filter modes, these patterns are applied when
|
|
305
|
+
SpellKit can automatically skip certain patterns to avoid "correcting" technical terms, URLs, and other special content. Inspired by Aspell's filter modes, these patterns are automatically applied when configured.
|
|
304
306
|
|
|
305
307
|
**Available skip patterns:**
|
|
306
308
|
|
|
@@ -315,13 +317,13 @@ SpellKit.load!(
|
|
|
315
317
|
)
|
|
316
318
|
|
|
317
319
|
# With skip patterns enabled, technical content is preserved
|
|
318
|
-
SpellKit.correct("https://example.com"
|
|
319
|
-
SpellKit.correct("user@test.com"
|
|
320
|
-
SpellKit.correct("getElementById"
|
|
321
|
-
SpellKit.correct("version-1.2.3"
|
|
320
|
+
SpellKit.correct("https://example.com") # => "https://example.com"
|
|
321
|
+
SpellKit.correct("user@test.com") # => "user@test.com"
|
|
322
|
+
SpellKit.correct("getElementById") # => "getElementById"
|
|
323
|
+
SpellKit.correct("version-1.2.3") # => "version-1.2.3"
|
|
322
324
|
|
|
323
325
|
# Regular typos are still corrected
|
|
324
|
-
SpellKit.correct("helllo"
|
|
326
|
+
SpellKit.correct("helllo") # => "hello"
|
|
325
327
|
```
|
|
326
328
|
|
|
327
329
|
**What each skip pattern matches:**
|
|
@@ -351,9 +353,9 @@ SpellKit.load!(
|
|
|
351
353
|
protected_patterns: [/^CUSTOM-\d+$/] # Your custom patterns
|
|
352
354
|
)
|
|
353
355
|
|
|
354
|
-
# Both work together
|
|
355
|
-
SpellKit.correct("https://example.com"
|
|
356
|
-
SpellKit.correct("CUSTOM-123"
|
|
356
|
+
# Both work together automatically
|
|
357
|
+
SpellKit.correct("https://example.com") # => "https://example.com" (skip_urls)
|
|
358
|
+
SpellKit.correct("CUSTOM-123") # => "CUSTOM-123" (custom pattern)
|
|
357
359
|
```
|
|
358
360
|
|
|
359
361
|
## API Reference
|
|
@@ -426,33 +428,32 @@ SpellKit.suggestions("helllo", 5)
|
|
|
426
428
|
# => [{"term"=>"hello", "distance"=>1, "freq"=>10000}, ...]
|
|
427
429
|
```
|
|
428
430
|
|
|
429
|
-
### `SpellKit.correct(word
|
|
431
|
+
### `SpellKit.correct(word)`
|
|
430
432
|
|
|
431
|
-
Return corrected word or original if no better match found. Respects `frequency_threshold` configuration.
|
|
433
|
+
Returns the corrected word, or the original word if no better match is found. Respects the `frequency_threshold` configuration. Protected terms and skip patterns are automatically applied when configured.
|
|
432
434
|
|
|
433
435
|
**Parameters:**
|
|
434
436
|
- `word` (required) - The word to correct
|
|
435
|
-
- `guard:` (optional) - Set to `:domain` to enable protection checks
|
|
436
437
|
|
|
437
438
|
**Behavior:**
|
|
438
439
|
- Returns original word if it exists in dictionary
|
|
439
440
|
- For misspellings, only accepts corrections with frequency ≥ `frequency_threshold`
|
|
440
441
|
- Returns original word if no corrections pass the threshold
|
|
441
|
-
-
|
|
442
|
+
- Automatically respects protected terms and skip patterns configured in `load!`
|
|
442
443
|
|
|
443
444
|
**Example:**
|
|
444
445
|
```ruby
|
|
445
|
-
SpellKit.correct("helllo")
|
|
446
|
-
SpellKit.correct("hello")
|
|
447
|
-
SpellKit.correct("CDK10"
|
|
446
|
+
SpellKit.correct("helllo") # => "hello"
|
|
447
|
+
SpellKit.correct("hello") # => "hello" (already correct)
|
|
448
|
+
SpellKit.correct("CDK10") # => "CDK10" (protected if configured)
|
|
448
449
|
```
|
|
449
450
|
|
|
450
|
-
### `SpellKit.correct_tokens(tokens
|
|
451
|
+
### `SpellKit.correct_tokens(tokens)`
|
|
451
452
|
|
|
452
|
-
Batch correction of an array of tokens. Respects `frequency_threshold` configuration.
|
|
453
|
+
Batch correction of an array of tokens. Respects `frequency_threshold` configuration. Protected terms and skip patterns are automatically applied when configured.
|
|
453
454
|
|
|
454
|
-
**
|
|
455
|
-
- `
|
|
455
|
+
**Parameters:**
|
|
456
|
+
- `tokens` (required) - Array of words to correct
|
|
456
457
|
|
|
457
458
|
**Returns:** Array of corrected strings
|
|
458
459
|
|
|
@@ -472,7 +473,7 @@ Verify system is properly loaded. Raises error if not.
|
|
|
472
473
|
|
|
473
474
|
## Term Protection
|
|
474
475
|
|
|
475
|
-
|
|
476
|
+
When configured, SpellKit automatically protects specific terms from correction:
|
|
476
477
|
|
|
477
478
|
### Exact Matches
|
|
478
479
|
Terms in the `protected_path` file are never corrected, even if similar dictionary words exist. Matching is case-insensitive, but original casing is preserved in output.
|
|
@@ -536,14 +537,14 @@ end
|
|
|
536
537
|
class SearchPreprocessor
|
|
537
538
|
def self.correct_query(text)
|
|
538
539
|
tokens = text.downcase.split(/\s+/)
|
|
539
|
-
SpellKit.correct_tokens(tokens
|
|
540
|
+
SpellKit.correct_tokens(tokens).join(" ")
|
|
540
541
|
end
|
|
541
542
|
end
|
|
542
543
|
```
|
|
543
544
|
|
|
544
545
|
## Performance
|
|
545
546
|
|
|
546
|
-
### SpellKit Standalone (
|
|
547
|
+
### SpellKit Standalone (M4 Max MacBook Pro, Ruby 3.3.0, 80k dictionary)
|
|
547
548
|
|
|
548
549
|
**Single Word Suggestions:**
|
|
549
550
|
- 3,345 i/s (298.96 μs/i) with max: 1 suggestion
|
|
@@ -554,10 +555,10 @@ end
|
|
|
554
555
|
- `correct`: 1,858 i/s (538.17 μs/i)
|
|
555
556
|
- `correct_tokens` (batch): 2,005 i/s (498.76 μs/i)
|
|
556
557
|
|
|
557
|
-
**
|
|
558
|
-
- Without
|
|
559
|
-
- With
|
|
560
|
-
*(
|
|
558
|
+
**Protection Performance:**
|
|
559
|
+
- Without protection: 2,926 i/s (341.79 μs/i)
|
|
560
|
+
- With protection: 9,337 i/s (107.10 μs/i) - **3.19x faster!**
|
|
561
|
+
*(Protection checks short-circuit expensive dictionary lookups)*
|
|
561
562
|
|
|
562
563
|
**Latency Distribution (10,000 iterations):**
|
|
563
564
|
- p50: 61μs
|
|
@@ -573,6 +574,27 @@ end
|
|
|
573
574
|
3. **High Throughput**: Over 16k operations per second with an 80k-word dictionary
|
|
574
575
|
4. **Scales Well**: Minimal performance difference between 1 vs 10 suggestions
|
|
575
576
|
|
|
577
|
+
### Comparison with Aspell
|
|
578
|
+
|
|
579
|
+
SpellKit vs Aspell (M4 Max MacBook Pro, Ruby 3.3.0, 80k dictionary):
|
|
580
|
+
|
|
581
|
+
**Suggestion Performance (13 misspelled words):**
|
|
582
|
+
- SpellKit: 3,162 i/s (316μs per batch)
|
|
583
|
+
- Aspell: 433 i/s (2.31ms per batch)
|
|
584
|
+
- **SpellKit is 7.3x faster**
|
|
585
|
+
|
|
586
|
+
**Spell Checking (`correct?` on 26 words):**
|
|
587
|
+
- SpellKit: 263,279 i/s (3.8μs per batch)
|
|
588
|
+
- Aspell: 72,099 i/s (13.9μs per batch)
|
|
589
|
+
- **SpellKit is 3.65x faster**
|
|
590
|
+
|
|
591
|
+
**Latency Distribution (10,000 single-word suggestions):**
|
|
592
|
+
- SpellKit: p50=63μs, p95=69μs, p99=98μs
|
|
593
|
+
- Aspell: p50=105μs, p95=121μs, p99=182μs
|
|
594
|
+
- **SpellKit is 1.7x faster at p50, 1.75x faster at p95**
|
|
595
|
+
|
|
596
|
+
Both libraries provide high-quality spell checking, but SpellKit's SymSpell algorithm (O(1) lookup) offers significant performance advantages over Aspell's statistical approach, especially for high-throughput applications.
|
|
597
|
+
|
|
576
598
|
## Benchmarks
|
|
577
599
|
|
|
578
600
|
SpellKit includes comprehensive benchmarks to measure performance and compare with other spell checkers.
|
|
@@ -672,4 +694,4 @@ Bug reports and pull requests are welcome at https://github.com/scientist-labs/s
|
|
|
672
694
|
|
|
673
695
|
## License
|
|
674
696
|
|
|
675
|
-
MIT License - see [LICENSE](LICENSE) file for details.
|
|
697
|
+
MIT License - see [LICENSE](LICENSE.txt) file for details.
|
data/ext/spellkit/Cargo.toml
CHANGED
data/ext/spellkit/src/lib.rs
CHANGED
|
@@ -52,14 +52,11 @@ fn correct_word(
|
|
|
52
52
|
state: &CheckerState,
|
|
53
53
|
symspell: &SymSpell,
|
|
54
54
|
word: &str,
|
|
55
|
-
use_guard: bool,
|
|
56
55
|
) -> String {
|
|
57
|
-
//
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
return word.to_string();
|
|
62
|
-
}
|
|
56
|
+
// Always check if word is protected
|
|
57
|
+
let normalized = SymSpell::normalize_word(word);
|
|
58
|
+
if state.guards.is_protected_normalized(word, &normalized) {
|
|
59
|
+
return word.to_string();
|
|
63
60
|
}
|
|
64
61
|
|
|
65
62
|
let suggestions = symspell.suggestions(word, 5);
|
|
@@ -298,7 +295,7 @@ impl Checker {
|
|
|
298
295
|
}
|
|
299
296
|
}
|
|
300
297
|
|
|
301
|
-
fn correct_if_unknown(&self, word: String
|
|
298
|
+
fn correct_if_unknown(&self, word: String) -> Result<String, Error> {
|
|
302
299
|
let ruby = Ruby::get().unwrap();
|
|
303
300
|
let state = self.state.read().unwrap();
|
|
304
301
|
|
|
@@ -307,18 +304,17 @@ impl Checker {
|
|
|
307
304
|
}
|
|
308
305
|
|
|
309
306
|
if let Some(ref symspell) = state.symspell {
|
|
310
|
-
Ok(correct_word(&state, symspell, &word
|
|
307
|
+
Ok(correct_word(&state, symspell, &word))
|
|
311
308
|
} else {
|
|
312
309
|
Err(Error::new(ruby.exception_runtime_error(), "SymSpell not initialized"))
|
|
313
310
|
}
|
|
314
311
|
}
|
|
315
312
|
|
|
316
|
-
fn correct_tokens(&self, tokens: RArray
|
|
313
|
+
fn correct_tokens(&self, tokens: RArray) -> Result<RArray, Error> {
|
|
317
314
|
// Optimize batch correction by acquiring lock once for all tokens
|
|
318
315
|
// instead of calling correct_if_unknown per token (which re-locks each time)
|
|
319
316
|
let ruby = Ruby::get().unwrap();
|
|
320
317
|
let state = self.state.read().unwrap();
|
|
321
|
-
let use_guard = use_guard.unwrap_or(false);
|
|
322
318
|
|
|
323
319
|
if !state.loaded {
|
|
324
320
|
return Err(Error::new(ruby.exception_runtime_error(), "Dictionary not loaded. Call load! first"));
|
|
@@ -329,7 +325,7 @@ impl Checker {
|
|
|
329
325
|
if let Some(ref symspell) = state.symspell {
|
|
330
326
|
for token in tokens.into_iter() {
|
|
331
327
|
let word: String = TryConvert::try_convert(token)?;
|
|
332
|
-
let corrected = correct_word(&state, symspell, &word
|
|
328
|
+
let corrected = correct_word(&state, symspell, &word);
|
|
333
329
|
result.push(corrected)?;
|
|
334
330
|
}
|
|
335
331
|
|
|
@@ -388,8 +384,8 @@ fn init(_ruby: &Ruby) -> Result<(), Error> {
|
|
|
388
384
|
checker_class.define_method("load!", method!(Checker::load_full, 1))?;
|
|
389
385
|
checker_class.define_method("suggestions", method!(Checker::suggestions, 2))?;
|
|
390
386
|
checker_class.define_method("correct?", method!(Checker::correct, 1))?;
|
|
391
|
-
checker_class.define_method("correct", method!(Checker::correct_if_unknown,
|
|
392
|
-
checker_class.define_method("correct_tokens", method!(Checker::correct_tokens,
|
|
387
|
+
checker_class.define_method("correct", method!(Checker::correct_if_unknown, 1))?;
|
|
388
|
+
checker_class.define_method("correct_tokens", method!(Checker::correct_tokens, 1))?;
|
|
393
389
|
checker_class.define_method("stats", method!(Checker::stats, 0))?;
|
|
394
390
|
checker_class.define_method("healthcheck", method!(Checker::healthcheck, 0))?;
|
|
395
391
|
|
data/lib/spellkit/version.rb
CHANGED
data/lib/spellkit.rb
CHANGED
|
@@ -76,12 +76,12 @@ module SpellKit
|
|
|
76
76
|
default.correct?(word)
|
|
77
77
|
end
|
|
78
78
|
|
|
79
|
-
def correct(word
|
|
80
|
-
default.correct(word
|
|
79
|
+
def correct(word)
|
|
80
|
+
default.correct(word)
|
|
81
81
|
end
|
|
82
82
|
|
|
83
|
-
def correct_tokens(tokens
|
|
84
|
-
default.correct_tokens(tokens
|
|
83
|
+
def correct_tokens(tokens)
|
|
84
|
+
default.correct_tokens(tokens)
|
|
85
85
|
end
|
|
86
86
|
|
|
87
87
|
def stats
|
|
@@ -211,19 +211,17 @@ class SpellKit::Checker
|
|
|
211
211
|
_rust_correct?(word)
|
|
212
212
|
end
|
|
213
213
|
|
|
214
|
-
def correct(word
|
|
214
|
+
def correct(word)
|
|
215
215
|
raise SpellKit::InvalidArgumentError, "word cannot be nil" if word.nil?
|
|
216
216
|
raise SpellKit::InvalidArgumentError, "word cannot be empty" if word.to_s.empty?
|
|
217
217
|
|
|
218
|
-
|
|
219
|
-
_rust_correct(word, use_guard)
|
|
218
|
+
_rust_correct(word)
|
|
220
219
|
end
|
|
221
220
|
|
|
222
|
-
def correct_tokens(tokens
|
|
221
|
+
def correct_tokens(tokens)
|
|
223
222
|
raise SpellKit::InvalidArgumentError, "tokens must be an Array" unless tokens.is_a?(Array)
|
|
224
223
|
|
|
225
|
-
|
|
226
|
-
_rust_correct_tokens(tokens, use_guard)
|
|
224
|
+
_rust_correct_tokens(tokens)
|
|
227
225
|
end
|
|
228
226
|
|
|
229
227
|
def stats
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: spellkit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Chris Petersen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-09-
|
|
11
|
+
date: 2025-09-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -150,8 +150,8 @@ dependencies:
|
|
|
150
150
|
- - ">="
|
|
151
151
|
- !ruby/object:Gem::Version
|
|
152
152
|
version: '0'
|
|
153
|
-
description: A Ruby gem
|
|
154
|
-
with domain-specific term protection
|
|
153
|
+
description: A Ruby gem with a native Rust implementation of the SymSpell algorithm
|
|
154
|
+
for fast typo correction with domain-specific term protection
|
|
155
155
|
email:
|
|
156
156
|
- chris@petersen.io
|
|
157
157
|
executables: []
|
|
@@ -191,7 +191,6 @@ licenses:
|
|
|
191
191
|
metadata:
|
|
192
192
|
homepage_uri: https://github.com/scientist-labs/spellkit
|
|
193
193
|
source_code_uri: https://github.com/scientist-labs/spellkit
|
|
194
|
-
changelog_uri: https://github.com/scientist-labs/spellkit/blob/main/CHANGELOG.md
|
|
195
194
|
post_install_message:
|
|
196
195
|
rdoc_options: []
|
|
197
196
|
require_paths:
|
|
@@ -206,7 +205,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
206
205
|
- - ">="
|
|
207
206
|
- !ruby/object:Gem::Version
|
|
208
207
|
version: '0'
|
|
209
|
-
requirements:
|
|
208
|
+
requirements:
|
|
209
|
+
- Rust >= 1.85
|
|
210
210
|
rubygems_version: 3.5.3
|
|
211
211
|
signing_key:
|
|
212
212
|
specification_version: 4
|