spellkit 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +29 -30
- data/ext/spellkit/src/lib.rs +10 -14
- data/lib/spellkit/version.rb +1 -1
- data/lib/spellkit.rb +8 -10
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6e690fa50208d003679afff3117b6f00664e6938e7feb992eff6a6544fad279e
|
|
4
|
+
data.tar.gz: 17d41414cbd48e093913cfa8765aac063f1cb17283df13414ad801fdf0aa79ae
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '0989bcacde87e9405c99f8674c681be65844a381cac11a00f8f05dd2f1a54312ce9d1799ca6317688255ee5fad454f4670845163b87e6a9541a981fff8685b35'
|
|
7
|
+
data.tar.gz: f12c69803c39ee74083a2090c85aac7bdfe7c39ac81995a95c09ad8c1e70df35cfb8d332de287413accf99aa4b6960e9e0c5f77ccd39b37d4ae092b06d8ea2d4
|
data/README.md
CHANGED
|
@@ -146,13 +146,13 @@ SpellKit.load!(
|
|
|
146
146
|
protected_patterns: [/^[A-Z]{3,4}\d+$/]
|
|
147
147
|
)
|
|
148
148
|
|
|
149
|
-
#
|
|
150
|
-
SpellKit.correct("CDK10", guard: :domain)
|
|
149
|
+
# Protected terms are automatically respected
|
|
150
|
+
SpellKit.correct("CDK10")
|
|
151
151
|
# => "CDK10" # protected, never changed
|
|
152
152
|
|
|
153
|
-
# Batch correction with
|
|
153
|
+
# Batch correction with protection
|
|
154
154
|
tokens = %w[helllo wrld ABC-123 for CDK10]
|
|
155
|
-
SpellKit.correct_tokens(tokens, guard: :domain)
|
|
155
|
+
SpellKit.correct_tokens(tokens)
|
|
156
156
|
# => ["hello", "world", "ABC-123", "for", "CDK10"]
|
|
157
157
|
```
|
|
158
158
|
|
|
@@ -300,7 +300,7 @@ SpellKit.correct("rarword") # => "rarword" (no correction if freq < 1000)
|
|
|
300
300
|
|
|
301
301
|
### Skip Patterns
|
|
302
302
|
|
|
303
|
-
SpellKit can automatically skip certain patterns to avoid "correcting" technical terms, URLs, and other special content. Inspired by Aspell's filter modes, these patterns are applied when
|
|
303
|
+
SpellKit can automatically skip certain patterns to avoid "correcting" technical terms, URLs, and other special content. Inspired by Aspell's filter modes, these patterns are automatically applied when configured.
|
|
304
304
|
|
|
305
305
|
**Available skip patterns:**
|
|
306
306
|
|
|
@@ -315,13 +315,13 @@ SpellKit.load!(
|
|
|
315
315
|
)
|
|
316
316
|
|
|
317
317
|
# With skip patterns enabled, technical content is preserved
|
|
318
|
-
SpellKit.correct("https://example.com"
|
|
319
|
-
SpellKit.correct("user@test.com"
|
|
320
|
-
SpellKit.correct("getElementById"
|
|
321
|
-
SpellKit.correct("version-1.2.3"
|
|
318
|
+
SpellKit.correct("https://example.com") # => "https://example.com"
|
|
319
|
+
SpellKit.correct("user@test.com") # => "user@test.com"
|
|
320
|
+
SpellKit.correct("getElementById") # => "getElementById"
|
|
321
|
+
SpellKit.correct("version-1.2.3") # => "version-1.2.3"
|
|
322
322
|
|
|
323
323
|
# Regular typos are still corrected
|
|
324
|
-
SpellKit.correct("helllo"
|
|
324
|
+
SpellKit.correct("helllo") # => "hello"
|
|
325
325
|
```
|
|
326
326
|
|
|
327
327
|
**What each skip pattern matches:**
|
|
@@ -351,9 +351,9 @@ SpellKit.load!(
|
|
|
351
351
|
protected_patterns: [/^CUSTOM-\d+$/] # Your custom patterns
|
|
352
352
|
)
|
|
353
353
|
|
|
354
|
-
# Both work together
|
|
355
|
-
SpellKit.correct("https://example.com"
|
|
356
|
-
SpellKit.correct("CUSTOM-123"
|
|
354
|
+
# Both work together automatically
|
|
355
|
+
SpellKit.correct("https://example.com") # => "https://example.com" (skip_urls)
|
|
356
|
+
SpellKit.correct("CUSTOM-123") # => "CUSTOM-123" (custom pattern)
|
|
357
357
|
```
|
|
358
358
|
|
|
359
359
|
## API Reference
|
|
@@ -426,33 +426,32 @@ SpellKit.suggestions("helllo", 5)
|
|
|
426
426
|
# => [{"term"=>"hello", "distance"=>1, "freq"=>10000}, ...]
|
|
427
427
|
```
|
|
428
428
|
|
|
429
|
-
### `SpellKit.correct(word
|
|
429
|
+
### `SpellKit.correct(word)`
|
|
430
430
|
|
|
431
|
-
Return corrected word or original if no better match found. Respects `frequency_threshold` configuration.
|
|
431
|
+
Return corrected word or original if no better match found. Respects `frequency_threshold` configuration. Protected terms and skip patterns are automatically applied when configured.
|
|
432
432
|
|
|
433
433
|
**Parameters:**
|
|
434
434
|
- `word` (required) - The word to correct
|
|
435
|
-
- `guard:` (optional) - Set to `:domain` to enable protection checks
|
|
436
435
|
|
|
437
436
|
**Behavior:**
|
|
438
437
|
- Returns original word if it exists in dictionary
|
|
439
438
|
- For misspellings, only accepts corrections with frequency ≥ `frequency_threshold`
|
|
440
439
|
- Returns original word if no corrections pass the threshold
|
|
441
|
-
-
|
|
440
|
+
- Automatically respects protected terms and skip patterns configured in `load!`
|
|
442
441
|
|
|
443
442
|
**Example:**
|
|
444
443
|
```ruby
|
|
445
|
-
SpellKit.correct("helllo")
|
|
446
|
-
SpellKit.correct("hello")
|
|
447
|
-
SpellKit.correct("CDK10"
|
|
444
|
+
SpellKit.correct("helllo") # => "hello"
|
|
445
|
+
SpellKit.correct("hello") # => "hello" (already correct)
|
|
446
|
+
SpellKit.correct("CDK10") # => "CDK10" (protected if configured)
|
|
448
447
|
```
|
|
449
448
|
|
|
450
|
-
### `SpellKit.correct_tokens(tokens
|
|
449
|
+
### `SpellKit.correct_tokens(tokens)`
|
|
451
450
|
|
|
452
|
-
Batch correction of an array of tokens. Respects `frequency_threshold` configuration.
|
|
451
|
+
Batch correction of an array of tokens. Respects `frequency_threshold` configuration. Protected terms and skip patterns are automatically applied when configured.
|
|
453
452
|
|
|
454
|
-
**
|
|
455
|
-
- `
|
|
453
|
+
**Parameters:**
|
|
454
|
+
- `tokens` (required) - Array of words to correct
|
|
456
455
|
|
|
457
456
|
**Returns:** Array of corrected strings
|
|
458
457
|
|
|
@@ -472,7 +471,7 @@ Verify system is properly loaded. Raises error if not.
|
|
|
472
471
|
|
|
473
472
|
## Term Protection
|
|
474
473
|
|
|
475
|
-
|
|
474
|
+
When configured, SpellKit automatically protects specific terms from correction:
|
|
476
475
|
|
|
477
476
|
### Exact Matches
|
|
478
477
|
Terms in `protected_path` file are never corrected, even if similar dictionary words exist. Matching is case-insensitive, but original casing is preserved in output.
|
|
@@ -536,7 +535,7 @@ end
|
|
|
536
535
|
class SearchPreprocessor
|
|
537
536
|
def self.correct_query(text)
|
|
538
537
|
tokens = text.downcase.split(/\s+/)
|
|
539
|
-
SpellKit.correct_tokens(tokens
|
|
538
|
+
SpellKit.correct_tokens(tokens).join(" ")
|
|
540
539
|
end
|
|
541
540
|
end
|
|
542
541
|
```
|
|
@@ -554,10 +553,10 @@ end
|
|
|
554
553
|
- `correct`: 1,858 i/s (538.17 μs/i)
|
|
555
554
|
- `correct_tokens` (batch): 2,005 i/s (498.76 μs/i)
|
|
556
555
|
|
|
557
|
-
**
|
|
558
|
-
- Without
|
|
559
|
-
- With
|
|
560
|
-
*(
|
|
556
|
+
**Protection Performance:**
|
|
557
|
+
- Without protection: 2,926 i/s (341.79 μs/i)
|
|
558
|
+
- With protection: 9,337 i/s (107.10 μs/i) - **3.19x faster!**
|
|
559
|
+
*(Protection checks short-circuit expensive dictionary lookups)*
|
|
561
560
|
|
|
562
561
|
**Latency Distribution (10,000 iterations):**
|
|
563
562
|
- p50: 61μs
|
data/ext/spellkit/src/lib.rs
CHANGED
|
@@ -52,14 +52,11 @@ fn correct_word(
|
|
|
52
52
|
state: &CheckerState,
|
|
53
53
|
symspell: &SymSpell,
|
|
54
54
|
word: &str,
|
|
55
|
-
use_guard: bool,
|
|
56
55
|
) -> String {
|
|
57
|
-
//
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
return word.to_string();
|
|
62
|
-
}
|
|
56
|
+
// Always check if word is protected
|
|
57
|
+
let normalized = SymSpell::normalize_word(word);
|
|
58
|
+
if state.guards.is_protected_normalized(word, &normalized) {
|
|
59
|
+
return word.to_string();
|
|
63
60
|
}
|
|
64
61
|
|
|
65
62
|
let suggestions = symspell.suggestions(word, 5);
|
|
@@ -298,7 +295,7 @@ impl Checker {
|
|
|
298
295
|
}
|
|
299
296
|
}
|
|
300
297
|
|
|
301
|
-
fn correct_if_unknown(&self, word: String, use_guard: Option<bool>) -> Result<String, Error> {
|
|
298
|
+
fn correct_if_unknown(&self, word: String) -> Result<String, Error> {
|
|
302
299
|
let ruby = Ruby::get().unwrap();
|
|
303
300
|
let state = self.state.read().unwrap();
|
|
304
301
|
|
|
@@ -307,18 +304,17 @@ impl Checker {
|
|
|
307
304
|
}
|
|
308
305
|
|
|
309
306
|
if let Some(ref symspell) = state.symspell {
|
|
310
|
-
Ok(correct_word(&state, symspell, &word
|
|
307
|
+
Ok(correct_word(&state, symspell, &word))
|
|
311
308
|
} else {
|
|
312
309
|
Err(Error::new(ruby.exception_runtime_error(), "SymSpell not initialized"))
|
|
313
310
|
}
|
|
314
311
|
}
|
|
315
312
|
|
|
316
|
-
fn correct_tokens(&self, tokens: RArray, use_guard: Option<bool>) -> Result<RArray, Error> {
|
|
313
|
+
fn correct_tokens(&self, tokens: RArray) -> Result<RArray, Error> {
|
|
317
314
|
// Optimize batch correction by acquiring lock once for all tokens
|
|
318
315
|
// instead of calling correct_if_unknown per token (which re-locks each time)
|
|
319
316
|
let ruby = Ruby::get().unwrap();
|
|
320
317
|
let state = self.state.read().unwrap();
|
|
321
|
-
let use_guard = use_guard.unwrap_or(false);
|
|
322
318
|
|
|
323
319
|
if !state.loaded {
|
|
324
320
|
return Err(Error::new(ruby.exception_runtime_error(), "Dictionary not loaded. Call load! first"));
|
|
@@ -329,7 +325,7 @@ impl Checker {
|
|
|
329
325
|
if let Some(ref symspell) = state.symspell {
|
|
330
326
|
for token in tokens.into_iter() {
|
|
331
327
|
let word: String = TryConvert::try_convert(token)?;
|
|
332
|
-
let corrected = correct_word(&state, symspell, &word, use_guard);
|
|
328
|
+
let corrected = correct_word(&state, symspell, &word);
|
|
333
329
|
result.push(corrected)?;
|
|
334
330
|
}
|
|
335
331
|
|
|
@@ -388,8 +384,8 @@ fn init(_ruby: &Ruby) -> Result<(), Error> {
|
|
|
388
384
|
checker_class.define_method("load!", method!(Checker::load_full, 1))?;
|
|
389
385
|
checker_class.define_method("suggestions", method!(Checker::suggestions, 2))?;
|
|
390
386
|
checker_class.define_method("correct?", method!(Checker::correct, 1))?;
|
|
391
|
-
checker_class.define_method("correct", method!(Checker::correct_if_unknown, 2))?;
|
|
392
|
-
checker_class.define_method("correct_tokens", method!(Checker::correct_tokens, 2))?;
|
|
387
|
+
checker_class.define_method("correct", method!(Checker::correct_if_unknown, 1))?;
|
|
388
|
+
checker_class.define_method("correct_tokens", method!(Checker::correct_tokens, 1))?;
|
|
393
389
|
checker_class.define_method("stats", method!(Checker::stats, 0))?;
|
|
394
390
|
checker_class.define_method("healthcheck", method!(Checker::healthcheck, 0))?;
|
|
395
391
|
|
data/lib/spellkit/version.rb
CHANGED
data/lib/spellkit.rb
CHANGED
|
@@ -76,12 +76,12 @@ module SpellKit
|
|
|
76
76
|
default.correct?(word)
|
|
77
77
|
end
|
|
78
78
|
|
|
79
|
-
def correct(word
|
|
80
|
-
default.correct(word
|
|
79
|
+
def correct(word)
|
|
80
|
+
default.correct(word)
|
|
81
81
|
end
|
|
82
82
|
|
|
83
|
-
def correct_tokens(tokens
|
|
84
|
-
default.correct_tokens(tokens
|
|
83
|
+
def correct_tokens(tokens)
|
|
84
|
+
default.correct_tokens(tokens)
|
|
85
85
|
end
|
|
86
86
|
|
|
87
87
|
def stats
|
|
@@ -211,19 +211,17 @@ class SpellKit::Checker
|
|
|
211
211
|
_rust_correct?(word)
|
|
212
212
|
end
|
|
213
213
|
|
|
214
|
-
def correct(word
|
|
214
|
+
def correct(word)
|
|
215
215
|
raise SpellKit::InvalidArgumentError, "word cannot be nil" if word.nil?
|
|
216
216
|
raise SpellKit::InvalidArgumentError, "word cannot be empty" if word.to_s.empty?
|
|
217
217
|
|
|
218
|
-
|
|
219
|
-
_rust_correct(word, use_guard)
|
|
218
|
+
_rust_correct(word)
|
|
220
219
|
end
|
|
221
220
|
|
|
222
|
-
def correct_tokens(tokens
|
|
221
|
+
def correct_tokens(tokens)
|
|
223
222
|
raise SpellKit::InvalidArgumentError, "tokens must be an Array" unless tokens.is_a?(Array)
|
|
224
223
|
|
|
225
|
-
|
|
226
|
-
_rust_correct_tokens(tokens, use_guard)
|
|
224
|
+
_rust_correct_tokens(tokens)
|
|
227
225
|
end
|
|
228
226
|
|
|
229
227
|
def stats
|