spellkit 0.2.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +697 -0
- data/ext/spellkit/Cargo.toml +19 -0
- data/ext/spellkit/extconf.rb +4 -0
- data/ext/spellkit/src/guards.rs +75 -0
- data/ext/spellkit/src/lib.rs +393 -0
- data/ext/spellkit/src/symspell.rs +349 -0
- data/lib/spellkit/3.1/spellkit.so +0 -0
- data/lib/spellkit/3.2/spellkit.so +0 -0
- data/lib/spellkit/3.3/spellkit.so +0 -0
- data/lib/spellkit/3.4/spellkit.so +0 -0
- data/lib/spellkit/version.rb +5 -0
- data/lib/spellkit.rb +368 -0
- metadata +202 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: dc45d3c40e1c9085c451c0831aa08755fe7a556bd940cd775ee2cd2231d93fd9
|
|
4
|
+
data.tar.gz: 6a7e14af13a3d6e183d3a95296d5b9ea29478fcb3a91a5ee421854b5f0fb8966
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 4ccb47129cff23f2ce5a6a2cc43d217e398b56ffb6cf9f58935da3ae9fe1b7d8886426abfacddad5be7d260007504c3c711ccf81ab656cdb9f0ad45d9eb8a8c8
|
|
7
|
+
data.tar.gz: 55a8f9d3f7b53aa610a7cebc653c7657ea7c0914351d85a756f8711730370d3d94d7876de491071c0d2329673b92ab0e0d3504c5ee563f826781a6b87976d454
|
data/README.md
ADDED
|
@@ -0,0 +1,697 @@
|
|
|
1
|
+
<img src="/docs/assets/spellkit-wide.png" alt="spellkit" height="160px">
|
|
2
|
+
|
|
3
|
+
Fast, safe typo correction for search-term extraction. A Ruby gem with a native Rust implementation of the SymSpell algorithm.
|
|
4
|
+
|
|
5
|
+
SpellKit provides:
|
|
6
|
+
- **Fast correction** using SymSpell with configurable edit distance (1 or 2)
|
|
7
|
+
- **Term protection** - never alter protected terms using exact matches or regex patterns
|
|
8
|
+
- **Hot reload** - update dictionaries without restarting your application
|
|
9
|
+
- **Sub-millisecond latency** - p95 < 2µs on small dictionaries
|
|
10
|
+
- **Thread-safe** - built with Rust's `Arc<RwLock>` for safe concurrent access
|
|
11
|
+
|
|
12
|
+
**Why a custom implementation?** Existing Rust SymSpell crates require lowercase dictionary entries, but SpellKit preserves canonical forms (NASA stays NASA, iPhone stays iPhone). We also needed domain-specific guards, hot-reload, and Aspell-style skip patterns - features not available in existing implementations.
|
|
13
|
+
|
|
14
|
+
## Why SpellKit?
|
|
15
|
+
|
|
16
|
+
### No Runtime Dependencies
|
|
17
|
+
SpellKit is a Ruby gem with a precompiled native Rust extension. Just `gem install spellkit` and you're done. No need to install Aspell, Hunspell, or other system packages. This makes deployment simpler and more reliable across different environments.
|
|
18
|
+
|
|
19
|
+
### Fast Performance
|
|
20
|
+
Built on the SymSpell algorithm with Rust, SpellKit delivers:
|
|
21
|
+
- **350,000+ operations/second** for spell checking
|
|
22
|
+
- **3.7x faster** than Aspell for correctness checks
|
|
23
|
+
- **7.3x faster** than Aspell for generating suggestions
|
|
24
|
+
- **p99 latency < 25µs** even under load
|
|
25
|
+
|
|
26
|
+
See the [Performance](#performance) and [Benchmarks](#benchmarks) sections for detailed comparisons.
|
|
27
|
+
|
|
28
|
+
### Production Ready
|
|
29
|
+
- Thread-safe concurrent access
|
|
30
|
+
- Hot reload dictionaries without restarts
|
|
31
|
+
- Instance-based API for multi-domain support
|
|
32
|
+
- Comprehensive error handling
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
Add to your Gemfile:
|
|
37
|
+
|
|
38
|
+
```ruby
|
|
39
|
+
gem "spellkit"
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Or install directly:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
gem install spellkit
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Quick Start
|
|
49
|
+
|
|
50
|
+
SpellKit works with dictionaries from URLs or local files. Try it immediately:
|
|
51
|
+
|
|
52
|
+
```ruby
|
|
53
|
+
require "spellkit"
|
|
54
|
+
|
|
55
|
+
# Load from URL (downloads and caches automatically)
|
|
56
|
+
SpellKit.load!(dictionary: SpellKit::DEFAULT_DICTIONARY_URL)
|
|
57
|
+
|
|
58
|
+
# Or use a configure block (recommended for Rails)
|
|
59
|
+
SpellKit.configure do |config|
|
|
60
|
+
config.dictionary = SpellKit::DEFAULT_DICTIONARY_URL
|
|
61
|
+
config.edit_distance = 1
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Or load from local file
|
|
65
|
+
# SpellKit.load!(dictionary: "path/to/dictionary.tsv")
|
|
66
|
+
|
|
67
|
+
# Check if a word is spelled correctly
|
|
68
|
+
puts SpellKit.correct?("hello")
|
|
69
|
+
# => true
|
|
70
|
+
|
|
71
|
+
# Get suggestions for a misspelled word
|
|
72
|
+
suggestions = SpellKit.suggestions("helllo", 5)
|
|
73
|
+
puts suggestions.inspect
|
|
74
|
+
# => [{"term"=>"hello", "distance"=>1, "freq"=>...}]
|
|
75
|
+
|
|
76
|
+
# Correct a typo
|
|
77
|
+
corrected = SpellKit.correct("helllo")
|
|
78
|
+
puts corrected
|
|
79
|
+
# => "hello"
|
|
80
|
+
|
|
81
|
+
# Batch correction
|
|
82
|
+
tokens = %w[helllo wrld ruby teset]
|
|
83
|
+
corrected_tokens = SpellKit.correct_tokens(tokens)
|
|
84
|
+
puts corrected_tokens.inspect
|
|
85
|
+
# => ["hello", "world", "ruby", "test"]
|
|
86
|
+
|
|
87
|
+
# Check stats
|
|
88
|
+
puts SpellKit.stats.inspect
|
|
89
|
+
# => {"loaded"=>true, "dictionary_size"=>..., "edit_distance"=>1, "loaded_at"=>...}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Usage
|
|
93
|
+
|
|
94
|
+
### Basic Correction
|
|
95
|
+
|
|
96
|
+
```ruby
|
|
97
|
+
require "spellkit"
|
|
98
|
+
|
|
99
|
+
# Load from URL (auto-downloads and caches)
|
|
100
|
+
SpellKit.load!(dictionary: "https://example.com/dict.tsv")
|
|
101
|
+
|
|
102
|
+
# Or from local file
|
|
103
|
+
SpellKit.load!(dictionary: "models/dictionary.tsv", edit_distance: 1)
|
|
104
|
+
|
|
105
|
+
# Check if a word is correct
|
|
106
|
+
SpellKit.correct?("hello")
|
|
107
|
+
# => true
|
|
108
|
+
|
|
109
|
+
# Get suggestions
|
|
110
|
+
SpellKit.suggestions("lyssis", 5)
|
|
111
|
+
# => [{"term"=>"lysis", "distance"=>1, "freq"=>2000}, ...]
|
|
112
|
+
|
|
113
|
+
# Correct a typo
|
|
114
|
+
SpellKit.correct("helllo")
|
|
115
|
+
# => "hello"
|
|
116
|
+
|
|
117
|
+
# Batch correction
|
|
118
|
+
tokens = %w[helllo wrld ruby]
|
|
119
|
+
SpellKit.correct_tokens(tokens)
|
|
120
|
+
# => ["hello", "world", "ruby"]
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Term Protection
|
|
124
|
+
|
|
125
|
+
Protect specific terms from correction using exact matches or regex patterns:
|
|
126
|
+
|
|
127
|
+
```ruby
|
|
128
|
+
# Load with exact-match protected terms
|
|
129
|
+
SpellKit.load!(
|
|
130
|
+
dictionary: "models/dictionary.tsv",
|
|
131
|
+
protected_path: "models/protected.txt" # file with terms to protect
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
# Protect terms matching regex patterns
|
|
135
|
+
SpellKit.load!(
|
|
136
|
+
dictionary: "models/dictionary.tsv",
|
|
137
|
+
protected_patterns: [
|
|
138
|
+
/^[A-Z]{3,4}\d+$/, # gene symbols like CDK10, BRCA1
|
|
139
|
+
/^\d{2,7}-\d{2}-\d$/, # CAS numbers like 7732-18-5
|
|
140
|
+
/^[A-Z]{2,3}-\d+$/ # SKU patterns like ABC-123
|
|
141
|
+
]
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
# Or combine both
|
|
145
|
+
SpellKit.load!(
|
|
146
|
+
dictionary: "models/dictionary.tsv",
|
|
147
|
+
protected_path: "models/protected.txt",
|
|
148
|
+
protected_patterns: [/^[A-Z]{3,4}\d+$/]
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
# Protected terms are automatically respected
|
|
152
|
+
SpellKit.correct("CDK10")
|
|
153
|
+
# => "CDK10" # protected, never changed
|
|
154
|
+
|
|
155
|
+
# Batch correction with protection
|
|
156
|
+
tokens = %w[helllo wrld ABC-123 for CDK10]
|
|
157
|
+
SpellKit.correct_tokens(tokens)
|
|
158
|
+
# => ["hello", "world", "ABC-123", "for", "CDK10"]
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Multiple Instances
|
|
162
|
+
|
|
163
|
+
SpellKit supports multiple independent checker instances, useful for different domains or languages:
|
|
164
|
+
|
|
165
|
+
```ruby
|
|
166
|
+
# Create separate instances for different domains
|
|
167
|
+
medical_checker = SpellKit::Checker.new
|
|
168
|
+
medical_checker.load!(
|
|
169
|
+
dictionary: "models/medical_dictionary.tsv",
|
|
170
|
+
protected_path: "models/medical_terms.txt"
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
legal_checker = SpellKit::Checker.new
|
|
174
|
+
legal_checker.load!(
|
|
175
|
+
dictionary: "models/legal_dictionary.tsv",
|
|
176
|
+
protected_path: "models/legal_terms.txt"
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
# Use them independently
|
|
180
|
+
medical_checker.suggestions("lyssis", 5)
|
|
181
|
+
legal_checker.suggestions("contractt", 5)
|
|
182
|
+
|
|
183
|
+
# Each maintains its own state
|
|
184
|
+
medical_checker.stats # Shows medical dictionary stats
|
|
185
|
+
legal_checker.stats # Shows legal dictionary stats
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Configuration Block
|
|
189
|
+
|
|
190
|
+
Use the configure block pattern for Rails initializers:
|
|
191
|
+
|
|
192
|
+
```ruby
|
|
193
|
+
SpellKit.configure do |config|
|
|
194
|
+
config.dictionary = "models/dictionary.tsv"
|
|
195
|
+
config.protected_path = "models/protected.txt"
|
|
196
|
+
config.protected_patterns = [/^[A-Z]{3,4}\d+$/]
|
|
197
|
+
config.edit_distance = 1
|
|
198
|
+
config.frequency_threshold = 10.0
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
# This becomes the default instance
|
|
202
|
+
SpellKit.suggestions("word", 5) # Uses configured dictionary
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Dictionary Format
|
|
206
|
+
|
|
207
|
+
### Dictionary (required)
|
|
208
|
+
|
|
209
|
+
Whitespace-separated file with term and frequency (supports both space and tab delimiters):
|
|
210
|
+
|
|
211
|
+
```
|
|
212
|
+
hello 10000
|
|
213
|
+
world 8000
|
|
214
|
+
lysis 2000
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
Or space-separated:
|
|
218
|
+
```
|
|
219
|
+
hello 10000
|
|
220
|
+
world 8000
|
|
221
|
+
lysis 2000
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
### Protected Terms (optional)
|
|
225
|
+
|
|
226
|
+
One term per line. Terms are matched case-insensitively:
|
|
227
|
+
|
|
228
|
+
**protected.txt**
|
|
229
|
+
```
|
|
230
|
+
# Product codes
|
|
231
|
+
ABC-123
|
|
232
|
+
XYZ-999
|
|
233
|
+
|
|
234
|
+
# Technical terms
|
|
235
|
+
CDK10
|
|
236
|
+
BRCA1
|
|
237
|
+
|
|
238
|
+
# Brand names
|
|
239
|
+
MyBrand
|
|
240
|
+
SpecialTerm
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
## Dictionary Sources
|
|
244
|
+
|
|
245
|
+
SpellKit doesn't bundle dictionaries, but works with several sources:
|
|
246
|
+
|
|
247
|
+
### Use the Default Dictionary (Recommended)
|
|
248
|
+
```ruby
|
|
249
|
+
# English 80k word dictionary from SymSpell
|
|
250
|
+
SpellKit.load!(dictionary: SpellKit::DEFAULT_DICTIONARY_URL)
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
### Public Dictionary URLs
|
|
254
|
+
- **SymSpell English 80k**: `https://raw.githubusercontent.com/wolfgarbe/SymSpell/master/SymSpell.FrequencyDictionary/en-80k.txt`
|
|
255
|
+
- **SymSpell English 500k**: `https://raw.githubusercontent.com/wolfgarbe/SymSpell/master/SymSpell.FrequencyDictionary/en-500k.txt`
|
|
256
|
+
|
|
257
|
+
### Build Your Own
|
|
258
|
+
See "Building Dictionaries" section below for creating domain-specific dictionaries.
|
|
259
|
+
|
|
260
|
+
### Caching
|
|
261
|
+
Dictionaries downloaded from URLs are cached in `~/.cache/spellkit/` for faster subsequent loads.
|
|
262
|
+
|
|
263
|
+
## Configuration
|
|
264
|
+
|
|
265
|
+
```ruby
|
|
266
|
+
SpellKit.load!(
|
|
267
|
+
dictionary: "models/dictionary.tsv", # required: path or URL
|
|
268
|
+
protected_path: "models/protected.txt", # optional
|
|
269
|
+
protected_patterns: [/^[A-Z]{3,4}\d+$/], # optional
|
|
270
|
+
edit_distance: 1, # 1 (default) or 2
|
|
271
|
+
frequency_threshold: 10.0, # default: 10.0 (minimum frequency for corrections)
|
|
272
|
+
|
|
273
|
+
# Skip pattern filters (all default to false)
|
|
274
|
+
skip_urls: true, # Skip URLs (http://, https://, www.)
|
|
275
|
+
skip_emails: true, # Skip email addresses
|
|
276
|
+
skip_hostnames: true, # Skip hostnames (example.com)
|
|
277
|
+
skip_code_patterns: true, # Skip code identifiers (camelCase, snake_case, etc.)
|
|
278
|
+
skip_numbers: true # Skip numeric patterns (versions, IDs, measurements)
|
|
279
|
+
)
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### Frequency Threshold
|
|
283
|
+
|
|
284
|
+
The `frequency_threshold` parameter controls which corrections are accepted by `correct` and `correct_tokens`:
|
|
285
|
+
|
|
286
|
+
- **For misspelled words** (not in dictionary): Only suggest corrections with frequency ≥ `frequency_threshold`
|
|
287
|
+
- **For dictionary words**: Only suggest alternatives with frequency ≥ `frequency_threshold × original_frequency`
|
|
288
|
+
|
|
289
|
+
This prevents suggesting rare words as corrections for common typos.
|
|
290
|
+
|
|
291
|
+
**Example:**
|
|
292
|
+
```ruby
|
|
293
|
+
# With default threshold (10.0), suggest any correction with freq ≥ 10
|
|
294
|
+
SpellKit.load!(dictionary: "dict.tsv")
|
|
295
|
+
SpellKit.correct("helllo") # => "hello" (if freq ≥ 10)
|
|
296
|
+
|
|
297
|
+
# With high threshold (1000.0), only suggest common corrections
|
|
298
|
+
SpellKit.load!(dictionary: "dict.tsv", frequency_threshold: 1000.0)
|
|
299
|
+
SpellKit.correct("helllo") # => "hello" (if freq ≥ 1000)
|
|
300
|
+
SpellKit.correct("rarword") # => "rarword" (no correction if freq < 1000)
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
### Skip Patterns
|
|
304
|
+
|
|
305
|
+
SpellKit can automatically skip certain patterns to avoid "correcting" technical terms, URLs, and other special content. Inspired by Aspell's filter modes, these patterns are automatically applied when configured.
|
|
306
|
+
|
|
307
|
+
**Available skip patterns:**
|
|
308
|
+
|
|
309
|
+
```ruby
|
|
310
|
+
SpellKit.load!(
|
|
311
|
+
dictionary: "dict.tsv",
|
|
312
|
+
skip_urls: true, # Skip URLs: https://example.com, www.example.com
|
|
313
|
+
skip_emails: true, # Skip emails: user@domain.com, admin+tag@example.com
|
|
314
|
+
skip_hostnames: true, # Skip hostnames: example.com, api.example.com
|
|
315
|
+
skip_code_patterns: true, # Skip code: camelCase, snake_case, PascalCase, dotted.paths
|
|
316
|
+
skip_numbers: true # Skip numbers: 1.2.3, #123, 5kg, 100mb
|
|
317
|
+
)
|
|
318
|
+
|
|
319
|
+
# With skip patterns enabled, technical content is preserved
|
|
320
|
+
SpellKit.correct("https://example.com") # => "https://example.com"
|
|
321
|
+
SpellKit.correct("user@test.com") # => "user@test.com"
|
|
322
|
+
SpellKit.correct("getElementById") # => "getElementById"
|
|
323
|
+
SpellKit.correct("version-1.2.3") # => "version-1.2.3"
|
|
324
|
+
|
|
325
|
+
# Regular typos are still corrected
|
|
326
|
+
SpellKit.correct("helllo") # => "hello"
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
**What each skip pattern matches:**
|
|
330
|
+
|
|
331
|
+
- **`skip_urls`**: `http://`, `https://`, `www.` URLs
|
|
332
|
+
- **`skip_emails`**: Email addresses with standard formats including `+` and `.` in usernames
|
|
333
|
+
- **`skip_hostnames`**: Domain names like `example.com`, `api.example.co.uk`
|
|
334
|
+
- **`skip_code_patterns`**:
|
|
335
|
+
- `camelCase` (starts lowercase)
|
|
336
|
+
- `PascalCase` (starts uppercase, mixed case)
|
|
337
|
+
- `snake_case` and `SCREAMING_SNAKE_CASE`
|
|
338
|
+
- `dotted.paths` like `Array.map` or `config.yml`
|
|
339
|
+
- **`skip_numbers`**:
|
|
340
|
+
- Version numbers: `1.0`, `2.5.3`, `10.15.7.1`
|
|
341
|
+
- Hash/IDs: `#123`, `#4567`
|
|
342
|
+
- Measurements: `5kg`, `2.5m`, `100mb`, `16px`
|
|
343
|
+
- Words starting with digits: `5test`, `123abc`
|
|
344
|
+
|
|
345
|
+
**Combining with protected_patterns:**
|
|
346
|
+
|
|
347
|
+
Skip patterns work alongside your custom `protected_patterns`:
|
|
348
|
+
|
|
349
|
+
```ruby
|
|
350
|
+
SpellKit.load!(
|
|
351
|
+
dictionary: "dict.tsv",
|
|
352
|
+
skip_urls: true, # Built-in URL skipping
|
|
353
|
+
protected_patterns: [/^CUSTOM-\d+$/] # Your custom patterns
|
|
354
|
+
)
|
|
355
|
+
|
|
356
|
+
# Both work together automatically
|
|
357
|
+
SpellKit.correct("https://example.com") # => "https://example.com" (skip_urls)
|
|
358
|
+
SpellKit.correct("CUSTOM-123") # => "CUSTOM-123" (custom pattern)
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
## API Reference
|
|
362
|
+
|
|
363
|
+
### `SpellKit.load!(**options)`
|
|
364
|
+
|
|
365
|
+
Load or reload dictionaries. Thread-safe atomic swap. Accepts URLs (auto-downloads and caches) or local file paths.
|
|
366
|
+
|
|
367
|
+
**Options:**
|
|
368
|
+
- `dictionary:` (required) - URL or path to a dictionary file with one `term<TAB>frequency` entry per line (tab or space delimited)
|
|
369
|
+
- `protected_path:` (optional) - Path to file with protected terms (one per line)
|
|
370
|
+
- `protected_patterns:` (optional) - Array of Regexp or String patterns to protect
|
|
371
|
+
- `edit_distance:` (default: 1) - Maximum edit distance (1 or 2)
|
|
372
|
+
- `frequency_threshold:` (default: 10.0) - Minimum frequency a suggestion needs to be accepted as a correction (see [Frequency Threshold](#frequency-threshold))
|
|
373
|
+
- `skip_urls:` (default: false) - Skip URLs (http://, https://, www.)
|
|
374
|
+
- `skip_emails:` (default: false) - Skip email addresses
|
|
375
|
+
- `skip_hostnames:` (default: false) - Skip hostnames (example.com)
|
|
376
|
+
- `skip_code_patterns:` (default: false) - Skip code identifiers (camelCase, snake_case, etc.)
|
|
377
|
+
- `skip_numbers:` (default: false) - Skip numeric patterns (versions, IDs, measurements)
|
|
378
|
+
|
|
379
|
+
**Examples:**
|
|
380
|
+
```ruby
|
|
381
|
+
# From URL (recommended for getting started)
|
|
382
|
+
SpellKit.load!(dictionary: SpellKit::DEFAULT_DICTIONARY_URL)
|
|
383
|
+
|
|
384
|
+
# With skip patterns for technical content
|
|
385
|
+
SpellKit.load!(
|
|
386
|
+
dictionary: SpellKit::DEFAULT_DICTIONARY_URL,
|
|
387
|
+
skip_urls: true,
|
|
388
|
+
skip_code_patterns: true
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
# From custom URL
|
|
392
|
+
SpellKit.load!(dictionary: "https://example.com/dict.tsv")
|
|
393
|
+
|
|
394
|
+
# From local file
|
|
395
|
+
SpellKit.load!(dictionary: "/path/to/dictionary.tsv")
|
|
396
|
+
```
|
|
397
|
+
|
|
398
|
+
### `SpellKit.correct?(word)`
|
|
399
|
+
|
|
400
|
+
Check if a word is spelled correctly (exact dictionary match).
|
|
401
|
+
|
|
402
|
+
**Parameters:**
|
|
403
|
+
- `word` (required) - The word to check
|
|
404
|
+
|
|
405
|
+
**Returns:** Boolean - true if word exists in dictionary, false otherwise
|
|
406
|
+
|
|
407
|
+
**Performance:** Very fast O(1) HashMap lookup. Use this instead of `suggestions` when you only need to check correctness.
|
|
408
|
+
|
|
409
|
+
**Example:**
|
|
410
|
+
```ruby
|
|
411
|
+
SpellKit.correct?("hello") # => true
|
|
412
|
+
SpellKit.correct?("helllo") # => false
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
### `SpellKit.suggestions(word, max = 5)`
|
|
416
|
+
|
|
417
|
+
Get ranked suggestions for a word.
|
|
418
|
+
|
|
419
|
+
**Parameters:**
|
|
420
|
+
- `word` (required) - The word to get suggestions for
|
|
421
|
+
- `max` (optional, default: 5) - Maximum number of suggestions to return
|
|
422
|
+
|
|
423
|
+
**Returns:** Array of hashes with `"term"`, `"distance"`, and `"freq"` keys
|
|
424
|
+
|
|
425
|
+
**Example:**
|
|
426
|
+
```ruby
|
|
427
|
+
SpellKit.suggestions("helllo", 5)
|
|
428
|
+
# => [{"term"=>"hello", "distance"=>1, "freq"=>10000}, ...]
|
|
429
|
+
```
|
|
430
|
+
|
|
431
|
+
### `SpellKit.correct(word)`
|
|
432
|
+
|
|
433
|
+
Return corrected word or original if no better match found. Respects `frequency_threshold` configuration. Protected terms and skip patterns are automatically applied when configured.
|
|
434
|
+
|
|
435
|
+
**Parameters:**
|
|
436
|
+
- `word` (required) - The word to correct
|
|
437
|
+
|
|
438
|
+
**Behavior:**
|
|
439
|
+
- Returns original word if it exists in dictionary
|
|
440
|
+
- For misspellings, only accepts corrections with frequency ≥ `frequency_threshold`
|
|
441
|
+
- Returns original word if no corrections pass the threshold
|
|
442
|
+
- Automatically respects protected terms and skip patterns configured in `load!`
|
|
443
|
+
|
|
444
|
+
**Example:**
|
|
445
|
+
```ruby
|
|
446
|
+
SpellKit.correct("helllo") # => "hello"
|
|
447
|
+
SpellKit.correct("hello") # => "hello" (already correct)
|
|
448
|
+
SpellKit.correct("CDK10") # => "CDK10" (protected if configured)
|
|
449
|
+
```
|
|
450
|
+
|
|
451
|
+
### `SpellKit.correct_tokens(tokens)`
|
|
452
|
+
|
|
453
|
+
Batch correction of an array of tokens. Respects `frequency_threshold` configuration. Protected terms and skip patterns are automatically applied when configured.
|
|
454
|
+
|
|
455
|
+
**Parameters:**
|
|
456
|
+
- `tokens` (required) - Array of words to correct
|
|
457
|
+
|
|
458
|
+
**Returns:** Array of corrected strings
|
|
459
|
+
|
|
460
|
+
### `SpellKit.stats`
|
|
461
|
+
|
|
462
|
+
Get current state statistics.
|
|
463
|
+
|
|
464
|
+
**Returns:** Hash with:
|
|
465
|
+
- `"loaded"` - Boolean
|
|
466
|
+
- `"dictionary_size"` - Number of terms
|
|
467
|
+
- `"edit_distance"` - Configured edit distance
|
|
468
|
+
- `"loaded_at"` - Unix timestamp
|
|
469
|
+
|
|
470
|
+
### `SpellKit.healthcheck`
|
|
471
|
+
|
|
472
|
+
Verify system is properly loaded. Raises error if not.
|
|
473
|
+
|
|
474
|
+
## Term Protection
|
|
475
|
+
|
|
476
|
+
When configured, SpellKit automatically protects specific terms from correction:
|
|
477
|
+
|
|
478
|
+
### Exact Matches
|
|
479
|
+
Terms in `protected_path` file are never corrected, even if similar dictionary words exist. Matching is case-insensitive, but original casing is preserved in output.
|
|
480
|
+
|
|
481
|
+
### Pattern Matching
|
|
482
|
+
Terms matching any pattern in `protected_patterns` are protected. Patterns can be:
|
|
483
|
+
- Ruby Regexp objects: `/^[A-Z]{3,4}\d+$/`
|
|
484
|
+
- Regex strings: `"^[A-Z]{3,4}\\d+$"`
|
|
485
|
+
|
|
486
|
+
### Examples
|
|
487
|
+
```ruby
|
|
488
|
+
# Protect specific terms
|
|
489
|
+
protected_patterns: [
|
|
490
|
+
/^[A-Z]{3,4}\d+$/, # Gene symbols: CDK10, BRCA1
|
|
491
|
+
/^\d{2,7}-\d{2}-\d$/, # CAS numbers: 7732-18-5
|
|
492
|
+
/^[A-Z]{2,3}-\d+$/ # Product codes: ABC-123
|
|
493
|
+
]
|
|
494
|
+
```
|
|
495
|
+
|
|
496
|
+
## Rails Integration
|
|
497
|
+
|
|
498
|
+
```ruby
|
|
499
|
+
# config/initializers/spellkit.rb
|
|
500
|
+
|
|
501
|
+
# Option 1: Use default dictionary (easiest)
|
|
502
|
+
SpellKit.configure do |config|
|
|
503
|
+
config.dictionary = SpellKit::DEFAULT_DICTIONARY_URL
|
|
504
|
+
end
|
|
505
|
+
|
|
506
|
+
# Option 2: Use local dictionary with full configuration
|
|
507
|
+
SpellKit.configure do |config|
|
|
508
|
+
config.dictionary = Rails.root.join("models/dictionary.tsv")
|
|
509
|
+
config.protected_path = Rails.root.join("models/protected.txt")
|
|
510
|
+
config.protected_patterns = [
|
|
511
|
+
/^[A-Z]{3,4}\d+$/, # Product codes
|
|
512
|
+
/^\d{2,7}-\d{2}-\d$/ # Reference numbers
|
|
513
|
+
]
|
|
514
|
+
config.edit_distance = 1
|
|
515
|
+
config.frequency_threshold = 10.0
|
|
516
|
+
end
|
|
517
|
+
|
|
518
|
+
# Option 3: Multiple domain-specific instances
|
|
519
|
+
# config/initializers/spellkit.rb
|
|
520
|
+
module SpellCheckers
|
|
521
|
+
MEDICAL = SpellKit::Checker.new.tap do |c|
|
|
522
|
+
c.load!(
|
|
523
|
+
dictionary: Rails.root.join("models/medical_dictionary.tsv"),
|
|
524
|
+
protected_path: Rails.root.join("models/medical_terms.txt")
|
|
525
|
+
)
|
|
526
|
+
end
|
|
527
|
+
|
|
528
|
+
LEGAL = SpellKit::Checker.new.tap do |c|
|
|
529
|
+
c.load!(
|
|
530
|
+
dictionary: Rails.root.join("models/legal_dictionary.tsv"),
|
|
531
|
+
protected_path: Rails.root.join("models/legal_terms.txt")
|
|
532
|
+
)
|
|
533
|
+
end
|
|
534
|
+
end
|
|
535
|
+
|
|
536
|
+
# In your search preprocessing
|
|
537
|
+
class SearchPreprocessor
|
|
538
|
+
def self.correct_query(text)
|
|
539
|
+
tokens = text.downcase.split(/\s+/)
|
|
540
|
+
SpellKit.correct_tokens(tokens).join(" ")
|
|
541
|
+
end
|
|
542
|
+
end
|
|
543
|
+
```
|
|
544
|
+
|
|
545
|
+
## Performance
|
|
546
|
+
|
|
547
|
+
### SpellKit Standalone (M4 Max MacBook Pro, Ruby 3.3.0, 80k dictionary)
|
|
548
|
+
|
|
549
|
+
**Single Word Suggestions:**
|
|
550
|
+
- 3,345 i/s (298.96 μs/i) with max: 1 suggestion
|
|
551
|
+
- 3,198 i/s (312.73 μs/i) with max: 5 suggestions
|
|
552
|
+
- 3,073 i/s (325.45 μs/i) with max: 10 suggestions
|
|
553
|
+
|
|
554
|
+
**Correction Performance:**
|
|
555
|
+
- `correct`: 1,858 i/s (538.17 μs/i)
|
|
556
|
+
- `correct_tokens` (batch): 2,005 i/s (498.76 μs/i)
|
|
557
|
+
|
|
558
|
+
**Protection Performance:**
|
|
559
|
+
- Without protection: 2,926 i/s (341.79 μs/i)
|
|
560
|
+
- With protection: 9,337 i/s (107.10 μs/i) - **3.19x faster!**
|
|
561
|
+
*(Protection checks short-circuit expensive dictionary lookups)*
|
|
562
|
+
|
|
563
|
+
**Latency Distribution (10,000 iterations):**
|
|
564
|
+
- p50: 61μs
|
|
565
|
+
- p95: 66μs
|
|
566
|
+
- p99: 105μs
|
|
567
|
+
- max: 298μs
|
|
568
|
+
|
|
569
|
+
**Raw Throughput:** 16,192 ops/sec
|
|
570
|
+
|
|
571
|
+
### Key Takeaways
|
|
572
|
+
1. **Consistent Performance**: p95 latency of 66μs with 80k dictionary, p99 at 105μs
|
|
573
|
+
2. **Guards are Fast**: Protected term checks improve performance by 3.2x by avoiding dictionary lookups
|
|
574
|
+
3. **High Throughput**: Over 16k operations per second with 80k word dictionary
|
|
575
|
+
4. **Scales Well**: Minimal performance difference between 1 vs 10 suggestions
|
|
576
|
+
|
|
577
|
+
### Comparison with Aspell
|
|
578
|
+
|
|
579
|
+
SpellKit vs Aspell (M4 Max MacBook Pro, Ruby 3.3.0, 80k dictionary):
|
|
580
|
+
|
|
581
|
+
**Suggestion Performance (13 misspelled words):**
|
|
582
|
+
- SpellKit: 3,162 i/s (316μs per batch)
|
|
583
|
+
- Aspell: 433 i/s (2.31ms per batch)
|
|
584
|
+
- **SpellKit is 7.3x faster**
|
|
585
|
+
|
|
586
|
+
**Spell Checking (correct? on 26 words):**
|
|
587
|
+
- SpellKit: 263,279 i/s (3.8μs per batch)
|
|
588
|
+
- Aspell: 72,099 i/s (13.9μs per batch)
|
|
589
|
+
- **SpellKit is 3.65x faster**
|
|
590
|
+
|
|
591
|
+
**Latency Distribution (10,000 single-word suggestions):**
|
|
592
|
+
- SpellKit: p50=63μs, p95=69μs, p99=98μs
|
|
593
|
+
- Aspell: p50=105μs, p95=121μs, p99=182μs
|
|
594
|
+
- **SpellKit is 1.7x faster at p50, 1.75x faster at p95**
|
|
595
|
+
|
|
596
|
+
Both libraries provide high-quality spell checking, but SpellKit's SymSpell algorithm (O(1) lookup) offers significant performance advantages over Aspell's statistical approach, especially for high-throughput applications.
|
|
597
|
+
|
|
598
|
+
## Benchmarks
|
|
599
|
+
|
|
600
|
+
SpellKit includes comprehensive benchmarks to measure performance and compare with other spell checkers.
|
|
601
|
+
|
|
602
|
+
### Running Benchmarks
|
|
603
|
+
|
|
604
|
+
**Performance Benchmark** - Comprehensive SpellKit performance analysis:
|
|
605
|
+
```bash
|
|
606
|
+
bundle exec ruby benchmark/performance.rb
|
|
607
|
+
```
|
|
608
|
+
|
|
609
|
+
Measures:
|
|
610
|
+
- Single word suggestions with varying result limits
|
|
611
|
+
- Correction performance on mixed datasets
|
|
612
|
+
- Batch correction throughput
|
|
613
|
+
- Guard/protection overhead
|
|
614
|
+
- Latency distribution (p50, p95, p99)
|
|
615
|
+
- Raw throughput (ops/sec)
|
|
616
|
+
|
|
617
|
+
**Aspell Comparison** - Direct comparison with Aspell:
|
|
618
|
+
```bash
|
|
619
|
+
# First install Aspell if needed:
|
|
620
|
+
# macOS: brew install aspell
|
|
621
|
+
# Ubuntu: sudo apt-get install aspell libaspell-dev
|
|
622
|
+
|
|
623
|
+
bundle exec ruby benchmark/comparison_aspell.rb
|
|
624
|
+
```
|
|
625
|
+
|
|
626
|
+
Compares SpellKit with Aspell on:
|
|
627
|
+
- Single word correction performance
|
|
628
|
+
- Spell checking (correctness tests)
|
|
629
|
+
- Latency distribution at scale
|
|
630
|
+
|
|
631
|
+
See [benchmark/README.md](benchmark/README.md) for detailed results and analysis.
|
|
632
|
+
|
|
633
|
+
### Why These Benchmarks?
|
|
634
|
+
|
|
635
|
+
**SpellKit vs Aspell**: Both provide fuzzy matching and suggestions for misspelled words, but use different algorithms:
|
|
636
|
+
- **SpellKit (SymSpell)**: O(1) lookup complexity, optimized for speed with large dictionaries
|
|
637
|
+
- **Aspell**: Statistical scoring with phonetic similarity, good for natural language
|
|
638
|
+
|
|
639
|
+
The comparison shows SpellKit's performance advantage while solving the same problem.
|
|
640
|
+
|
|
641
|
+
## Building Dictionaries
|
|
642
|
+
|
|
643
|
+
Create your dictionary from your corpus:
|
|
644
|
+
|
|
645
|
+
```ruby
|
|
646
|
+
# example_builder.rb
|
|
647
|
+
require "set"
|
|
648
|
+
|
|
649
|
+
counts = Hash.new(0)
|
|
650
|
+
|
|
651
|
+
# Read your corpus
|
|
652
|
+
File.foreach("corpus.txt") do |line|
|
|
653
|
+
line.downcase.split(/\W+/).each do |word|
|
|
654
|
+
next if word.length < 3
|
|
655
|
+
counts[word] += 1
|
|
656
|
+
end
|
|
657
|
+
end
|
|
658
|
+
|
|
659
|
+
# Filter by minimum count and write
|
|
660
|
+
min_count = 5
|
|
661
|
+
File.open("dictionary.tsv", "w") do |f|
|
|
662
|
+
counts.select { |_, count| count >= min_count }
|
|
663
|
+
.sort_by { |_, count| -count }
|
|
664
|
+
.each { |term, count| f.puts "#{term}\t#{count}" }
|
|
665
|
+
end
|
|
666
|
+
```
|
|
667
|
+
|
|
668
|
+
## Development
|
|
669
|
+
|
|
670
|
+
After checking out the repo:
|
|
671
|
+
|
|
672
|
+
```bash
|
|
673
|
+
bundle install
|
|
674
|
+
bundle exec rake compile
|
|
675
|
+
bundle exec rake spec
|
|
676
|
+
```
|
|
677
|
+
|
|
678
|
+
To build the gem:
|
|
679
|
+
|
|
680
|
+
```bash
|
|
681
|
+
bundle exec rake build
|
|
682
|
+
```
|
|
683
|
+
|
|
684
|
+
## Platform Support
|
|
685
|
+
|
|
686
|
+
Pre-built gems available for:
|
|
687
|
+
- macOS (x86_64, arm64)
|
|
688
|
+
- Linux (glibc, musl)
|
|
689
|
+
- Ruby 3.1, 3.2, 3.3, 3.4
|
|
690
|
+
|
|
691
|
+
## Contributing
|
|
692
|
+
|
|
693
|
+
Bug reports and pull requests are welcome at https://github.com/scientist-labs/spellkit
|
|
694
|
+
|
|
695
|
+
## License
|
|
696
|
+
|
|
697
|
+
MIT License - see [LICENSE](LICENSE.txt) file for details.
|