spellkit 0.2.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +697 -0
- data/ext/spellkit/Cargo.toml +19 -0
- data/ext/spellkit/extconf.rb +4 -0
- data/ext/spellkit/src/guards.rs +75 -0
- data/ext/spellkit/src/lib.rs +393 -0
- data/ext/spellkit/src/symspell.rs +349 -0
- data/lib/spellkit/3.1/spellkit.bundle +0 -0
- data/lib/spellkit/3.2/spellkit.bundle +0 -0
- data/lib/spellkit/3.3/spellkit.bundle +0 -0
- data/lib/spellkit/3.4/spellkit.bundle +0 -0
- data/lib/spellkit/version.rb +5 -0
- data/lib/spellkit.rb +368 -0
- metadata +196 -0
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
use hashbrown::{HashMap, HashSet};
|
|
2
|
+
use std::cmp::Ordering;
|
|
3
|
+
use unicode_normalization::UnicodeNormalization;
|
|
4
|
+
|
|
5
|
+
/// A dictionary entry, stored in `SymSpell` under its normalized key.
#[derive(Clone, Debug)]
pub struct WordEntry {
    /// Display form of the word; this is the text copied into `Suggestion::term`.
    pub canonical: String,
    /// Accumulated occurrence count for the word.
    pub frequency: u64,
}
|
|
10
|
+
|
|
11
|
+
/// A single spelling candidate together with the data used to rank it.
#[derive(Clone, Debug)]
pub struct Suggestion {
    /// Text of the suggested word.
    pub term: String,
    /// Edit distance between the queried word and this candidate.
    pub distance: usize,
    /// Frequency of the candidate in the dictionary.
    pub frequency: u64,
}

impl Suggestion {
    /// Builds a suggestion from its three components.
    pub fn new(term: String, distance: usize, frequency: u64) -> Self {
        Suggestion {
            term,
            distance,
            frequency,
        }
    }
}

impl PartialEq for Suggestion {
    /// Two suggestions are equal when all three fields match, which is exactly
    /// the case in which `cmp` returns `Ordering::Equal`.
    fn eq(&self, other: &Self) -> bool {
        // Cheap integer comparisons first, string comparison last.
        self.distance == other.distance
            && self.frequency == other.frequency
            && self.term == other.term
    }
}

impl Eq for Suggestion {}

impl PartialOrd for Suggestion {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl Ord for Suggestion {
    /// Orders by ascending distance, then descending frequency, then ascending
    /// term, so that sorting puts the best candidate first.
    fn cmp(&self, other: &Self) -> Ordering {
        // `Reverse` flips the frequency comparison so larger counts sort earlier.
        let lhs = (
            self.distance,
            std::cmp::Reverse(self.frequency),
            self.term.as_str(),
        );
        let rhs = (
            other.distance,
            std::cmp::Reverse(other.frequency),
            other.term.as_str(),
        );
        lhs.cmp(&rhs)
    }
}
|
|
50
|
+
|
|
51
|
+
pub struct SymSpell {
|
|
52
|
+
deletes: HashMap<String, HashSet<String>>,
|
|
53
|
+
words: HashMap<String, WordEntry>,
|
|
54
|
+
max_edit_distance: usize,
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
impl SymSpell {
|
|
58
|
+
pub fn new(max_edit_distance: usize) -> Self {
|
|
59
|
+
Self {
|
|
60
|
+
deletes: HashMap::new(),
|
|
61
|
+
words: HashMap::new(),
|
|
62
|
+
max_edit_distance,
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
pub fn normalize_word(word: &str) -> String {
|
|
67
|
+
word.nfkd()
|
|
68
|
+
.filter(|c| !c.is_control() && !c.is_whitespace())
|
|
69
|
+
.collect::<String>()
|
|
70
|
+
.to_lowercase()
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
pub fn add_word(&mut self, normalized: &str, canonical: &str, frequency: u64) -> bool {
|
|
74
|
+
let normalized_key = normalized.to_string();
|
|
75
|
+
|
|
76
|
+
let was_new = if let Some(existing) = self.words.get_mut(&normalized_key) {
|
|
77
|
+
// Duplicate: sum frequencies and keep highest-frequency canonical form
|
|
78
|
+
let new_total_freq = existing.frequency + frequency;
|
|
79
|
+
|
|
80
|
+
// Keep the canonical form from the higher-frequency variant
|
|
81
|
+
if frequency > existing.frequency {
|
|
82
|
+
existing.canonical = canonical.to_string();
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
existing.frequency = new_total_freq;
|
|
86
|
+
false
|
|
87
|
+
} else {
|
|
88
|
+
// New entry
|
|
89
|
+
self.words.insert(
|
|
90
|
+
normalized_key.clone(),
|
|
91
|
+
WordEntry {
|
|
92
|
+
canonical: canonical.to_string(),
|
|
93
|
+
frequency,
|
|
94
|
+
},
|
|
95
|
+
);
|
|
96
|
+
true
|
|
97
|
+
};
|
|
98
|
+
|
|
99
|
+
// Only generate deletes for new entries (avoid redundant work)
|
|
100
|
+
if was_new {
|
|
101
|
+
let deletes = self.get_deletes(normalized, self.max_edit_distance);
|
|
102
|
+
for delete in deletes {
|
|
103
|
+
self.deletes
|
|
104
|
+
.entry(delete)
|
|
105
|
+
.or_insert_with(HashSet::new)
|
|
106
|
+
.insert(normalized_key.clone());
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
was_new
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
fn get_deletes(&self, word: &str, edit_distance: usize) -> HashSet<String> {
|
|
114
|
+
let mut deletes = HashSet::new();
|
|
115
|
+
if edit_distance == 0 {
|
|
116
|
+
return deletes;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
let mut queue = vec![word.to_string()];
|
|
120
|
+
let mut processed = HashSet::new();
|
|
121
|
+
|
|
122
|
+
for _ in 0..edit_distance {
|
|
123
|
+
let mut temp_queue = Vec::new();
|
|
124
|
+
for item in queue {
|
|
125
|
+
if processed.contains(&item) {
|
|
126
|
+
continue;
|
|
127
|
+
}
|
|
128
|
+
processed.insert(item.clone());
|
|
129
|
+
|
|
130
|
+
for delete in self.generate_deletes(&item) {
|
|
131
|
+
deletes.insert(delete.clone());
|
|
132
|
+
|
|
133
|
+
// Only continue processing non-empty strings to avoid infinite loops
|
|
134
|
+
if !delete.is_empty() {
|
|
135
|
+
temp_queue.push(delete);
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
queue = temp_queue;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
deletes
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
fn generate_deletes(&self, word: &str) -> Vec<String> {
|
|
146
|
+
let chars: Vec<char> = word.chars().collect();
|
|
147
|
+
let mut deletes = Vec::new();
|
|
148
|
+
|
|
149
|
+
for i in 0..chars.len() {
|
|
150
|
+
let mut new_word = String::new();
|
|
151
|
+
for (j, &ch) in chars.iter().enumerate() {
|
|
152
|
+
if j != i {
|
|
153
|
+
new_word.push(ch);
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
deletes.push(new_word);
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
deletes
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
pub fn contains(&self, word: &str) -> bool {
|
|
163
|
+
let normalized = Self::normalize_word(word);
|
|
164
|
+
self.words.contains_key(&normalized)
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
pub fn get_frequency(&self, word: &str) -> Option<u64> {
|
|
168
|
+
let normalized = Self::normalize_word(word);
|
|
169
|
+
self.words.get(&normalized).map(|entry| entry.frequency)
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
pub fn suggestions(&self, word: &str, max_suggestions: usize) -> Vec<Suggestion> {
|
|
173
|
+
let normalized = Self::normalize_word(word);
|
|
174
|
+
let mut suggestions = Vec::new();
|
|
175
|
+
let mut seen = HashSet::new();
|
|
176
|
+
|
|
177
|
+
if let Some(entry) = self.words.get(&normalized) {
|
|
178
|
+
suggestions.push(Suggestion::new(entry.canonical.clone(), 0, entry.frequency));
|
|
179
|
+
seen.insert(normalized.clone());
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
let input_deletes = self.get_deletes(&normalized, self.max_edit_distance);
|
|
183
|
+
|
|
184
|
+
for delete in &input_deletes {
|
|
185
|
+
// Check if this delete is itself a dictionary word (important for finding words shorter than input)
|
|
186
|
+
if !seen.contains(delete) {
|
|
187
|
+
if let Some(entry) = self.words.get(delete) {
|
|
188
|
+
let distance = self.edit_distance(&normalized, delete);
|
|
189
|
+
if distance <= self.max_edit_distance {
|
|
190
|
+
suggestions.push(Suggestion::new(entry.canonical.clone(), distance, entry.frequency));
|
|
191
|
+
seen.insert(delete.clone());
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
// Check the deletes map for candidates
|
|
197
|
+
if let Some(candidates) = self.deletes.get(delete) {
|
|
198
|
+
for candidate in candidates {
|
|
199
|
+
if seen.contains(candidate) {
|
|
200
|
+
continue;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
let distance = self.edit_distance(&normalized, candidate);
|
|
204
|
+
if distance <= self.max_edit_distance {
|
|
205
|
+
if let Some(entry) = self.words.get(candidate) {
|
|
206
|
+
suggestions.push(Suggestion::new(entry.canonical.clone(), distance, entry.frequency));
|
|
207
|
+
seen.insert(candidate.clone());
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
if let Some(candidates) = self.deletes.get(&normalized) {
|
|
215
|
+
for candidate in candidates {
|
|
216
|
+
if seen.contains(candidate) {
|
|
217
|
+
continue;
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
let distance = self.edit_distance(&normalized, candidate);
|
|
221
|
+
if distance <= self.max_edit_distance {
|
|
222
|
+
if let Some(entry) = self.words.get(candidate) {
|
|
223
|
+
suggestions.push(Suggestion::new(entry.canonical.clone(), distance, entry.frequency));
|
|
224
|
+
seen.insert(candidate.clone());
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
suggestions.sort();
|
|
231
|
+
suggestions.truncate(max_suggestions);
|
|
232
|
+
suggestions
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
fn edit_distance(&self, s1: &str, s2: &str) -> usize {
|
|
236
|
+
let len1 = s1.chars().count();
|
|
237
|
+
let len2 = s2.chars().count();
|
|
238
|
+
|
|
239
|
+
if len1 == 0 {
|
|
240
|
+
return len2;
|
|
241
|
+
}
|
|
242
|
+
if len2 == 0 {
|
|
243
|
+
return len1;
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
let s1_chars: Vec<char> = s1.chars().collect();
|
|
247
|
+
let s2_chars: Vec<char> = s2.chars().collect();
|
|
248
|
+
|
|
249
|
+
let mut prev_row: Vec<usize> = (0..=len2).collect();
|
|
250
|
+
let mut curr_row = vec![0; len2 + 1];
|
|
251
|
+
|
|
252
|
+
for i in 1..=len1 {
|
|
253
|
+
curr_row[0] = i;
|
|
254
|
+
|
|
255
|
+
for j in 1..=len2 {
|
|
256
|
+
let cost = if s1_chars[i - 1] == s2_chars[j - 1] {
|
|
257
|
+
0
|
|
258
|
+
} else {
|
|
259
|
+
1
|
|
260
|
+
};
|
|
261
|
+
|
|
262
|
+
curr_row[j] = std::cmp::min(
|
|
263
|
+
std::cmp::min(
|
|
264
|
+
prev_row[j] + 1, // deletion
|
|
265
|
+
curr_row[j - 1] + 1 // insertion
|
|
266
|
+
),
|
|
267
|
+
prev_row[j - 1] + cost // substitution
|
|
268
|
+
);
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
std::mem::swap(&mut prev_row, &mut curr_row);
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
prev_row[len2]
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Distances from "test" to an identical word, an insertion, a substitution and a mix.
    #[test]
    fn test_edit_distance() {
        let index = SymSpell::new(2);
        let cases = [("test", 0), ("tests", 1), ("tast", 1), ("toast", 2)];

        for &(other, expected) in cases.iter() {
            assert_eq!(index.edit_distance("test", other), expected);
        }
    }

    /// The highest-frequency word at distance 1 is ranked first.
    #[test]
    fn test_suggestions() {
        let mut index = SymSpell::new(2);
        for &(entry, count) in [("hello", 1000), ("hell", 500), ("help", 750)].iter() {
            index.add_word(entry, entry, count);
        }

        let ranked = index.suggestions("helo", 3);
        assert!(!ranked.is_empty());

        let best = &ranked[0];
        assert_eq!(best.term, "hello");
        assert_eq!(best.distance, 1);
    }

    /// One-letter inputs are matched against one-letter dictionary words.
    #[test]
    fn test_single_character_corrections() {
        let mut index = SymSpell::new(1);
        index.add_word("a", "a", 10000);
        index.add_word("i", "I", 8000);
        index.add_word("o", "o", 6000);

        let for_x = index.suggestions("x", 5);
        assert!(!for_x.is_empty(), "Single-character corrections should work");
        assert!(for_x.iter().any(|s| s.term == "a"), "Should suggest 'a' for 'x'");

        let for_j = index.suggestions("j", 5);
        assert!(!for_j.is_empty(), "Should find suggestions for 'j'");
        assert!(for_j.iter().any(|s| s.term == "I"), "Should suggest canonical 'I' (not 'i')");
    }

    /// Duplicate keys sum their frequencies and keep the higher-frequency canonical form.
    #[test]
    fn test_duplicate_entries_keep_highest_frequency_canonical() {
        // High-frequency lowercase first, low-frequency uppercase second:
        // the canonical form must stay lowercase.
        let mut high_first = SymSpell::new(1);
        high_first.add_word("hello", "hello", 10000);
        high_first.add_word("hello", "HELLO", 100);

        let result = high_first.suggestions("hello", 1);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].term, "hello", "Should keep high-frequency 'hello' as canonical, not 'HELLO'");
        assert_eq!(result[0].frequency, 10100, "Should sum frequencies: 10000 + 100 = 10100");

        // Reverse order: low-frequency uppercase first, high-frequency lowercase
        // second, which must replace the canonical form.
        let mut low_first = SymSpell::new(1);
        low_first.add_word("world", "WORLD", 100);
        low_first.add_word("world", "world", 10000);

        let result2 = low_first.suggestions("world", 1);
        assert_eq!(result2.len(), 1);
        assert_eq!(result2[0].term, "world", "Should update to high-frequency 'world' canonical");
        assert_eq!(result2[0].frequency, 10100, "Should sum frequencies");
    }
}
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|