spellkit 0.1.0.pre.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,6 +22,10 @@ struct CheckerState {
22
22
  loaded_at: Option<u64>,
23
23
  dictionary_size: usize,
24
24
  edit_distance: usize,
25
+ skipped_malformed: usize,
26
+ skipped_multiword: usize,
27
+ skipped_invalid_freq: usize,
28
+ skipped_duplicates: usize,
25
29
  }
26
30
 
27
31
  impl CheckerState {
@@ -34,10 +38,63 @@ impl CheckerState {
34
38
  loaded_at: None,
35
39
  dictionary_size: 0,
36
40
  edit_distance: 1,
41
+ skipped_malformed: 0,
42
+ skipped_multiword: 0,
43
+ skipped_invalid_freq: 0,
44
+ skipped_duplicates: 0,
37
45
  }
38
46
  }
39
47
  }
40
48
 
49
+ // Helper function to correct a single word
50
+ // Returns the corrected word or the original if no correction is appropriate
51
+ fn correct_word(
52
+ state: &CheckerState,
53
+ symspell: &SymSpell,
54
+ word: &str,
55
+ use_guard: bool,
56
+ ) -> String {
57
+ // Check if word is protected
58
+ if use_guard {
59
+ let normalized = SymSpell::normalize_word(word);
60
+ if state.guards.is_protected_normalized(word, &normalized) {
61
+ return word.to_string();
62
+ }
63
+ }
64
+
65
+ let suggestions = symspell.suggestions(word, 5);
66
+
67
+ // If exact match exists, return canonical form from dictionary
68
+ if !suggestions.is_empty() && suggestions[0].distance == 0 {
69
+ return suggestions[0].term.clone();
70
+ }
71
+
72
+ // Get original word's frequency (if it exists in dictionary)
73
+ let original_freq = symspell.get_frequency(word);
74
+
75
+ // Find best correction with frequency threshold
76
+ for suggestion in &suggestions {
77
+ if suggestion.distance <= state.edit_distance {
78
+ // Apply frequency threshold
79
+ let passes_threshold = match original_freq {
80
+ // Word not in dictionary: require suggestion frequency >= absolute threshold
81
+ None => suggestion.frequency as f64 >= state.frequency_threshold,
82
+ // Word in dictionary: require suggestion frequency >= threshold * original frequency
83
+ Some(orig_freq) => {
84
+ suggestion.frequency as f64 >= state.frequency_threshold * orig_freq as f64
85
+ }
86
+ };
87
+
88
+ if passes_threshold {
89
+ return suggestion.term.clone();
90
+ }
91
+ }
92
+ }
93
+
94
+ // No suggestions passed the threshold
95
+ word.to_string()
96
+ }
97
+
41
98
  impl Checker {
42
99
  fn new() -> Self {
43
100
  Self {
@@ -54,56 +111,129 @@ impl Checker {
54
111
  .map_err(|_| Error::new(ruby.exception_arg_error(), "dictionary_path is required"))?
55
112
  )?;
56
113
 
57
- let content = std::fs::read_to_string(&dictionary_path)
58
- .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("Failed to read dictionary file: {}", e)))?;
114
+ // Optional: edit distance
115
+ let edit_dist: usize = config.get("edit_distance")
116
+ .and_then(|v: Value| TryConvert::try_convert(v).ok())
117
+ .unwrap_or(1);
59
118
 
60
- // Optional: edit distance
61
- let edit_dist: usize = config.get("edit_distance")
62
- .and_then(|v: Value| TryConvert::try_convert(v).ok())
63
- .unwrap_or(1);
119
+ if edit_dist > 2 {
120
+ return Err(Error::new(ruby.exception_arg_error(), "edit_distance must be 1 or 2"));
121
+ }
64
122
 
65
- if edit_dist > 2 {
66
- return Err(Error::new(ruby.exception_arg_error(), "edit_distance must be 1 or 2"));
67
- }
123
+ // Stream dictionary loading: read line-by-line and add directly to SymSpell
124
+ // This avoids buffering the entire file and intermediate Vec allocation
125
+ let file = std::fs::File::open(&dictionary_path)
126
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("Failed to open dictionary file: {}", e)))?;
127
+
128
+ let reader = std::io::BufReader::new(file);
129
+ let mut symspell = SymSpell::new(edit_dist);
130
+ let mut dictionary_size = 0;
131
+ let mut skipped_malformed = 0;
132
+ let mut skipped_multiword = 0;
133
+ let mut skipped_invalid_freq = 0;
134
+ let mut skipped_duplicates = 0;
135
+
136
+ use std::io::BufRead;
137
+ for line in reader.lines() {
138
+ let line = line.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("Failed to read line: {}", e)))?;
139
+
140
+ // Try tab-separated first (allows multi-word terms), then space-separated (SymSpell format)
141
+ let parts: Vec<&str> = if line.contains('\t') {
142
+ line.split('\t').collect()
143
+ } else {
144
+ line.split_whitespace().collect()
145
+ };
146
+
147
+ // Validate we have exactly 2 columns (term and frequency)
148
+ if parts.len() != 2 {
149
+ skipped_malformed += 1;
150
+ continue;
151
+ }
152
+
153
+ let term = parts[0].trim();
154
+ let freq_str = parts[1].trim();
68
155
 
69
- let mut words = Vec::new();
70
- for line in content.lines() {
71
- let parts: Vec<&str> = line.split_whitespace().collect();
72
- if parts.len() == 2 {
73
- if let Ok(freq) = parts[1].parse::<u64>() {
74
- words.push((parts[0].to_string(), freq));
156
+ // Skip empty terms or frequencies
157
+ if term.is_empty() || freq_str.is_empty() {
158
+ skipped_malformed += 1;
159
+ continue;
75
160
  }
76
- }
77
- }
78
161
 
79
- let dictionary_size = words.len();
80
- let mut symspell = SymSpell::new(edit_dist);
81
- symspell.load_dictionary(words);
162
+ // Check for multi-word terms (SymSpell algorithm doesn't support phrases)
163
+ if term.contains(char::is_whitespace) {
164
+ skipped_multiword += 1;
165
+ continue;
166
+ }
82
167
 
83
- let mut guards = Guards::new();
168
+ // Parse frequency
169
+ match freq_str.parse::<u64>() {
170
+ Ok(freq) => {
171
+ let normalized = SymSpell::normalize_word(term);
172
+ let was_new = symspell.add_word(&normalized, term, freq);
173
+ if was_new {
174
+ dictionary_size += 1;
175
+ } else {
176
+ skipped_duplicates += 1;
177
+ }
178
+ }
179
+ Err(_) => {
180
+ skipped_invalid_freq += 1;
181
+ }
182
+ }
183
+ }
84
184
 
85
- // Load optional protected terms file
86
- if let Some(protected_path) = config.get("protected_path") {
87
- let path: String = TryConvert::try_convert(protected_path)?;
88
- if let Ok(content) = std::fs::read_to_string(path) {
185
+ let mut guards = Guards::new();
186
+
187
+ // Load optional protected terms file
188
+ if let Some(protected_path) = config.get("protected_path") {
189
+ let path: String = TryConvert::try_convert(protected_path)?;
190
+ let content = std::fs::read_to_string(&path)
191
+ .map_err(|e| Error::new(ruby.exception_runtime_error(),
192
+ format!("Failed to read protected terms file '{}': {}", path, e)))?;
89
193
  guards.load_protected(&content);
90
194
  }
91
- }
92
195
 
93
- // Load optional protected patterns
94
- if let Some(patterns_value) = config.get("protected_patterns") {
95
- let patterns: RArray = TryConvert::try_convert(patterns_value)?;
96
- for pattern_value in patterns.into_iter() {
97
- let pattern: String = TryConvert::try_convert(pattern_value)?;
98
- guards.add_pattern(&pattern)
99
- .map_err(|e| Error::new(ruby.exception_arg_error(), e))?;
196
+ // Load optional protected patterns
197
+ if let Some(patterns_value) = config.get("protected_patterns") {
198
+ let patterns: RArray = TryConvert::try_convert(patterns_value)?;
199
+ for pattern_value in patterns.into_iter() {
200
+ let pattern_hash: RHash = TryConvert::try_convert(pattern_value)?;
201
+
202
+ let source: String = TryConvert::try_convert(
203
+ pattern_hash.fetch::<_, Value>("source")
204
+ .map_err(|_| Error::new(ruby.exception_arg_error(), "pattern hash missing 'source' key"))?
205
+ )?;
206
+
207
+ let case_insensitive: bool = pattern_hash.get("case_insensitive")
208
+ .and_then(|v: Value| TryConvert::try_convert(v).ok())
209
+ .unwrap_or(false);
210
+
211
+ let multiline: bool = pattern_hash.get("multiline")
212
+ .and_then(|v: Value| TryConvert::try_convert(v).ok())
213
+ .unwrap_or(false);
214
+
215
+ let extended: bool = pattern_hash.get("extended")
216
+ .and_then(|v: Value| TryConvert::try_convert(v).ok())
217
+ .unwrap_or(false);
218
+
219
+ guards.add_pattern_with_flags(&source, case_insensitive, multiline, extended)
220
+ .map_err(|e| Error::new(ruby.exception_arg_error(), e))?;
221
+ }
100
222
  }
101
- }
102
223
 
103
- // Optional frequency threshold
104
- let frequency_threshold: f64 = config.get("frequency_threshold")
105
- .and_then(|v: Value| TryConvert::try_convert(v).ok())
106
- .unwrap_or(10.0);
224
+ // Optional frequency threshold
225
+ let frequency_threshold: f64 = config.get("frequency_threshold")
226
+ .and_then(|v: Value| TryConvert::try_convert(v).ok())
227
+ .unwrap_or(10.0);
228
+
229
+ // Validate frequency threshold
230
+ if !frequency_threshold.is_finite() {
231
+ return Err(Error::new(ruby.exception_arg_error(), "frequency_threshold must be finite (not NaN or Infinity)"));
232
+ }
233
+
234
+ if frequency_threshold < 0.0 {
235
+ return Err(Error::new(ruby.exception_arg_error(), format!("frequency_threshold must be non-negative, got: {}", frequency_threshold)));
236
+ }
107
237
 
108
238
  let loaded_at = SystemTime::now()
109
239
  .duration_since(UNIX_EPOCH)
@@ -118,11 +248,15 @@ impl Checker {
118
248
  state.loaded_at = loaded_at;
119
249
  state.dictionary_size = dictionary_size;
120
250
  state.edit_distance = edit_dist;
251
+ state.skipped_malformed = skipped_malformed;
252
+ state.skipped_multiword = skipped_multiword;
253
+ state.skipped_invalid_freq = skipped_invalid_freq;
254
+ state.skipped_duplicates = skipped_duplicates;
121
255
 
122
256
  Ok(())
123
257
  }
124
258
 
125
- fn suggest(&self, word: String, max: Option<usize>) -> Result<RArray, Error> {
259
+ fn suggestions(&self, word: String, max: Option<usize>) -> Result<RArray, Error> {
126
260
  let ruby = Ruby::get().unwrap();
127
261
  let max_suggestions = max.unwrap_or(5);
128
262
  let state = self.state.read().unwrap();
@@ -132,7 +266,7 @@ impl Checker {
132
266
  }
133
267
 
134
268
  if let Some(ref symspell) = state.symspell {
135
- let suggestions = symspell.suggest(&word, max_suggestions);
269
+ let suggestions = symspell.suggestions(&word, max_suggestions);
136
270
  let result = RArray::new();
137
271
 
138
272
  for suggestion in suggestions {
@@ -149,7 +283,7 @@ impl Checker {
149
283
  }
150
284
  }
151
285
 
152
- fn correct_if_unknown(&self, word: String, use_guard: Option<bool>) -> Result<String, Error> {
286
+ fn correct(&self, word: String) -> Result<bool, Error> {
153
287
  let ruby = Ruby::get().unwrap();
154
288
  let state = self.state.read().unwrap();
155
289
 
@@ -157,49 +291,52 @@ impl Checker {
157
291
  return Err(Error::new(ruby.exception_runtime_error(), "Dictionary not loaded. Call load! first"));
158
292
  }
159
293
 
160
- // Check if word is protected
161
- if use_guard.unwrap_or(false) {
162
- let normalized = SymSpell::normalize_word(&word);
163
- if state.guards.is_protected_normalized(&word, &normalized) {
164
- return Ok(word);
165
- }
166
- }
167
-
168
294
  if let Some(ref symspell) = state.symspell {
169
- let suggestions = symspell.suggest(&word, 5);
295
+ Ok(symspell.contains(&word))
296
+ } else {
297
+ Err(Error::new(ruby.exception_runtime_error(), "SymSpell not initialized"))
298
+ }
299
+ }
170
300
 
171
- // If exact match exists, return original
172
- if !suggestions.is_empty() && suggestions[0].distance == 0 {
173
- return Ok(word);
174
- }
301
+ fn correct_if_unknown(&self, word: String, use_guard: Option<bool>) -> Result<String, Error> {
302
+ let ruby = Ruby::get().unwrap();
303
+ let state = self.state.read().unwrap();
175
304
 
176
- // Find best correction with frequency gating
177
- for suggestion in &suggestions {
178
- if suggestion.distance <= 1 {
179
- // Check frequency threshold - correction should be significantly more common
180
- // Since we don't have the original word's frequency, we'll just take any ED=1 match
181
- // In a full implementation, we'd check if suggestion.frequency >= threshold * original_freq
182
- return Ok(suggestion.term.clone());
183
- }
184
- }
305
+ if !state.loaded {
306
+ return Err(Error::new(ruby.exception_runtime_error(), "Dictionary not loaded. Call load! first"));
307
+ }
185
308
 
186
- Ok(word)
309
+ if let Some(ref symspell) = state.symspell {
310
+ Ok(correct_word(&state, symspell, &word, use_guard.unwrap_or(false)))
187
311
  } else {
188
312
  Err(Error::new(ruby.exception_runtime_error(), "SymSpell not initialized"))
189
313
  }
190
314
  }
191
315
 
192
316
  fn correct_tokens(&self, tokens: RArray, use_guard: Option<bool>) -> Result<RArray, Error> {
193
- let result = RArray::new();
194
- let guard = use_guard.unwrap_or(false);
317
+ // Optimize batch correction by acquiring lock once for all tokens
318
+ // instead of calling correct_if_unknown per token (which re-locks each time)
319
+ let ruby = Ruby::get().unwrap();
320
+ let state = self.state.read().unwrap();
321
+ let use_guard = use_guard.unwrap_or(false);
195
322
 
196
- for token in tokens.into_iter() {
197
- let word: String = TryConvert::try_convert(token)?;
198
- let corrected = self.correct_if_unknown(word, Some(guard))?;
199
- result.push(corrected)?;
323
+ if !state.loaded {
324
+ return Err(Error::new(ruby.exception_runtime_error(), "Dictionary not loaded. Call load! first"));
200
325
  }
201
326
 
202
- Ok(result)
327
+ let result = RArray::new();
328
+
329
+ if let Some(ref symspell) = state.symspell {
330
+ for token in tokens.into_iter() {
331
+ let word: String = TryConvert::try_convert(token)?;
332
+ let corrected = correct_word(&state, symspell, &word, use_guard);
333
+ result.push(corrected)?;
334
+ }
335
+
336
+ Ok(result)
337
+ } else {
338
+ Err(Error::new(ruby.exception_runtime_error(), "SymSpell not initialized"))
339
+ }
203
340
  }
204
341
 
205
342
  fn stats(&self) -> Result<RHash, Error> {
@@ -214,6 +351,10 @@ impl Checker {
214
351
  stats.aset("loaded", true)?;
215
352
  stats.aset("dictionary_size", state.dictionary_size)?;
216
353
  stats.aset("edit_distance", state.edit_distance)?;
354
+ stats.aset("skipped_malformed", state.skipped_malformed)?;
355
+ stats.aset("skipped_multiword", state.skipped_multiword)?;
356
+ stats.aset("skipped_invalid_freq", state.skipped_invalid_freq)?;
357
+ stats.aset("skipped_duplicates", state.skipped_duplicates)?;
217
358
 
218
359
  if let Some(loaded_at) = state.loaded_at {
219
360
  stats.aset("loaded_at", loaded_at)?;
@@ -245,8 +386,9 @@ fn init(_ruby: &Ruby) -> Result<(), Error> {
245
386
 
246
387
  checker_class.define_singleton_method("new", function!(Checker::new, 0))?;
247
388
  checker_class.define_method("load!", method!(Checker::load_full, 1))?;
248
- checker_class.define_method("suggest", method!(Checker::suggest, 2))?;
249
- checker_class.define_method("correct_if_unknown", method!(Checker::correct_if_unknown, 2))?;
389
+ checker_class.define_method("suggestions", method!(Checker::suggestions, 2))?;
390
+ checker_class.define_method("correct?", method!(Checker::correct, 1))?;
391
+ checker_class.define_method("correct", method!(Checker::correct_if_unknown, 2))?;
250
392
  checker_class.define_method("correct_tokens", method!(Checker::correct_tokens, 2))?;
251
393
  checker_class.define_method("stats", method!(Checker::stats, 0))?;
252
394
  checker_class.define_method("healthcheck", method!(Checker::healthcheck, 0))?;
@@ -2,6 +2,12 @@ use hashbrown::{HashMap, HashSet};
2
2
  use std::cmp::Ordering;
3
3
  use unicode_normalization::UnicodeNormalization;
4
4
 
5
/// Dictionary record stored for one normalized word.
#[derive(Debug, Clone)]
pub struct WordEntry {
    /// Spelling reported back in suggestions (for example `"I"` for the key `"i"`).
    pub canonical: String,
    /// Frequency of the word; duplicates of the same normalized key are summed into it.
    pub frequency: u64,
}
10
+
5
11
  #[derive(Debug, Clone)]
6
12
  pub struct Suggestion {
7
13
  pub term: String,
@@ -44,7 +50,7 @@ impl Eq for Suggestion {}
44
50
 
45
51
  pub struct SymSpell {
46
52
  deletes: HashMap<String, HashSet<String>>,
47
- words: HashMap<String, u64>,
53
+ words: HashMap<String, WordEntry>,
48
54
  max_edit_distance: usize,
49
55
  }
50
56
 
@@ -64,23 +70,44 @@ impl SymSpell {
64
70
  .to_lowercase()
65
71
  }
66
72
 
67
- pub fn load_dictionary(&mut self, words: Vec<(String, u64)>) {
68
- for (word, freq) in words {
69
- let normalized = Self::normalize_word(&word);
70
- self.add_word(&normalized, freq);
71
- }
72
- }
73
+ pub fn add_word(&mut self, normalized: &str, canonical: &str, frequency: u64) -> bool {
74
+ let normalized_key = normalized.to_string();
73
75
 
74
- pub fn add_word(&mut self, word: &str, frequency: u64) {
75
- self.words.insert(word.to_string(), frequency);
76
+ let was_new = if let Some(existing) = self.words.get_mut(&normalized_key) {
77
+ // Duplicate: sum frequencies and keep highest-frequency canonical form
78
+ let new_total_freq = existing.frequency + frequency;
79
+
80
+ // Keep the canonical form from the higher-frequency variant
81
+ if frequency > existing.frequency {
82
+ existing.canonical = canonical.to_string();
83
+ }
76
84
 
77
- let deletes = self.get_deletes(word, self.max_edit_distance);
78
- for delete in deletes {
79
- self.deletes
80
- .entry(delete)
81
- .or_insert_with(HashSet::new)
82
- .insert(word.to_string());
85
+ existing.frequency = new_total_freq;
86
+ false
87
+ } else {
88
+ // New entry
89
+ self.words.insert(
90
+ normalized_key.clone(),
91
+ WordEntry {
92
+ canonical: canonical.to_string(),
93
+ frequency,
94
+ },
95
+ );
96
+ true
97
+ };
98
+
99
+ // Only generate deletes for new entries (avoid redundant work)
100
+ if was_new {
101
+ let deletes = self.get_deletes(normalized, self.max_edit_distance);
102
+ for delete in deletes {
103
+ self.deletes
104
+ .entry(delete)
105
+ .or_insert_with(HashSet::new)
106
+ .insert(normalized_key.clone());
107
+ }
83
108
  }
109
+
110
+ was_new
84
111
  }
85
112
 
86
113
  fn get_deletes(&self, word: &str, edit_distance: usize) -> HashSet<String> {
@@ -101,8 +128,10 @@ impl SymSpell {
101
128
  processed.insert(item.clone());
102
129
 
103
130
  for delete in self.generate_deletes(&item) {
104
- if delete.len() >= 1 {
105
- deletes.insert(delete.clone());
131
+ deletes.insert(delete.clone());
132
+
133
+ // Only continue processing non-empty strings to avoid infinite loops
134
+ if !delete.is_empty() {
106
135
  temp_queue.push(delete);
107
136
  }
108
137
  }
@@ -130,13 +159,23 @@ impl SymSpell {
130
159
  deletes
131
160
  }
132
161
 
133
- pub fn suggest(&self, word: &str, max_suggestions: usize) -> Vec<Suggestion> {
162
+ pub fn contains(&self, word: &str) -> bool {
163
+ let normalized = Self::normalize_word(word);
164
+ self.words.contains_key(&normalized)
165
+ }
166
+
167
+ pub fn get_frequency(&self, word: &str) -> Option<u64> {
168
+ let normalized = Self::normalize_word(word);
169
+ self.words.get(&normalized).map(|entry| entry.frequency)
170
+ }
171
+
172
+ pub fn suggestions(&self, word: &str, max_suggestions: usize) -> Vec<Suggestion> {
134
173
  let normalized = Self::normalize_word(word);
135
174
  let mut suggestions = Vec::new();
136
175
  let mut seen = HashSet::new();
137
176
 
138
- if let Some(&freq) = self.words.get(&normalized) {
139
- suggestions.push(Suggestion::new(normalized.clone(), 0, freq));
177
+ if let Some(entry) = self.words.get(&normalized) {
178
+ suggestions.push(Suggestion::new(entry.canonical.clone(), 0, entry.frequency));
140
179
  seen.insert(normalized.clone());
141
180
  }
142
181
 
@@ -145,10 +184,10 @@ impl SymSpell {
145
184
  for delete in &input_deletes {
146
185
  // Check if this delete is itself a dictionary word (important for finding words shorter than input)
147
186
  if !seen.contains(delete) {
148
- if let Some(&freq) = self.words.get(delete) {
187
+ if let Some(entry) = self.words.get(delete) {
149
188
  let distance = self.edit_distance(&normalized, delete);
150
189
  if distance <= self.max_edit_distance {
151
- suggestions.push(Suggestion::new(delete.clone(), distance, freq));
190
+ suggestions.push(Suggestion::new(entry.canonical.clone(), distance, entry.frequency));
152
191
  seen.insert(delete.clone());
153
192
  }
154
193
  }
@@ -163,8 +202,8 @@ impl SymSpell {
163
202
 
164
203
  let distance = self.edit_distance(&normalized, candidate);
165
204
  if distance <= self.max_edit_distance {
166
- if let Some(&freq) = self.words.get(candidate) {
167
- suggestions.push(Suggestion::new(candidate.clone(), distance, freq));
205
+ if let Some(entry) = self.words.get(candidate) {
206
+ suggestions.push(Suggestion::new(entry.canonical.clone(), distance, entry.frequency));
168
207
  seen.insert(candidate.clone());
169
208
  }
170
209
  }
@@ -180,8 +219,8 @@ impl SymSpell {
180
219
 
181
220
  let distance = self.edit_distance(&normalized, candidate);
182
221
  if distance <= self.max_edit_distance {
183
- if let Some(&freq) = self.words.get(candidate) {
184
- suggestions.push(Suggestion::new(candidate.clone(), distance, freq));
222
+ if let Some(entry) = self.words.get(candidate) {
223
+ suggestions.push(Suggestion::new(entry.canonical.clone(), distance, entry.frequency));
185
224
  seen.insert(candidate.clone());
186
225
  }
187
226
  }
@@ -252,13 +291,59 @@ mod tests {
252
291
  #[test]
253
292
  fn test_suggestions() {
254
293
  let mut symspell = SymSpell::new(2);
255
- symspell.add_word("hello", 1000);
256
- symspell.add_word("hell", 500);
257
- symspell.add_word("help", 750);
294
+ symspell.add_word("hello", "hello", 1000);
295
+ symspell.add_word("hell", "hell", 500);
296
+ symspell.add_word("help", "help", 750);
258
297
 
259
- let suggestions = symspell.suggest("helo", 3);
298
+ let suggestions = symspell.suggestions("helo", 3);
260
299
  assert!(!suggestions.is_empty());
261
300
  assert_eq!(suggestions[0].term, "hello");
262
301
  assert_eq!(suggestions[0].distance, 1);
263
302
  }
303
+
304
/// Single-character inputs must still produce suggestions, and suggestions must
/// carry the canonical spelling rather than the normalized key.
#[test]
fn test_single_character_corrections() {
    let mut symspell = SymSpell::new(1);
    for &(normalized, canonical, freq) in &[("a", "a", 10000), ("i", "I", 8000), ("o", "o", 6000)] {
        symspell.add_word(normalized, canonical, freq);
    }

    let for_x = symspell.suggestions("x", 5);
    assert!(!for_x.is_empty(), "Single-character corrections should work");
    assert!(for_x.iter().any(|s| s.term == "a"), "Should suggest 'a' for 'x'");

    let for_j = symspell.suggestions("j", 5);
    assert!(!for_j.is_empty(), "Should find suggestions for 'j'");
    assert!(for_j.iter().any(|s| s.term == "I"), "Should suggest canonical 'I' (not 'i')");
}
319
+
320
/// Duplicate keys are merged: frequencies add up and the canonical spelling
/// comes from the higher-frequency variant, whichever order they arrive in.
#[test]
fn test_duplicate_entries_keep_highest_frequency_canonical() {
    // High-frequency variant first: the later low-frequency variant must not
    // take over the canonical spelling.
    let mut high_first = SymSpell::new(1);
    high_first.add_word("hello", "hello", 10000);
    high_first.add_word("hello", "HELLO", 100);

    let found = high_first.suggestions("hello", 1);
    assert_eq!(found.len(), 1);
    let merged = &found[0];
    assert_eq!(merged.term, "hello", "Should keep high-frequency 'hello' as canonical, not 'HELLO'");
    assert_eq!(merged.frequency, 10100, "Should sum frequencies: 10000 + 100 = 10100");

    // Low-frequency variant first: the later high-frequency variant replaces
    // the canonical spelling.
    let mut low_first = SymSpell::new(1);
    low_first.add_word("world", "WORLD", 100);
    low_first.add_word("world", "world", 10000);

    let found = low_first.suggestions("world", 1);
    assert_eq!(found.len(), 1);
    let merged = &found[0];
    assert_eq!(merged.term, "world", "Should update to high-frequency 'world' canonical");
    assert_eq!(merged.frequency, 10100, "Should sum frequencies");
}
264
349
  }