spellkit 0.2.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ [package]
2
+ name = "spellkit"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+ authors = ["Chris Petersen <chris@petersen.io>"]
6
+ license = "MIT"
7
+ description = "Fast, safe typo correction for search-term extraction"
8
+
9
+ [lib]
10
+ name = "spellkit"
11
+ crate-type = ["cdylib"]
12
+
13
+ [dependencies]
14
+ magnus = { version = "0.7", features = ["rb-sys"] }
15
+ hashbrown = "0.15"
16
+ unicode-normalization = "0.1"
17
+ regex = "1.11"
18
+
19
+ [dev-dependencies]
@@ -0,0 +1,4 @@
1
+ require "mkmf"
2
+ require "rb_sys/mkmf"
3
+
4
+ create_rust_makefile("spellkit/spellkit")
@@ -0,0 +1,75 @@
1
+ use hashbrown::HashSet;
2
+ use regex::{Regex, RegexBuilder};
3
+ use crate::symspell::SymSpell;
4
+
5
+ #[derive(Debug, Clone)]
6
+ pub struct Guards {
7
+ protected_set: HashSet<String>,
8
+ protected_patterns: Vec<Regex>,
9
+ }
10
+
11
+ impl Guards {
12
+ pub fn new() -> Self {
13
+ Self {
14
+ protected_set: HashSet::new(),
15
+ protected_patterns: Vec::new(),
16
+ }
17
+ }
18
+
19
+ pub fn load_protected(&mut self, content: &str) {
20
+ for line in content.lines() {
21
+ let trimmed = line.trim();
22
+ if !trimmed.is_empty() && !trimmed.starts_with('#') {
23
+ // Store literal form
24
+ self.protected_set.insert(trimmed.to_string());
25
+ // Store lowercase form
26
+ self.protected_set.insert(trimmed.to_lowercase());
27
+ // Store normalized form (strips whitespace, converts to lowercase)
28
+ // This ensures variants like "newyork" are protected if "New York" is in the list
29
+ let normalized = SymSpell::normalize_word(trimmed);
30
+ self.protected_set.insert(normalized);
31
+ }
32
+ }
33
+ }
34
+
35
+ pub fn add_pattern_with_flags(
36
+ &mut self,
37
+ pattern: &str,
38
+ case_insensitive: bool,
39
+ multiline: bool,
40
+ extended: bool,
41
+ ) -> Result<(), String> {
42
+ match RegexBuilder::new(pattern)
43
+ .case_insensitive(case_insensitive)
44
+ .multi_line(multiline)
45
+ .ignore_whitespace(extended)
46
+ .build()
47
+ {
48
+ Ok(regex) => {
49
+ self.protected_patterns.push(regex);
50
+ Ok(())
51
+ }
52
+ Err(e) => Err(format!("Invalid regex pattern: {}", e)),
53
+ }
54
+ }
55
+
56
+ pub fn is_protected(&self, word: &str) -> bool {
57
+ let lower = word.to_lowercase();
58
+
59
+ if self.protected_set.contains(word) || self.protected_set.contains(&lower) {
60
+ return true;
61
+ }
62
+
63
+ for pattern in &self.protected_patterns {
64
+ if pattern.is_match(word) {
65
+ return true;
66
+ }
67
+ }
68
+
69
+ false
70
+ }
71
+
72
+ pub fn is_protected_normalized(&self, word: &str, normalized: &str) -> bool {
73
+ self.is_protected(word) || self.is_protected(normalized)
74
+ }
75
+ }
@@ -0,0 +1,393 @@
1
+ mod symspell;
2
+ mod guards;
3
+
4
+ use magnus::{class, define_module, function, method, prelude::*, Error, RArray, RHash, Ruby, Value, TryConvert};
5
+ use std::sync::{Arc, RwLock};
6
+ use symspell::SymSpell;
7
+ use guards::Guards;
8
+
9
+ use std::time::{SystemTime, UNIX_EPOCH};
10
+
11
+ #[derive(Clone)]
12
+ #[magnus::wrap(class = "SpellKit::Checker", free_immediately, size)]
13
+ struct Checker {
14
+ state: Arc<RwLock<CheckerState>>,
15
+ }
16
+
17
+ struct CheckerState {
18
+ symspell: Option<SymSpell>,
19
+ guards: Guards,
20
+ loaded: bool,
21
+ frequency_threshold: f64,
22
+ loaded_at: Option<u64>,
23
+ dictionary_size: usize,
24
+ edit_distance: usize,
25
+ skipped_malformed: usize,
26
+ skipped_multiword: usize,
27
+ skipped_invalid_freq: usize,
28
+ skipped_duplicates: usize,
29
+ }
30
+
31
+ impl CheckerState {
32
+ fn new() -> Self {
33
+ Self {
34
+ symspell: None,
35
+ guards: Guards::new(),
36
+ loaded: false,
37
+ frequency_threshold: 10.0,
38
+ loaded_at: None,
39
+ dictionary_size: 0,
40
+ edit_distance: 1,
41
+ skipped_malformed: 0,
42
+ skipped_multiword: 0,
43
+ skipped_invalid_freq: 0,
44
+ skipped_duplicates: 0,
45
+ }
46
+ }
47
+ }
48
+
49
+ // Helper function to correct a single word
50
+ // Returns the corrected word or the original if no correction is appropriate
51
+ fn correct_word(
52
+ state: &CheckerState,
53
+ symspell: &SymSpell,
54
+ word: &str,
55
+ ) -> String {
56
+ // Always check if word is protected
57
+ let normalized = SymSpell::normalize_word(word);
58
+ if state.guards.is_protected_normalized(word, &normalized) {
59
+ return word.to_string();
60
+ }
61
+
62
+ let suggestions = symspell.suggestions(word, 5);
63
+
64
+ // If exact match exists, return canonical form from dictionary
65
+ if !suggestions.is_empty() && suggestions[0].distance == 0 {
66
+ return suggestions[0].term.clone();
67
+ }
68
+
69
+ // Get original word's frequency (if it exists in dictionary)
70
+ let original_freq = symspell.get_frequency(word);
71
+
72
+ // Find best correction with frequency threshold
73
+ for suggestion in &suggestions {
74
+ if suggestion.distance <= state.edit_distance {
75
+ // Apply frequency threshold
76
+ let passes_threshold = match original_freq {
77
+ // Word not in dictionary: require suggestion frequency >= absolute threshold
78
+ None => suggestion.frequency as f64 >= state.frequency_threshold,
79
+ // Word in dictionary: require suggestion frequency >= threshold * original frequency
80
+ Some(orig_freq) => {
81
+ suggestion.frequency as f64 >= state.frequency_threshold * orig_freq as f64
82
+ }
83
+ };
84
+
85
+ if passes_threshold {
86
+ return suggestion.term.clone();
87
+ }
88
+ }
89
+ }
90
+
91
+ // No suggestions passed the threshold
92
+ word.to_string()
93
+ }
94
+
95
+ impl Checker {
96
+ fn new() -> Self {
97
+ Self {
98
+ state: Arc::new(RwLock::new(CheckerState::new())),
99
+ }
100
+ }
101
+
102
+ fn load_full(&self, config: RHash) -> Result<(), Error> {
103
+ let ruby = Ruby::get().unwrap();
104
+
105
+ // Required: dictionary path
106
+ let dictionary_path: String = TryConvert::try_convert(
107
+ config.fetch::<_, Value>("dictionary_path")
108
+ .map_err(|_| Error::new(ruby.exception_arg_error(), "dictionary_path is required"))?
109
+ )?;
110
+
111
+ // Optional: edit distance
112
+ let edit_dist: usize = config.get("edit_distance")
113
+ .and_then(|v: Value| TryConvert::try_convert(v).ok())
114
+ .unwrap_or(1);
115
+
116
+ if edit_dist > 2 {
117
+ return Err(Error::new(ruby.exception_arg_error(), "edit_distance must be 1 or 2"));
118
+ }
119
+
120
+ // Stream dictionary loading: read line-by-line and add directly to SymSpell
121
+ // This avoids buffering the entire file and intermediate Vec allocation
122
+ let file = std::fs::File::open(&dictionary_path)
123
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("Failed to open dictionary file: {}", e)))?;
124
+
125
+ let reader = std::io::BufReader::new(file);
126
+ let mut symspell = SymSpell::new(edit_dist);
127
+ let mut dictionary_size = 0;
128
+ let mut skipped_malformed = 0;
129
+ let mut skipped_multiword = 0;
130
+ let mut skipped_invalid_freq = 0;
131
+ let mut skipped_duplicates = 0;
132
+
133
+ use std::io::BufRead;
134
+ for line in reader.lines() {
135
+ let line = line.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("Failed to read line: {}", e)))?;
136
+
137
+ // Try tab-separated first (allows multi-word terms), then space-separated (SymSpell format)
138
+ let parts: Vec<&str> = if line.contains('\t') {
139
+ line.split('\t').collect()
140
+ } else {
141
+ line.split_whitespace().collect()
142
+ };
143
+
144
+ // Validate we have exactly 2 columns (term and frequency)
145
+ if parts.len() != 2 {
146
+ skipped_malformed += 1;
147
+ continue;
148
+ }
149
+
150
+ let term = parts[0].trim();
151
+ let freq_str = parts[1].trim();
152
+
153
+ // Skip empty terms or frequencies
154
+ if term.is_empty() || freq_str.is_empty() {
155
+ skipped_malformed += 1;
156
+ continue;
157
+ }
158
+
159
+ // Check for multi-word terms (SymSpell algorithm doesn't support phrases)
160
+ if term.contains(char::is_whitespace) {
161
+ skipped_multiword += 1;
162
+ continue;
163
+ }
164
+
165
+ // Parse frequency
166
+ match freq_str.parse::<u64>() {
167
+ Ok(freq) => {
168
+ let normalized = SymSpell::normalize_word(term);
169
+ let was_new = symspell.add_word(&normalized, term, freq);
170
+ if was_new {
171
+ dictionary_size += 1;
172
+ } else {
173
+ skipped_duplicates += 1;
174
+ }
175
+ }
176
+ Err(_) => {
177
+ skipped_invalid_freq += 1;
178
+ }
179
+ }
180
+ }
181
+
182
+ let mut guards = Guards::new();
183
+
184
+ // Load optional protected terms file
185
+ if let Some(protected_path) = config.get("protected_path") {
186
+ let path: String = TryConvert::try_convert(protected_path)?;
187
+ let content = std::fs::read_to_string(&path)
188
+ .map_err(|e| Error::new(ruby.exception_runtime_error(),
189
+ format!("Failed to read protected terms file '{}': {}", path, e)))?;
190
+ guards.load_protected(&content);
191
+ }
192
+
193
+ // Load optional protected patterns
194
+ if let Some(patterns_value) = config.get("protected_patterns") {
195
+ let patterns: RArray = TryConvert::try_convert(patterns_value)?;
196
+ for pattern_value in patterns.into_iter() {
197
+ let pattern_hash: RHash = TryConvert::try_convert(pattern_value)?;
198
+
199
+ let source: String = TryConvert::try_convert(
200
+ pattern_hash.fetch::<_, Value>("source")
201
+ .map_err(|_| Error::new(ruby.exception_arg_error(), "pattern hash missing 'source' key"))?
202
+ )?;
203
+
204
+ let case_insensitive: bool = pattern_hash.get("case_insensitive")
205
+ .and_then(|v: Value| TryConvert::try_convert(v).ok())
206
+ .unwrap_or(false);
207
+
208
+ let multiline: bool = pattern_hash.get("multiline")
209
+ .and_then(|v: Value| TryConvert::try_convert(v).ok())
210
+ .unwrap_or(false);
211
+
212
+ let extended: bool = pattern_hash.get("extended")
213
+ .and_then(|v: Value| TryConvert::try_convert(v).ok())
214
+ .unwrap_or(false);
215
+
216
+ guards.add_pattern_with_flags(&source, case_insensitive, multiline, extended)
217
+ .map_err(|e| Error::new(ruby.exception_arg_error(), e))?;
218
+ }
219
+ }
220
+
221
+ // Optional frequency threshold
222
+ let frequency_threshold: f64 = config.get("frequency_threshold")
223
+ .and_then(|v: Value| TryConvert::try_convert(v).ok())
224
+ .unwrap_or(10.0);
225
+
226
+ // Validate frequency threshold
227
+ if !frequency_threshold.is_finite() {
228
+ return Err(Error::new(ruby.exception_arg_error(), "frequency_threshold must be finite (not NaN or Infinity)"));
229
+ }
230
+
231
+ if frequency_threshold < 0.0 {
232
+ return Err(Error::new(ruby.exception_arg_error(), format!("frequency_threshold must be non-negative, got: {}", frequency_threshold)));
233
+ }
234
+
235
+ let loaded_at = SystemTime::now()
236
+ .duration_since(UNIX_EPOCH)
237
+ .ok()
238
+ .map(|d| d.as_secs());
239
+
240
+ let mut state = self.state.write().unwrap();
241
+ state.symspell = Some(symspell);
242
+ state.guards = guards;
243
+ state.frequency_threshold = frequency_threshold;
244
+ state.loaded = true;
245
+ state.loaded_at = loaded_at;
246
+ state.dictionary_size = dictionary_size;
247
+ state.edit_distance = edit_dist;
248
+ state.skipped_malformed = skipped_malformed;
249
+ state.skipped_multiword = skipped_multiword;
250
+ state.skipped_invalid_freq = skipped_invalid_freq;
251
+ state.skipped_duplicates = skipped_duplicates;
252
+
253
+ Ok(())
254
+ }
255
+
256
+ fn suggestions(&self, word: String, max: Option<usize>) -> Result<RArray, Error> {
257
+ let ruby = Ruby::get().unwrap();
258
+ let max_suggestions = max.unwrap_or(5);
259
+ let state = self.state.read().unwrap();
260
+
261
+ if !state.loaded {
262
+ return Err(Error::new(ruby.exception_runtime_error(), "Dictionary not loaded. Call load! first"));
263
+ }
264
+
265
+ if let Some(ref symspell) = state.symspell {
266
+ let suggestions = symspell.suggestions(&word, max_suggestions);
267
+ let result = RArray::new();
268
+
269
+ for suggestion in suggestions {
270
+ let hash = RHash::new();
271
+ hash.aset("term", suggestion.term)?;
272
+ hash.aset("distance", suggestion.distance)?;
273
+ hash.aset("freq", suggestion.frequency)?;
274
+ result.push(hash)?;
275
+ }
276
+
277
+ Ok(result)
278
+ } else {
279
+ Err(Error::new(ruby.exception_runtime_error(), "SymSpell not initialized"))
280
+ }
281
+ }
282
+
283
+ fn correct(&self, word: String) -> Result<bool, Error> {
284
+ let ruby = Ruby::get().unwrap();
285
+ let state = self.state.read().unwrap();
286
+
287
+ if !state.loaded {
288
+ return Err(Error::new(ruby.exception_runtime_error(), "Dictionary not loaded. Call load! first"));
289
+ }
290
+
291
+ if let Some(ref symspell) = state.symspell {
292
+ Ok(symspell.contains(&word))
293
+ } else {
294
+ Err(Error::new(ruby.exception_runtime_error(), "SymSpell not initialized"))
295
+ }
296
+ }
297
+
298
+ fn correct_if_unknown(&self, word: String) -> Result<String, Error> {
299
+ let ruby = Ruby::get().unwrap();
300
+ let state = self.state.read().unwrap();
301
+
302
+ if !state.loaded {
303
+ return Err(Error::new(ruby.exception_runtime_error(), "Dictionary not loaded. Call load! first"));
304
+ }
305
+
306
+ if let Some(ref symspell) = state.symspell {
307
+ Ok(correct_word(&state, symspell, &word))
308
+ } else {
309
+ Err(Error::new(ruby.exception_runtime_error(), "SymSpell not initialized"))
310
+ }
311
+ }
312
+
313
+ fn correct_tokens(&self, tokens: RArray) -> Result<RArray, Error> {
314
+ // Optimize batch correction by acquiring lock once for all tokens
315
+ // instead of calling correct_if_unknown per token (which re-locks each time)
316
+ let ruby = Ruby::get().unwrap();
317
+ let state = self.state.read().unwrap();
318
+
319
+ if !state.loaded {
320
+ return Err(Error::new(ruby.exception_runtime_error(), "Dictionary not loaded. Call load! first"));
321
+ }
322
+
323
+ let result = RArray::new();
324
+
325
+ if let Some(ref symspell) = state.symspell {
326
+ for token in tokens.into_iter() {
327
+ let word: String = TryConvert::try_convert(token)?;
328
+ let corrected = correct_word(&state, symspell, &word);
329
+ result.push(corrected)?;
330
+ }
331
+
332
+ Ok(result)
333
+ } else {
334
+ Err(Error::new(ruby.exception_runtime_error(), "SymSpell not initialized"))
335
+ }
336
+ }
337
+
338
+ fn stats(&self) -> Result<RHash, Error> {
339
+ let state = self.state.read().unwrap();
340
+ let stats = RHash::new();
341
+
342
+ if !state.loaded {
343
+ stats.aset("loaded", false)?;
344
+ return Ok(stats);
345
+ }
346
+
347
+ stats.aset("loaded", true)?;
348
+ stats.aset("dictionary_size", state.dictionary_size)?;
349
+ stats.aset("edit_distance", state.edit_distance)?;
350
+ stats.aset("skipped_malformed", state.skipped_malformed)?;
351
+ stats.aset("skipped_multiword", state.skipped_multiword)?;
352
+ stats.aset("skipped_invalid_freq", state.skipped_invalid_freq)?;
353
+ stats.aset("skipped_duplicates", state.skipped_duplicates)?;
354
+
355
+ if let Some(loaded_at) = state.loaded_at {
356
+ stats.aset("loaded_at", loaded_at)?;
357
+ }
358
+
359
+ Ok(stats)
360
+ }
361
+
362
+ fn healthcheck(&self) -> Result<(), Error> {
363
+ let ruby = Ruby::get().unwrap();
364
+ let state = self.state.read().unwrap();
365
+
366
+ if !state.loaded {
367
+ return Err(Error::new(ruby.exception_runtime_error(), "Dictionary not loaded"));
368
+ }
369
+
370
+ if state.symspell.is_none() {
371
+ return Err(Error::new(ruby.exception_runtime_error(), "SymSpell not initialized"));
372
+ }
373
+
374
+ Ok(())
375
+ }
376
+ }
377
+
378
+ #[magnus::init]
379
+ fn init(_ruby: &Ruby) -> Result<(), Error> {
380
+ let module = define_module("SpellKit")?;
381
+ let checker_class = module.define_class("Checker", class::object())?;
382
+
383
+ checker_class.define_singleton_method("new", function!(Checker::new, 0))?;
384
+ checker_class.define_method("load!", method!(Checker::load_full, 1))?;
385
+ checker_class.define_method("suggestions", method!(Checker::suggestions, 2))?;
386
+ checker_class.define_method("correct?", method!(Checker::correct, 1))?;
387
+ checker_class.define_method("correct", method!(Checker::correct_if_unknown, 1))?;
388
+ checker_class.define_method("correct_tokens", method!(Checker::correct_tokens, 1))?;
389
+ checker_class.define_method("stats", method!(Checker::stats, 0))?;
390
+ checker_class.define_method("healthcheck", method!(Checker::healthcheck, 0))?;
391
+
392
+ Ok(())
393
+ }