spellkit 0.1.0.pre.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8b0bb02947ee896cb9fab3cef53771c6a80e4e14123e518ffacb360b2c136b4d
4
- data.tar.gz: 2a5af3e67e414f37fb6ff0a3c3a56a1d771e1480f29380b6196000f9f27e7071
3
+ metadata.gz: 6e690fa50208d003679afff3117b6f00664e6938e7feb992eff6a6544fad279e
4
+ data.tar.gz: 17d41414cbd48e093913cfa8765aac063f1cb17283df13414ad801fdf0aa79ae
5
5
  SHA512:
6
- metadata.gz: 1cf8d17fbdbcea925c32414e0d746ab4533604941b1603b127667f17f224b5c027f1aeb40b31f712c02a4bc584a1694ca75ef2a717563fe98d5d85d505a427da
7
- data.tar.gz: 379eb6fa669ea2f13906cb121a69877414c1338f2b68509f9be55d79c29946eaf293136bbee7a766c75841db2d7fc6527cecbb01550a0ebd4cbfb97bcaf13f7c
6
+ metadata.gz: '0989bcacde87e9405c99f8674c681be65844a381cac11a00f8f05dd2f1a54312ce9d1799ca6317688255ee5fad454f4670845163b87e6a9541a981fff8685b35'
7
+ data.tar.gz: f12c69803c39ee74083a2090c85aac7bdfe7c39ac81995a95c09ad8c1e70df35cfb8d332de287413accf99aa4b6960e9e0c5f77ccd39b37d4ae092b06d8ea2d4
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # SpellKit
1
+ <img src="/docs/assets/spellkit-wide.png" alt="spellkit" height="160px">
2
2
 
3
3
  Fast, safe typo correction for search-term extraction, wrapping the SymSpell algorithm in Rust via Magnus.
4
4
 
@@ -9,6 +9,26 @@ SpellKit provides:
9
9
  - **Sub-millisecond latency** - p95 < 2µs on small dictionaries
10
10
  - **Thread-safe** - built with Rust's Arc<RwLock> for safe concurrent access
11
11
 
12
+ ## Why SpellKit?
13
+
14
+ ### No Runtime Dependencies
15
+ SpellKit is a Ruby gem with a native Rust extension. Just `gem install spellkit` and you're done. No need to install Aspell, Hunspell, or other system packages. This makes deployment simpler and more reliable across different environments.
16
+
17
+ ### Fast Performance
18
+ Built on the SymSpell algorithm with Rust, SpellKit delivers:
19
+ - **350,000+ operations/second** for spell checking
20
+ - **3.7x faster** than Aspell for correctness checks
21
+ - **40x faster** than Aspell for generating suggestions
22
+ - **p99 latency < 25µs** even under load
23
+
24
+ See the [Benchmarks](#benchmarks) section for detailed comparisons.
25
+
26
+ ### Production Ready
27
+ - Thread-safe concurrent access
28
+ - Hot reload dictionaries without restarts
29
+ - Instance-based API for multi-domain support
30
+ - Comprehensive error handling
31
+
12
32
  ## Installation
13
33
 
14
34
  Add to your Gemfile:
@@ -42,13 +62,17 @@ end
42
62
  # Or load from local file
43
63
  # SpellKit.load!(dictionary: "path/to/dictionary.tsv")
44
64
 
65
+ # Check if a word is spelled correctly
66
+ puts SpellKit.correct?("hello")
67
+ # => true
68
+
45
69
  # Get suggestions for a misspelled word
46
- suggestions = SpellKit.suggest("helo", 5)
70
+ suggestions = SpellKit.suggestions("helllo", 5)
47
71
  puts suggestions.inspect
48
72
  # => [{"term"=>"hello", "distance"=>1, "freq"=>...}]
49
73
 
50
74
  # Correct a typo
51
- corrected = SpellKit.correct_if_unknown("helo")
75
+ corrected = SpellKit.correct("helllo")
52
76
  puts corrected
53
77
  # => "hello"
54
78
 
@@ -76,16 +100,20 @@ SpellKit.load!(dictionary: "https://example.com/dict.tsv")
76
100
  # Or from local file
77
101
  SpellKit.load!(dictionary: "models/dictionary.tsv", edit_distance: 1)
78
102
 
103
+ # Check if a word is correct
104
+ SpellKit.correct?("hello")
105
+ # => true
106
+
79
107
  # Get suggestions
80
- SpellKit.suggest("lyssis", 5)
108
+ SpellKit.suggestions("lyssis", 5)
81
109
  # => [{"term"=>"lysis", "distance"=>1, "freq"=>2000}, ...]
82
110
 
83
111
  # Correct a typo
84
- SpellKit.correct_if_unknown("helo")
112
+ SpellKit.correct("helllo")
85
113
  # => "hello"
86
114
 
87
115
  # Batch correction
88
- tokens = %w[helo wrld ruby]
116
+ tokens = %w[helllo wrld ruby]
89
117
  SpellKit.correct_tokens(tokens)
90
118
  # => ["hello", "world", "ruby"]
91
119
  ```
@@ -118,13 +146,13 @@ SpellKit.load!(
118
146
  protected_patterns: [/^[A-Z]{3,4}\d+$/]
119
147
  )
120
148
 
121
- # Use guard: :domain to enable protection
122
- SpellKit.correct_if_unknown("CDK10", guard: :domain)
149
+ # Protected terms are automatically respected
150
+ SpellKit.correct("CDK10")
123
151
  # => "CDK10" # protected, never changed
124
152
 
125
- # Batch correction with guards
126
- tokens = %w[helo wrld ABC-123 for CDK10]
127
- SpellKit.correct_tokens(tokens, guard: :domain)
153
+ # Batch correction with protection
154
+ tokens = %w[helllo wrld ABC-123 for CDK10]
155
+ SpellKit.correct_tokens(tokens)
128
156
  # => ["hello", "world", "ABC-123", "for", "CDK10"]
129
157
  ```
130
158
 
@@ -147,8 +175,8 @@ legal_checker.load!(
147
175
  )
148
176
 
149
177
  # Use them independently
150
- medical_checker.suggest("lyssis", 5)
151
- legal_checker.suggest("contractt", 5)
178
+ medical_checker.suggestions("lyssis", 5)
179
+ legal_checker.suggestions("contractt", 5)
152
180
 
153
181
  # Each maintains its own state
154
182
  medical_checker.stats # Shows medical dictionary stats
@@ -169,7 +197,7 @@ SpellKit.configure do |config|
169
197
  end
170
198
 
171
199
  # This becomes the default instance
172
- SpellKit.suggest("word", 5) # Uses configured dictionary
200
+ SpellKit.suggestions("word", 5) # Uses configured dictionary
173
201
  ```
174
202
 
175
203
  ## Dictionary Format
@@ -238,10 +266,96 @@ SpellKit.load!(
238
266
  protected_path: "models/protected.txt", # optional
239
267
  protected_patterns: [/^[A-Z]{3,4}\d+$/], # optional
240
268
  edit_distance: 1, # 1 (default) or 2
241
- frequency_threshold: 10.0 # default: 10.0
269
+ frequency_threshold: 10.0, # default: 10.0 (minimum frequency for corrections)
270
+
271
+ # Skip pattern filters (all default to false)
272
+ skip_urls: true, # Skip URLs (http://, https://, www.)
273
+ skip_emails: true, # Skip email addresses
274
+ skip_hostnames: true, # Skip hostnames (example.com)
275
+ skip_code_patterns: true, # Skip code identifiers (camelCase, snake_case, etc.)
276
+ skip_numbers: true # Skip numeric patterns (versions, IDs, measurements)
242
277
  )
243
278
  ```
244
279
 
280
+ ### Frequency Threshold
281
+
282
+ The `frequency_threshold` parameter controls which corrections are accepted by `correct` and `correct_tokens`:
283
+
284
+ - **For misspelled words** (not in dictionary): Only suggest corrections with frequency ≥ `frequency_threshold`
285
+ - **For dictionary words**: Only suggest alternatives with frequency ≥ `frequency_threshold × original_frequency`
286
+
287
+ This prevents suggesting rare words as corrections for common typos.
288
+
289
+ **Example:**
290
+ ```ruby
291
+ # With default threshold (10.0), suggest any correction with freq ≥ 10
292
+ SpellKit.load!(dictionary: "dict.tsv")
293
+ SpellKit.correct("helllo") # => "hello" (if freq ≥ 10)
294
+
295
+ # With high threshold (1000.0), only suggest common corrections
296
+ SpellKit.load!(dictionary: "dict.tsv", frequency_threshold: 1000.0)
297
+ SpellKit.correct("helllo") # => "hello" (if freq ≥ 1000)
298
+ SpellKit.correct("rarword") # => "rarword" (no correction if freq < 1000)
299
+ ```
300
+
301
+ ### Skip Patterns
302
+
303
+ SpellKit can automatically skip certain patterns to avoid "correcting" technical terms, URLs, and other special content. Inspired by Aspell's filter modes, these patterns are automatically applied when configured.
304
+
305
+ **Available skip patterns:**
306
+
307
+ ```ruby
308
+ SpellKit.load!(
309
+ dictionary: "dict.tsv",
310
+ skip_urls: true, # Skip URLs: https://example.com, www.example.com
311
+ skip_emails: true, # Skip emails: user@domain.com, admin+tag@example.com
312
+ skip_hostnames: true, # Skip hostnames: example.com, api.example.com
313
+ skip_code_patterns: true, # Skip code: camelCase, snake_case, PascalCase, dotted.paths
314
+ skip_numbers: true # Skip numbers: 1.2.3, #123, 5kg, 100mb
315
+ )
316
+
317
+ # With skip patterns enabled, technical content is preserved
318
+ SpellKit.correct("https://example.com") # => "https://example.com"
319
+ SpellKit.correct("user@test.com") # => "user@test.com"
320
+ SpellKit.correct("getElementById") # => "getElementById"
321
+ SpellKit.correct("version-1.2.3") # => "version-1.2.3"
322
+
323
+ # Regular typos are still corrected
324
+ SpellKit.correct("helllo") # => "hello"
325
+ ```
326
+
327
+ **What each skip pattern matches:**
328
+
329
+ - **`skip_urls`**: `http://`, `https://`, `www.` URLs
330
+ - **`skip_emails`**: Email addresses with standard formats including `+` and `.` in usernames
331
+ - **`skip_hostnames`**: Domain names like `example.com`, `api.example.co.uk`
332
+ - **`skip_code_patterns`**:
333
+ - `camelCase` (starts lowercase)
334
+ - `PascalCase` (starts uppercase, mixed case)
335
+ - `snake_case` and `SCREAMING_SNAKE_CASE`
336
+ - `dotted.paths` like `Array.map` or `config.yml`
337
+ - **`skip_numbers`**:
338
+ - Version numbers: `1.0`, `2.5.3`, `10.15.7.1`
339
+ - Hash/IDs: `#123`, `#4567`
340
+ - Measurements: `5kg`, `2.5m`, `100mb`, `16px`
341
+ - Words starting with digits: `5test`, `123abc`
342
+
343
+ **Combining with protected_patterns:**
344
+
345
+ Skip patterns work alongside your custom `protected_patterns`:
346
+
347
+ ```ruby
348
+ SpellKit.load!(
349
+ dictionary: "dict.tsv",
350
+ skip_urls: true, # Built-in URL skipping
351
+ protected_patterns: [/^CUSTOM-\d+$/] # Your custom patterns
352
+ )
353
+
354
+ # Both work together automatically
355
+ SpellKit.correct("https://example.com") # => "https://example.com" (skip_urls)
356
+ SpellKit.correct("CUSTOM-123") # => "CUSTOM-123" (custom pattern)
357
+ ```
358
+
245
359
  ## API Reference
246
360
 
247
361
  ### `SpellKit.load!(**options)`
@@ -254,12 +368,24 @@ Load or reload dictionaries. Thread-safe atomic swap. Accepts URLs (auto-downloa
254
368
  - `protected_patterns:` (optional) - Array of Regexp or String patterns to protect
255
369
  - `edit_distance:` (default: 1) - Maximum edit distance (1 or 2)
256
370
  - `frequency_threshold:` (default: 10.0) - Minimum frequency ratio for corrections
371
+ - `skip_urls:` (default: false) - Skip URLs (http://, https://, www.)
372
+ - `skip_emails:` (default: false) - Skip email addresses
373
+ - `skip_hostnames:` (default: false) - Skip hostnames (example.com)
374
+ - `skip_code_patterns:` (default: false) - Skip code identifiers (camelCase, snake_case, etc.)
375
+ - `skip_numbers:` (default: false) - Skip numeric patterns (versions, IDs, measurements)
257
376
 
258
377
  **Examples:**
259
378
  ```ruby
260
379
  # From URL (recommended for getting started)
261
380
  SpellKit.load!(dictionary: SpellKit::DEFAULT_DICTIONARY_URL)
262
381
 
382
+ # With skip patterns for technical content
383
+ SpellKit.load!(
384
+ dictionary: SpellKit::DEFAULT_DICTIONARY_URL,
385
+ skip_urls: true,
386
+ skip_code_patterns: true
387
+ )
388
+
263
389
  # From custom URL
264
390
  SpellKit.load!(dictionary: "https://example.com/dict.tsv")
265
391
 
@@ -267,7 +393,24 @@ SpellKit.load!(dictionary: "https://example.com/dict.tsv")
267
393
  SpellKit.load!(dictionary: "/path/to/dictionary.tsv")
268
394
  ```
269
395
 
270
- ### `SpellKit.suggest(word, max = 5)`
396
+ ### `SpellKit.correct?(word)`
397
+
398
+ Check if a word is spelled correctly (exact dictionary match).
399
+
400
+ **Parameters:**
401
+ - `word` (required) - The word to check
402
+
403
+ **Returns:** Boolean - true if word exists in dictionary, false otherwise
404
+
405
+ **Performance:** Very fast O(1) HashMap lookup. Use this instead of `suggestions()` when you only need to check correctness.
406
+
407
+ **Example:**
408
+ ```ruby
409
+ SpellKit.correct?("hello") # => true
410
+ SpellKit.correct?("helllo") # => false
411
+ ```
412
+
413
+ ### `SpellKit.suggestions(word, max = 5)`
271
414
 
272
415
  Get ranked suggestions for a word.
273
416
 
@@ -277,16 +420,38 @@ Get ranked suggestions for a word.
277
420
 
278
421
  **Returns:** Array of hashes with `"term"`, `"distance"`, and `"freq"` keys
279
422
 
280
- ### `SpellKit.correct_if_unknown(word, guard:)`
423
+ **Example:**
424
+ ```ruby
425
+ SpellKit.suggestions("helllo", 5)
426
+ # => [{"term"=>"hello", "distance"=>1, "freq"=>10000}, ...]
427
+ ```
428
+
429
+ ### `SpellKit.correct(word)`
281
430
 
282
- Return corrected word or original if no better match found.
431
+ Return corrected word or original if no better match found. Respects `frequency_threshold` configuration. Protected terms and skip patterns are automatically applied when configured.
283
432
 
284
- **Options:**
285
- - `guard:` - Set to `:domain` to enable protection checks
433
+ **Parameters:**
434
+ - `word` (required) - The word to correct
435
+
436
+ **Behavior:**
437
+ - Returns original word if it exists in dictionary
438
+ - For misspellings, only accepts corrections with frequency ≥ `frequency_threshold`
439
+ - Returns original word if no corrections pass the threshold
440
+ - Automatically respects protected terms and skip patterns configured in `load!`
441
+
442
+ **Example:**
443
+ ```ruby
444
+ SpellKit.correct("helllo") # => "hello"
445
+ SpellKit.correct("hello") # => "hello" (already correct)
446
+ SpellKit.correct("CDK10") # => "CDK10" (protected if configured)
447
+ ```
286
448
 
287
- ### `SpellKit.correct_tokens(tokens, guard:)`
449
+ ### `SpellKit.correct_tokens(tokens)`
288
450
 
289
- Batch correction of an array of tokens.
451
+ Batch correction of an array of tokens. Respects `frequency_threshold` configuration. Protected terms and skip patterns are automatically applied when configured.
452
+
453
+ **Parameters:**
454
+ - `tokens` (required) - Array of words to correct
290
455
 
291
456
  **Returns:** Array of corrected strings
292
457
 
@@ -306,7 +471,7 @@ Verify system is properly loaded. Raises error if not.
306
471
 
307
472
  ## Term Protection
308
473
 
309
- The `guard: :domain` option enables protection for specific terms:
474
+ When configured, SpellKit automatically protects specific terms from correction:
310
475
 
311
476
  ### Exact Matches
312
477
  Terms in `protected_path` file are never corrected, even if similar dictionary words exist. Matching is case-insensitive, but original casing is preserved in output.
@@ -370,24 +535,85 @@ end
370
535
  class SearchPreprocessor
371
536
  def self.correct_query(text)
372
537
  tokens = text.downcase.split(/\s+/)
373
- SpellKit.correct_tokens(tokens, guard: :domain).join(" ")
538
+ SpellKit.correct_tokens(tokens).join(" ")
374
539
  end
375
540
  end
376
541
  ```
377
542
 
378
543
  ## Performance
379
544
 
380
- Benchmarked on M1 MacBook Pro with 20-term test dictionary:
545
+ ### SpellKit Standalone (M1 MacBook Pro, Ruby 3.3.0, 80k dictionary)
546
+
547
+ **Single Word Suggestions:**
548
+ - 3,345 i/s (298.96 μs/i) with max: 1 suggestion
549
+ - 3,198 i/s (312.73 μs/i) with max: 5 suggestions
550
+ - 3,073 i/s (325.45 μs/i) with max: 10 suggestions
551
+
552
+ **Correction Performance:**
553
+ - `correct`: 1,858 i/s (538.17 μs/i)
554
+ - `correct_tokens` (batch): 2,005 i/s (498.76 μs/i)
555
+
556
+ **Protection Performance:**
557
+ - Without protection: 2,926 i/s (341.79 μs/i)
558
+ - With protection: 9,337 i/s (107.10 μs/i) - **3.19x faster!**
559
+ *(Protection checks short-circuit expensive dictionary lookups)*
560
+
561
+ **Latency Distribution (10,000 iterations):**
562
+ - p50: 61μs
563
+ - p95: 66μs
564
+ - p99: 105μs
565
+ - max: 298μs
566
+
567
+ **Raw Throughput:** 16,192 ops/sec
568
+
569
+ ### Key Takeaways
570
+ 1. **Consistent Performance**: p95 latency of 66μs with 80k dictionary, p99 at 105μs
571
+ 2. **Guards are Fast**: Protected term checks improve performance by 3.2x by avoiding dictionary lookups
572
+ 3. **High Throughput**: Over 16k operations per second with 80k word dictionary
573
+ 4. **Scales Well**: Minimal performance difference between 1 vs 10 suggestions
574
+
575
+ ## Benchmarks
576
+
577
+ SpellKit includes comprehensive benchmarks to measure performance and compare with other spell checkers.
578
+
579
+ ### Running Benchmarks
580
+
581
+ **Performance Benchmark** - Comprehensive SpellKit performance analysis:
582
+ ```bash
583
+ bundle exec ruby benchmark/performance.rb
584
+ ```
585
+
586
+ Measures:
587
+ - Single word suggestions with varying result limits
588
+ - Correction performance on mixed datasets
589
+ - Batch correction throughput
590
+ - Guard/protection overhead
591
+ - Latency distribution (p50, p95, p99)
592
+ - Raw throughput (ops/sec)
593
+
594
+ **Aspell Comparison** - Direct comparison with Aspell:
595
+ ```bash
596
+ # First install Aspell if needed:
597
+ # macOS: brew install aspell
598
+ # Ubuntu: sudo apt-get install aspell libaspell-dev
599
+
600
+ bundle exec ruby benchmark/comparison_aspell.rb
601
+ ```
602
+
603
+ Compares SpellKit with Aspell on:
604
+ - Single word correction performance
605
+ - Spell checking (correctness tests)
606
+ - Latency distribution at scale
607
+
608
+ See [benchmark/README.md](benchmark/README.md) for detailed results and analysis.
609
+
610
+ ### Why These Benchmarks?
381
611
 
382
- - **Load time**: < 100ms
383
- - **Suggestion latency**: p50 < 2µs, p95 < 2µs
384
- - **Guard checks**: p95 < 1µs
385
- - **Memory**: ~150MB for 1M term dictionary (estimated)
612
+ **SpellKit vs Aspell**: Both provide fuzzy matching and suggestions for misspelled words, but use different algorithms:
613
+ - **SpellKit (SymSpell)**: O(1) lookup complexity, optimized for speed with large dictionaries
614
+ - **Aspell**: Statistical scoring with phonetic similarity, good for natural language
386
615
 
387
- Target for production (1-5M terms):
388
- - Load: < 500ms
389
- - p50: < 30µs, p95: < 100µs
390
- - Memory: 50-150MB
616
+ The comparison shows SpellKit's performance advantage while solving the same problem.
391
617
 
392
618
  ## Building Dictionaries
393
619
 
@@ -113,12 +113,6 @@ dependencies = [
113
113
  "either",
114
114
  ]
115
115
 
116
- [[package]]
117
- name = "itoa"
118
- version = "1.0.15"
119
- source = "registry+https://github.com/rust-lang/crates.io-index"
120
- checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
121
-
122
116
  [[package]]
123
117
  name = "lazy_static"
124
118
  version = "1.5.0"
@@ -275,61 +269,12 @@ version = "1.1.0"
275
269
  source = "registry+https://github.com/rust-lang/crates.io-index"
276
270
  checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
277
271
 
278
- [[package]]
279
- name = "ryu"
280
- version = "1.0.20"
281
- source = "registry+https://github.com/rust-lang/crates.io-index"
282
- checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
283
-
284
272
  [[package]]
285
273
  name = "seq-macro"
286
274
  version = "0.3.6"
287
275
  source = "registry+https://github.com/rust-lang/crates.io-index"
288
276
  checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"
289
277
 
290
- [[package]]
291
- name = "serde"
292
- version = "1.0.227"
293
- source = "registry+https://github.com/rust-lang/crates.io-index"
294
- checksum = "80ece43fc6fbed4eb5392ab50c07334d3e577cbf40997ee896fe7af40bba4245"
295
- dependencies = [
296
- "serde_core",
297
- "serde_derive",
298
- ]
299
-
300
- [[package]]
301
- name = "serde_core"
302
- version = "1.0.227"
303
- source = "registry+https://github.com/rust-lang/crates.io-index"
304
- checksum = "7a576275b607a2c86ea29e410193df32bc680303c82f31e275bbfcafe8b33be5"
305
- dependencies = [
306
- "serde_derive",
307
- ]
308
-
309
- [[package]]
310
- name = "serde_derive"
311
- version = "1.0.227"
312
- source = "registry+https://github.com/rust-lang/crates.io-index"
313
- checksum = "51e694923b8824cf0e9b382adf0f60d4e05f348f357b38833a3fa5ed7c2ede04"
314
- dependencies = [
315
- "proc-macro2",
316
- "quote",
317
- "syn",
318
- ]
319
-
320
- [[package]]
321
- name = "serde_json"
322
- version = "1.0.145"
323
- source = "registry+https://github.com/rust-lang/crates.io-index"
324
- checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
325
- dependencies = [
326
- "itoa",
327
- "memchr",
328
- "ryu",
329
- "serde",
330
- "serde_core",
331
- ]
332
-
333
278
  [[package]]
334
279
  name = "shell-words"
335
280
  version = "1.1.0"
@@ -349,8 +294,6 @@ dependencies = [
349
294
  "hashbrown",
350
295
  "magnus",
351
296
  "regex",
352
- "serde",
353
- "serde_json",
354
297
  "unicode-normalization",
355
298
  ]
356
299
 
@@ -12,8 +12,6 @@ crate-type = ["cdylib"]
12
12
 
13
13
  [dependencies]
14
14
  magnus = { version = "0.7", features = ["rb-sys"] }
15
- serde = { version = "1.0", features = ["derive"] }
16
- serde_json = "1.0"
17
15
  hashbrown = "0.15"
18
16
  unicode-normalization = "0.1"
19
17
  regex = "1.11"
@@ -1,5 +1,6 @@
1
1
  use hashbrown::HashSet;
2
- use regex::Regex;
2
+ use regex::{Regex, RegexBuilder};
3
+ use crate::symspell::SymSpell;
3
4
 
4
5
  #[derive(Debug, Clone)]
5
6
  pub struct Guards {
@@ -19,14 +20,31 @@ impl Guards {
19
20
  for line in content.lines() {
20
21
  let trimmed = line.trim();
21
22
  if !trimmed.is_empty() && !trimmed.starts_with('#') {
23
+ // Store literal form
22
24
  self.protected_set.insert(trimmed.to_string());
25
+ // Store lowercase form
23
26
  self.protected_set.insert(trimmed.to_lowercase());
27
+ // Store normalized form (strips whitespace, converts to lowercase)
28
+ // This ensures variants like "newyork" are protected if "New York" is in the list
29
+ let normalized = SymSpell::normalize_word(trimmed);
30
+ self.protected_set.insert(normalized);
24
31
  }
25
32
  }
26
33
  }
27
34
 
28
- pub fn add_pattern(&mut self, pattern: &str) -> Result<(), String> {
29
- match Regex::new(pattern) {
35
+ pub fn add_pattern_with_flags(
36
+ &mut self,
37
+ pattern: &str,
38
+ case_insensitive: bool,
39
+ multiline: bool,
40
+ extended: bool,
41
+ ) -> Result<(), String> {
42
+ match RegexBuilder::new(pattern)
43
+ .case_insensitive(case_insensitive)
44
+ .multi_line(multiline)
45
+ .ignore_whitespace(extended)
46
+ .build()
47
+ {
30
48
  Ok(regex) => {
31
49
  self.protected_patterns.push(regex);
32
50
  Ok(())