spellkit 0.1.0.pre.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8b0bb02947ee896cb9fab3cef53771c6a80e4e14123e518ffacb360b2c136b4d
4
- data.tar.gz: 2a5af3e67e414f37fb6ff0a3c3a56a1d771e1480f29380b6196000f9f27e7071
3
+ metadata.gz: df48970adccf2bea2fae1a69f0e3704c0276bac9ab908a1bc5af6835edde0008
4
+ data.tar.gz: 28a8f37086361c742f902fa33bfd39003a1fff758b7ee306de1069344cf114d0
5
5
  SHA512:
6
- metadata.gz: 1cf8d17fbdbcea925c32414e0d746ab4533604941b1603b127667f17f224b5c027f1aeb40b31f712c02a4bc584a1694ca75ef2a717563fe98d5d85d505a427da
7
- data.tar.gz: 379eb6fa669ea2f13906cb121a69877414c1338f2b68509f9be55d79c29946eaf293136bbee7a766c75841db2d7fc6527cecbb01550a0ebd4cbfb97bcaf13f7c
6
+ metadata.gz: fcc1a0678ff2714a8844657f883354594d7bfb5be6e76a2708aa6134fa5ef9ef0a407a4be06a5f727e540ab99e6d21bc086a0bc740d3bd6cc2a5ed7c6d9458b0
7
+ data.tar.gz: 043b4f1aa59dd2ced9219f11736053d577d8afa7bdc83cb746ab91c4dc4bb1d5fd28c05aee79c09e739a3b1dd89c49ee088b2c189af81148db09ac40e0de1f06
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # SpellKit
1
+ <img src="/docs/assets/spellkit-wide.png" alt="spellkit" height="160px">
2
2
 
3
3
  Fast, safe typo correction for search-term extraction, wrapping the SymSpell algorithm in Rust via Magnus.
4
4
 
@@ -9,6 +9,26 @@ SpellKit provides:
9
9
  - **Sub-millisecond latency** - p95 < 2µs on small dictionaries
10
10
  - **Thread-safe** - built with Rust's Arc<RwLock> for safe concurrent access
11
11
 
12
+ ## Why SpellKit?
13
+
14
+ ### No Runtime Dependencies
15
+ SpellKit is a Ruby gem with a bundled Rust extension. Just `gem install spellkit` and you're done. No need to install Aspell, Hunspell, or other system packages. This makes deployment simpler and more reliable across different environments.
16
+
17
+ ### Fast Performance
18
+ Built on the SymSpell algorithm with Rust, SpellKit delivers:
19
+ - **350,000+ operations/second** for spell checking
20
+ - **3.7x faster** than Aspell for correctness checks
21
+ - **40x faster** than Aspell for generating suggestions
22
+ - **p99 latency < 25µs** even under load
23
+
24
+ See the [Benchmarks](#benchmarks) section for detailed comparisons.
25
+
26
+ ### Production Ready
27
+ - Thread-safe concurrent access
28
+ - Hot reload dictionaries without restarts
29
+ - Instance-based API for multi-domain support
30
+ - Comprehensive error handling
31
+
12
32
  ## Installation
13
33
 
14
34
  Add to your Gemfile:
@@ -42,13 +62,17 @@ end
42
62
  # Or load from local file
43
63
  # SpellKit.load!(dictionary: "path/to/dictionary.tsv")
44
64
 
65
+ # Check if a word is spelled correctly
66
+ puts SpellKit.correct?("hello")
67
+ # => true
68
+
45
69
  # Get suggestions for a misspelled word
46
- suggestions = SpellKit.suggest("helo", 5)
70
+ suggestions = SpellKit.suggestions("helllo", 5)
47
71
  puts suggestions.inspect
48
72
  # => [{"term"=>"hello", "distance"=>1, "freq"=>...}]
49
73
 
50
74
  # Correct a typo
51
- corrected = SpellKit.correct_if_unknown("helo")
75
+ corrected = SpellKit.correct("helllo")
52
76
  puts corrected
53
77
  # => "hello"
54
78
 
@@ -76,16 +100,20 @@ SpellKit.load!(dictionary: "https://example.com/dict.tsv")
76
100
  # Or from local file
77
101
  SpellKit.load!(dictionary: "models/dictionary.tsv", edit_distance: 1)
78
102
 
103
+ # Check if a word is correct
104
+ SpellKit.correct?("hello")
105
+ # => true
106
+
79
107
  # Get suggestions
80
- SpellKit.suggest("lyssis", 5)
108
+ SpellKit.suggestions("lyssis", 5)
81
109
  # => [{"term"=>"lysis", "distance"=>1, "freq"=>2000}, ...]
82
110
 
83
111
  # Correct a typo
84
- SpellKit.correct_if_unknown("helo")
112
+ SpellKit.correct("helllo")
85
113
  # => "hello"
86
114
 
87
115
  # Batch correction
88
- tokens = %w[helo wrld ruby]
116
+ tokens = %w[helllo wrld ruby]
89
117
  SpellKit.correct_tokens(tokens)
90
118
  # => ["hello", "world", "ruby"]
91
119
  ```
@@ -119,11 +147,11 @@ SpellKit.load!(
119
147
  )
120
148
 
121
149
  # Use guard: :domain to enable protection
122
- SpellKit.correct_if_unknown("CDK10", guard: :domain)
150
+ SpellKit.correct("CDK10", guard: :domain)
123
151
  # => "CDK10" # protected, never changed
124
152
 
125
153
  # Batch correction with guards
126
- tokens = %w[helo wrld ABC-123 for CDK10]
154
+ tokens = %w[helllo wrld ABC-123 for CDK10]
127
155
  SpellKit.correct_tokens(tokens, guard: :domain)
128
156
  # => ["hello", "world", "ABC-123", "for", "CDK10"]
129
157
  ```
@@ -147,8 +175,8 @@ legal_checker.load!(
147
175
  )
148
176
 
149
177
  # Use them independently
150
- medical_checker.suggest("lyssis", 5)
151
- legal_checker.suggest("contractt", 5)
178
+ medical_checker.suggestions("lyssis", 5)
179
+ legal_checker.suggestions("contractt", 5)
152
180
 
153
181
  # Each maintains its own state
154
182
  medical_checker.stats # Shows medical dictionary stats
@@ -169,7 +197,7 @@ SpellKit.configure do |config|
169
197
  end
170
198
 
171
199
  # This becomes the default instance
172
- SpellKit.suggest("word", 5) # Uses configured dictionary
200
+ SpellKit.suggestions("word", 5) # Uses configured dictionary
173
201
  ```
174
202
 
175
203
  ## Dictionary Format
@@ -238,8 +266,94 @@ SpellKit.load!(
238
266
  protected_path: "models/protected.txt", # optional
239
267
  protected_patterns: [/^[A-Z]{3,4}\d+$/], # optional
240
268
  edit_distance: 1, # 1 (default) or 2
241
- frequency_threshold: 10.0 # default: 10.0
269
+ frequency_threshold: 10.0, # default: 10.0 (minimum frequency for corrections)
270
+
271
+ # Skip pattern filters (all default to false)
272
+ skip_urls: true, # Skip URLs (http://, https://, www.)
273
+ skip_emails: true, # Skip email addresses
274
+ skip_hostnames: true, # Skip hostnames (example.com)
275
+ skip_code_patterns: true, # Skip code identifiers (camelCase, snake_case, etc.)
276
+ skip_numbers: true # Skip numeric patterns (versions, IDs, measurements)
277
+ )
278
+ ```
279
+
280
+ ### Frequency Threshold
281
+
282
+ The `frequency_threshold` parameter controls which corrections are accepted by `correct` and `correct_tokens`:
283
+
284
+ - **For misspelled words** (not in dictionary): Only suggest corrections with frequency ≥ `frequency_threshold`
285
+ - **For dictionary words**: Only suggest alternatives with frequency ≥ `frequency_threshold × original_frequency`
286
+
287
+ This prevents suggesting rare words as corrections for common typos.
288
+
289
+ **Example:**
290
+ ```ruby
291
+ # With default threshold (10.0), suggest any correction with freq ≥ 10
292
+ SpellKit.load!(dictionary: "dict.tsv")
293
+ SpellKit.correct("helllo") # => "hello" (if freq ≥ 10)
294
+
295
+ # With high threshold (1000.0), only suggest common corrections
296
+ SpellKit.load!(dictionary: "dict.tsv", frequency_threshold: 1000.0)
297
+ SpellKit.correct("helllo") # => "hello" (if freq ≥ 1000)
298
+ SpellKit.correct("rarword") # => "rarword" (no correction if freq < 1000)
299
+ ```
300
+
301
+ ### Skip Patterns
302
+
303
+ SpellKit can automatically skip certain patterns to avoid "correcting" technical terms, URLs, and other special content. Inspired by Aspell's filter modes, these patterns are applied when `guard: :domain` is enabled.
304
+
305
+ **Available skip patterns:**
306
+
307
+ ```ruby
308
+ SpellKit.load!(
309
+ dictionary: "dict.tsv",
310
+ skip_urls: true, # Skip URLs: https://example.com, www.example.com
311
+ skip_emails: true, # Skip emails: user@domain.com, admin+tag@example.com
312
+ skip_hostnames: true, # Skip hostnames: example.com, api.example.com
313
+ skip_code_patterns: true, # Skip code: camelCase, snake_case, PascalCase, dotted.paths
314
+ skip_numbers: true # Skip numbers: 1.2.3, #123, 5kg, 100mb
315
+ )
316
+
317
+ # With skip patterns enabled, technical content is preserved
318
+ SpellKit.correct("https://example.com", guard: :domain) # => "https://example.com"
319
+ SpellKit.correct("user@test.com", guard: :domain) # => "user@test.com"
320
+ SpellKit.correct("getElementById", guard: :domain) # => "getElementById"
321
+ SpellKit.correct("version-1.2.3", guard: :domain) # => "version-1.2.3"
322
+
323
+ # Regular typos are still corrected
324
+ SpellKit.correct("helllo", guard: :domain) # => "hello"
325
+ ```
326
+
327
+ **What each skip pattern matches:**
328
+
329
+ - **`skip_urls`**: `http://`, `https://`, `www.` URLs
330
+ - **`skip_emails`**: Email addresses with standard formats including `+` and `.` in usernames
331
+ - **`skip_hostnames`**: Domain names like `example.com`, `api.example.co.uk`
332
+ - **`skip_code_patterns`**:
333
+ - `camelCase` (starts lowercase)
334
+ - `PascalCase` (starts uppercase, mixed case)
335
+ - `snake_case` and `SCREAMING_SNAKE_CASE`
336
+ - `dotted.paths` like `Array.map` or `config.yml`
337
+ - **`skip_numbers`**:
338
+ - Version numbers: `1.0`, `2.5.3`, `10.15.7.1`
339
+ - Hash/IDs: `#123`, `#4567`
340
+ - Measurements: `5kg`, `2.5m`, `100mb`, `16px`
341
+ - Words starting with digits: `5test`, `123abc`
342
+
343
+ **Combining with protected_patterns:**
344
+
345
+ Skip patterns work alongside your custom `protected_patterns`:
346
+
347
+ ```ruby
348
+ SpellKit.load!(
349
+ dictionary: "dict.tsv",
350
+ skip_urls: true, # Built-in URL skipping
351
+ protected_patterns: [/^CUSTOM-\d+$/] # Your custom patterns
242
352
  )
353
+
354
+ # Both work together
355
+ SpellKit.correct("https://example.com", guard: :domain) # => "https://example.com" (skip_urls)
356
+ SpellKit.correct("CUSTOM-123", guard: :domain) # => "CUSTOM-123" (custom pattern)
243
357
  ```
244
358
 
245
359
  ## API Reference
@@ -254,12 +368,24 @@ Load or reload dictionaries. Thread-safe atomic swap. Accepts URLs (auto-downloa
254
368
  - `protected_patterns:` (optional) - Array of Regexp or String patterns to protect
255
369
  - `edit_distance:` (default: 1) - Maximum edit distance (1 or 2)
256
370
  - `frequency_threshold:` (default: 10.0) - Minimum frequency a suggestion needs to be accepted as a correction
371
+ - `skip_urls:` (default: false) - Skip URLs (http://, https://, www.)
372
+ - `skip_emails:` (default: false) - Skip email addresses
373
+ - `skip_hostnames:` (default: false) - Skip hostnames (example.com)
374
+ - `skip_code_patterns:` (default: false) - Skip code identifiers (camelCase, snake_case, etc.)
375
+ - `skip_numbers:` (default: false) - Skip numeric patterns (versions, IDs, measurements)
257
376
 
258
377
  **Examples:**
259
378
  ```ruby
260
379
  # From URL (recommended for getting started)
261
380
  SpellKit.load!(dictionary: SpellKit::DEFAULT_DICTIONARY_URL)
262
381
 
382
+ # With skip patterns for technical content
383
+ SpellKit.load!(
384
+ dictionary: SpellKit::DEFAULT_DICTIONARY_URL,
385
+ skip_urls: true,
386
+ skip_code_patterns: true
387
+ )
388
+
263
389
  # From custom URL
264
390
  SpellKit.load!(dictionary: "https://example.com/dict.tsv")
265
391
 
@@ -267,7 +393,24 @@ SpellKit.load!(dictionary: "https://example.com/dict.tsv")
267
393
  SpellKit.load!(dictionary: "/path/to/dictionary.tsv")
268
394
  ```
269
395
 
270
- ### `SpellKit.suggest(word, max = 5)`
396
+ ### `SpellKit.correct?(word)`
397
+
398
+ Check if a word is spelled correctly (exact dictionary match).
399
+
400
+ **Parameters:**
401
+ - `word` (required) - The word to check
402
+
403
+ **Returns:** Boolean - true if word exists in dictionary, false otherwise
404
+
405
+ **Performance:** Very fast O(1) HashMap lookup. Use this instead of `suggestions()` when you only need to check correctness.
406
+
407
+ **Example:**
408
+ ```ruby
409
+ SpellKit.correct?("hello") # => true
410
+ SpellKit.correct?("helllo") # => false
411
+ ```
412
+
413
+ ### `SpellKit.suggestions(word, max = 5)`
271
414
 
272
415
  Get ranked suggestions for a word.
273
416
 
@@ -277,16 +420,39 @@ Get ranked suggestions for a word.
277
420
 
278
421
  **Returns:** Array of hashes with `"term"`, `"distance"`, and `"freq"` keys
279
422
 
280
- ### `SpellKit.correct_if_unknown(word, guard:)`
423
+ **Example:**
424
+ ```ruby
425
+ SpellKit.suggestions("helllo", 5)
426
+ # => [{"term"=>"hello", "distance"=>1, "freq"=>10000}, ...]
427
+ ```
281
428
 
282
- Return corrected word or original if no better match found.
429
+ ### `SpellKit.correct(word, guard:)`
283
430
 
284
- **Options:**
285
- - `guard:` - Set to `:domain` to enable protection checks
431
+ Return corrected word or original if no better match found. Respects `frequency_threshold` configuration.
432
+
433
+ **Parameters:**
434
+ - `word` (required) - The word to correct
435
+ - `guard:` (optional) - Set to `:domain` to enable protection checks
436
+
437
+ **Behavior:**
438
+ - Returns original word if it exists in dictionary
439
+ - For misspellings, only accepts corrections with frequency ≥ `frequency_threshold`
440
+ - Returns original word if no corrections pass the threshold
441
+ - When `guard: :domain` is set, protected terms and skip patterns are applied
442
+
443
+ **Example:**
444
+ ```ruby
445
+ SpellKit.correct("helllo") # => "hello"
446
+ SpellKit.correct("hello") # => "hello" (already correct)
447
+ SpellKit.correct("CDK10", guard: :domain) # => "CDK10" (protected)
448
+ ```
286
449
 
287
450
  ### `SpellKit.correct_tokens(tokens, guard:)`
288
451
 
289
- Batch correction of an array of tokens.
452
+ Batch correction of an array of tokens. Respects `frequency_threshold` configuration.
453
+
454
+ **Options:**
455
+ - `guard:` - Set to `:domain` to enable protection checks
290
456
 
291
457
  **Returns:** Array of corrected strings
292
458
 
@@ -377,17 +543,78 @@ end
377
543
 
378
544
  ## Performance
379
545
 
380
- Benchmarked on M1 MacBook Pro with 20-term test dictionary:
546
+ ### SpellKit Standalone (M1 MacBook Pro, Ruby 3.3.0, 80k dictionary)
547
+
548
+ **Single Word Suggestions:**
549
+ - 3,345 i/s (298.96 μs/i) with max: 1 suggestion
550
+ - 3,198 i/s (312.73 μs/i) with max: 5 suggestions
551
+ - 3,073 i/s (325.45 μs/i) with max: 10 suggestions
552
+
553
+ **Correction Performance:**
554
+ - `correct`: 1,858 i/s (538.17 μs/i)
555
+ - `correct_tokens` (batch): 2,005 i/s (498.76 μs/i)
556
+
557
+ **Guard Performance:**
558
+ - Without guard: 2,926 i/s (341.79 μs/i)
559
+ - With guard: 9,337 i/s (107.10 μs/i) - **3.19x faster!**
560
+ *(Guards short-circuit expensive lookups)*
561
+
562
+ **Latency Distribution (10,000 iterations):**
563
+ - p50: 61μs
564
+ - p95: 66μs
565
+ - p99: 105μs
566
+ - max: 298μs
567
+
568
+ **Raw Throughput:** 16,192 ops/sec
569
+
570
+ ### Key Takeaways
571
+ 1. **Consistent Performance**: p95 latency of 66μs with 80k dictionary, p99 at 105μs
572
+ 2. **Guards are Fast**: Protected term checks improve performance by 3.2x by avoiding dictionary lookups
573
+ 3. **High Throughput**: Over 16k operations per second with 80k word dictionary
574
+ 4. **Scales Well**: Minimal performance difference between 1 vs 10 suggestions
575
+
576
+ ## Benchmarks
577
+
578
+ SpellKit includes comprehensive benchmarks to measure performance and compare with other spell checkers.
579
+
580
+ ### Running Benchmarks
581
+
582
+ **Performance Benchmark** - Comprehensive SpellKit performance analysis:
583
+ ```bash
584
+ bundle exec ruby benchmark/performance.rb
585
+ ```
586
+
587
+ Measures:
588
+ - Single word suggestions with varying result limits
589
+ - Correction performance on mixed datasets
590
+ - Batch correction throughput
591
+ - Guard/protection overhead
592
+ - Latency distribution (p50, p95, p99)
593
+ - Raw throughput (ops/sec)
594
+
595
+ **Aspell Comparison** - Direct comparison with Aspell:
596
+ ```bash
597
+ # First install Aspell if needed:
598
+ # macOS: brew install aspell
599
+ # Ubuntu: sudo apt-get install aspell libaspell-dev
600
+
601
+ bundle exec ruby benchmark/comparison_aspell.rb
602
+ ```
603
+
604
+ Compares SpellKit with Aspell on:
605
+ - Single word correction performance
606
+ - Spell checking (correctness tests)
607
+ - Latency distribution at scale
608
+
609
+ See [benchmark/README.md](benchmark/README.md) for detailed results and analysis.
610
+
611
+ ### Why These Benchmarks?
381
612
 
382
- - **Load time**: < 100ms
383
- - **Suggestion latency**: p50 < 2µs, p95 < 2µs
384
- - **Guard checks**: p95 < 1µs
385
- - **Memory**: ~150MB for 1M term dictionary (estimated)
613
+ **SpellKit vs Aspell**: Both provide fuzzy matching and suggestions for misspelled words, but use different algorithms:
614
+ - **SpellKit (SymSpell)**: O(1) lookup complexity, optimized for speed with large dictionaries
615
+ - **Aspell**: Statistical scoring with phonetic similarity, good for natural language
386
616
 
387
- Target for production (1-5M terms):
388
- - Load: < 500ms
389
- - p50: < 30µs, p95: < 100µs
390
- - Memory: 50-150MB
617
+ The comparison shows SpellKit's performance advantage while solving the same problem.
391
618
 
392
619
  ## Building Dictionaries
393
620
 
@@ -113,12 +113,6 @@ dependencies = [
113
113
  "either",
114
114
  ]
115
115
 
116
- [[package]]
117
- name = "itoa"
118
- version = "1.0.15"
119
- source = "registry+https://github.com/rust-lang/crates.io-index"
120
- checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
121
-
122
116
  [[package]]
123
117
  name = "lazy_static"
124
118
  version = "1.5.0"
@@ -275,61 +269,12 @@ version = "1.1.0"
275
269
  source = "registry+https://github.com/rust-lang/crates.io-index"
276
270
  checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
277
271
 
278
- [[package]]
279
- name = "ryu"
280
- version = "1.0.20"
281
- source = "registry+https://github.com/rust-lang/crates.io-index"
282
- checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
283
-
284
272
  [[package]]
285
273
  name = "seq-macro"
286
274
  version = "0.3.6"
287
275
  source = "registry+https://github.com/rust-lang/crates.io-index"
288
276
  checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"
289
277
 
290
- [[package]]
291
- name = "serde"
292
- version = "1.0.227"
293
- source = "registry+https://github.com/rust-lang/crates.io-index"
294
- checksum = "80ece43fc6fbed4eb5392ab50c07334d3e577cbf40997ee896fe7af40bba4245"
295
- dependencies = [
296
- "serde_core",
297
- "serde_derive",
298
- ]
299
-
300
- [[package]]
301
- name = "serde_core"
302
- version = "1.0.227"
303
- source = "registry+https://github.com/rust-lang/crates.io-index"
304
- checksum = "7a576275b607a2c86ea29e410193df32bc680303c82f31e275bbfcafe8b33be5"
305
- dependencies = [
306
- "serde_derive",
307
- ]
308
-
309
- [[package]]
310
- name = "serde_derive"
311
- version = "1.0.227"
312
- source = "registry+https://github.com/rust-lang/crates.io-index"
313
- checksum = "51e694923b8824cf0e9b382adf0f60d4e05f348f357b38833a3fa5ed7c2ede04"
314
- dependencies = [
315
- "proc-macro2",
316
- "quote",
317
- "syn",
318
- ]
319
-
320
- [[package]]
321
- name = "serde_json"
322
- version = "1.0.145"
323
- source = "registry+https://github.com/rust-lang/crates.io-index"
324
- checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
325
- dependencies = [
326
- "itoa",
327
- "memchr",
328
- "ryu",
329
- "serde",
330
- "serde_core",
331
- ]
332
-
333
278
  [[package]]
334
279
  name = "shell-words"
335
280
  version = "1.1.0"
@@ -349,8 +294,6 @@ dependencies = [
349
294
  "hashbrown",
350
295
  "magnus",
351
296
  "regex",
352
- "serde",
353
- "serde_json",
354
297
  "unicode-normalization",
355
298
  ]
356
299
 
@@ -12,8 +12,6 @@ crate-type = ["cdylib"]
12
12
 
13
13
  [dependencies]
14
14
  magnus = { version = "0.7", features = ["rb-sys"] }
15
- serde = { version = "1.0", features = ["derive"] }
16
- serde_json = "1.0"
17
15
  hashbrown = "0.15"
18
16
  unicode-normalization = "0.1"
19
17
  regex = "1.11"
@@ -1,5 +1,6 @@
1
1
  use hashbrown::HashSet;
2
- use regex::Regex;
2
+ use regex::{Regex, RegexBuilder};
3
+ use crate::symspell::SymSpell;
3
4
 
4
5
  #[derive(Debug, Clone)]
5
6
  pub struct Guards {
@@ -19,14 +20,31 @@ impl Guards {
19
20
  for line in content.lines() {
20
21
  let trimmed = line.trim();
21
22
  if !trimmed.is_empty() && !trimmed.starts_with('#') {
23
+ // Store literal form
22
24
  self.protected_set.insert(trimmed.to_string());
25
+ // Store lowercase form
23
26
  self.protected_set.insert(trimmed.to_lowercase());
27
+ // Store normalized form (strips whitespace, converts to lowercase)
28
+ // This ensures variants like "newyork" are protected if "New York" is in the list
29
+ let normalized = SymSpell::normalize_word(trimmed);
30
+ self.protected_set.insert(normalized);
24
31
  }
25
32
  }
26
33
  }
27
34
 
28
- pub fn add_pattern(&mut self, pattern: &str) -> Result<(), String> {
29
- match Regex::new(pattern) {
35
+ pub fn add_pattern_with_flags(
36
+ &mut self,
37
+ pattern: &str,
38
+ case_insensitive: bool,
39
+ multiline: bool,
40
+ extended: bool,
41
+ ) -> Result<(), String> {
42
+ match RegexBuilder::new(pattern)
43
+ .case_insensitive(case_insensitive)
44
+ .multi_line(multiline)
45
+ .ignore_whitespace(extended)
46
+ .build()
47
+ {
30
48
  Ok(regex) => {
31
49
  self.protected_patterns.push(regex);
32
50
  Ok(())