kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,270 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Algorithms
    # N-gram based suggestion algorithm.
    #
    # Ported from Spylls (Python) ngram_suggest.py
    #
    # This is the core Hunspell suggestion algorithm that uses n-gram
    # similarity to rank and filter spelling corrections.
    #
    # The algorithm works in three scoring stages followed by a filter:
    # 1. root_score: Quick n-gram score + left common substring
    # 2. rough_affix_score: Affixed form n-gram score
    # 3. precise_affix_score: Full scoring with LCS, bigrams, etc.
    # 4. filter_guesses: Buckets the scored guesses and yields the keepers
    module NgramSuggest
      # Maximum number of root words to consider in first pass
      MAX_ROOTS = 100

      # Maximum number of suggestions to generate
      MAX_GUESSES = 200

      class << self
        # Main entry point for n-gram based suggestions.
        #
        # Dictionary entries may be Hashes with a :stem key (and optional
        # :flags), plain Strings, or objects responding to #stem. Entries
        # without a usable stem are skipped.
        #
        # @param misspelling [String] The misspelled word
        # @param dictionary_words [Enumerable<Hash, String>] Dictionary entries
        # @param prefixes [Hash] Prefix flags to prefix objects mapping
        # @param suffixes [Hash] Suffix flags to suffix objects mapping
        # @param known [Set<String>] Already suggested words (to avoid duplicates)
        # @param maxdiff [Integer] MAXDIFF value from aff file (0-10);
        #   a negative value selects the neutral factor 1.0
        # @param onlymaxdiff [Boolean] ONLYMAXDIFF flag
        # @param has_phonetic [Boolean] Whether PHONE table exists in aff file
        # @yield [String] Each suggestion, best first
        #
        # This is a simplified version that works with basic dictionary structures.
        # Full implementation would need affix flag parsing and Word model objects.
        def suggest(misspelling,
                    dictionary_words:,
                    prefixes: {},
                    suffixes: {},
                    known: Set.new,
                    maxdiff: 2,
                    onlymaxdiff: true,
                    has_phonetic: false,
                    &block)
          # Stage 1: Find best root candidates by n-gram score
          root_scores = []

          dictionary_words.each do |word_entry|
            stem = stem_of(word_entry)
            next if stem.nil?

            # Skip words with length difference > 4
            next if (stem.length - misspelling.length).abs > 4

            score = root_score(misspelling, stem)
            next unless score.positive?

            root_scores << [score, stem, word_entry]

            # Keep the buffer bounded; trimming only when it has doubled
            # amortizes the selection cost over many insertions.
            root_scores = best_roots(root_scores) if root_scores.size >= MAX_ROOTS * 2
          end

          # Stage 2: Generate affixed forms and score them
          threshold = detect_threshold(misspelling)
          guess_scores = []

          best_roots(root_scores).each do |_score, _stem, root_entry|
            forms_for(root_entry, prefixes, suffixes, similar_to: misspelling).each do |form|
              candidate = form.to_s
              score = rough_affix_score(misspelling, candidate.downcase)
              guess_scores << [score, candidate] if score > threshold
            end
          end

          # Limit to MAX_GUESSES, best score first
          guesses = guess_scores.max(MAX_GUESSES)

          # Stage 3: Calculate precise scores
          fact = maxdiff >= 0 ? (10.0 - maxdiff) / 5.0 : 1.0

          ranked = guesses.map do |score, candidate|
            precise = precise_affix_score(misspelling, candidate.downcase, fact,
                                          base: score, has_phonetic: has_phonetic)
            [precise, candidate]
          end.sort.reverse

          # Stage 4: Filter and yield suggestions
          filter_guesses(ranked, known: known, onlymaxdiff: onlymaxdiff, &block)
        end

        # Stage 1 scoring: 3-gram score + left common substring.
        #
        # @param word1 [String] Misspelled word
        # @param word2 [String] Possible suggestion
        # @return [Float] Root score
        def root_score(word1, word2)
          # Use lowercase for comparison as per Hunspell
          candidate = word2.downcase

          StringMetrics.ngram(3, word1, candidate, longer_worse: true) +
            StringMetrics.leftcommonsubstring(word1, candidate).to_f
        end

        # Stage 2 scoring: N-gram score with n=len(word1) + left common substring.
        #
        # @param word1 [String] Misspelled word
        # @param word2 [String] Possible suggestion
        # @return [Float] Rough affix score
        def rough_affix_score(word1, word2)
          # Use lowercase for comparison as per Hunspell
          candidate = word2.downcase

          StringMetrics.ngram(word1.length, word1, candidate, any_mismatch: true) +
            StringMetrics.leftcommonsubstring(word1, candidate).to_f
        end

        # Stage 3 scoring: Full precise scoring.
        #
        # Returns one of three "score groups":
        # - > 1000: Very good (same word, different casing)
        # - < -100: Questionable (too different)
        # - -100 to 1000: Normal suggestion
        #
        # @param word1 [String] Misspelled word
        # @param word2 [String] Possible suggestion
        # @param diff_factor [Float] Factor based on MAXDIFF (0-2)
        # @param base [Float] Base score from stage 2
        # @param has_phonetic [Boolean] Whether PHONE table exists
        # @return [Numeric] Precise affix score
        def precise_affix_score(word1, word2, diff_factor, base:, has_phonetic: false)
          # Use lowercase for LCS to catch case-only differences
          word1_lower = word1.downcase
          word2_lower = word2.downcase

          lcs = StringMetrics.lcslen(word1_lower, word2_lower)

          # Same characters with different casing -- "very good" suggestion
          return base + 2000 if word1.length == word2.length && word1.length == lcs

          # Score is: 2 * LCS - length difference
          result = 2 * lcs - (word1.length - word2.length).abs

          # Add common start substring length
          result += StringMetrics.leftcommonsubstring(word1_lower, word2_lower)

          # Add 1 if any characters match at same positions
          result += 1 if StringMetrics.commoncharacters(word1_lower, word2_lower).positive?

          # Add regular 4-gram score
          result += StringMetrics.ngram(4, word1_lower, word2_lower, any_mismatch: true)

          # Add weighted bigrams (both directions)
          bigrams =
            StringMetrics.ngram(2, word1_lower, word2_lower, any_mismatch: true, weighted: true) +
            StringMetrics.ngram(2, word2_lower, word1_lower, any_mismatch: true, weighted: true)
          result += bigrams

          # Apply "questionable" threshold based on diff_factor and has_phonetic
          questionable_limit =
            if has_phonetic
              word2.length * diff_factor
            else
              (word1.length + word2.length) * diff_factor
            end

          # Pushing the score below -100 moves it into the "questionable" group
          result -= 1000 if bigrams < questionable_limit

          result
        end

        # Calculate minimum threshold for passable suggestions.
        #
        # Mangles the word in 3 different ways (replacing each 4th char with '*',
        # starting at offsets 1, 2 and 3) and scores them to generate a minimum
        # acceptable score.
        #
        # @param word [String] The misspelled word
        # @return [Float] Minimum threshold score
        def detect_threshold(word)
          total = (1..3).sum do |start_pos|
            mangled = word.chars.each_with_index.map do |char, pos|
              pos >= start_pos && ((pos - start_pos) % 4).zero? ? "*" : char
            end.join

            StringMetrics.ngram(word.length, word, mangled, any_mismatch: true)
          end

          # Take average of the three scores and subtract 1
          (total / 3.0) - 1
        end

        # Generate all possible affixed forms for a dictionary word.
        #
        # Simplified: only the base form (stem) is returned for now; the
        # affix tables and +similar_to+ are accepted but not used yet.
        # Full implementation would parse affix flags and apply them.
        #
        # @param word_entry [Hash, String] Dictionary word with stem and flags
        # @param all_prefixes [Hash] Available prefixes (currently unused)
        # @param all_suffixes [Hash] Available suffixes (currently unused)
        # @param similar_to [String] Original misspelling (currently unused)
        # @return [Array<String>] Generated forms; empty when the entry has no stem
        def forms_for(word_entry, all_prefixes, all_suffixes, similar_to:)
          stem = stem_of(word_entry)
          stem.nil? ? [] : [stem]
        end

        # Filter guesses by score into quality buckets.
        #
        # Score buckets:
        # - > 1000: Very good (same word, different casing)
        # - 1000 to -100: Normal suggestions
        # - < -100: Questionable (too different)
        #
        # Stops yielding when:
        # - A very good suggestion was found and then a normal one
        # - A questionable suggestion was found (only yields one)
        #
        # A guess is skipped (without stopping) when any word in +known+
        # occurs inside it as a substring.
        #
        # @param guesses [Array<Array>] Array of [score, value] pairs, best first
        # @param known [Set<String>] Already suggested words
        # @param onlymaxdiff [Boolean] Whether to exclude questionable
        # @yield [String] Each filtered suggestion
        def filter_guesses(guesses, known:, onlymaxdiff: true)
          cutoff_reached = false
          yielded = 0

          guesses.each do |score, value|
            # Stop if we saw very good and now have normal suggestions
            return if cutoff_reached && score <= 1000

            if score > 1000
              # Very good suggestion - from now on only accept other very good ones
              cutoff_reached = true
            elsif score < -100
              # Questionable suggestion:
              # stop if we already found good ones, or if we're excluding questionable
              return if yielded.positive? || onlymaxdiff

              cutoff_reached = true
            end

            # Skip guesses that contain an already suggested word
            next if known.any? { |known_word| value.include?(known_word) }

            yielded += 1
            yield value
          end
        end

        private

        # Extracts the stem from a dictionary entry of any supported shape.
        #
        # @param entry [Hash, String, #stem] Dictionary entry
        # @return [String, nil] The stem, or nil when a Hash has no :stem
        def stem_of(entry)
          case entry
          when String then entry
          when Hash then entry[:stem]
          else entry.respond_to?(:stem) ? entry.stem : entry.to_s
          end
        end

        # Selects the MAX_ROOTS best [score, stem, entry] triples, best first.
        #
        # Only score and stem take part in the comparison, so entries that
        # are not comparable with each other (e.g. Hashes) never get compared.
        #
        # @param scored_roots [Array<Array>] Candidate triples
        # @return [Array<Array>] At most MAX_ROOTS triples in descending order
        def best_roots(scored_roots)
          scored_roots.max_by(MAX_ROOTS) { |score, stem, _entry| [score, stem] }
        end
      end
    end
  end
end
@@ -0,0 +1,283 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Algorithms
    # Word permutation algorithms for generating spelling variations.
    #
    # Ported from Spylls (Python) permutations.py
    #
    # These functions generate various word edits that are used by the
    # suggestion system to find possible corrections for misspelled words.
    #
    # Method names match Hunspell's suggest.cxx to maintain compatibility.
    #
    # Every public generator yields its variants to the given block and
    # returns an Enumerator when called without one.
    module Permutations
      # Upper bound used by longswapchar and movechar for how far apart the
      # affected positions may be.
      MAX_CHAR_DISTANCE = 4

      module_function

      # Uses REP table (typical misspellings) to replace patterns in word.
      #
      # If the pattern's replacement contains "_", it means replacing to " "
      # and yielding two different hypotheses:
      # 1. It was one (dictionary) word "foo bar" (checked as such)
      # 2. It was words ["foo", "bar"] (checked separately)
      #
      # Every occurrence of a pattern is replaced individually (one
      # replacement per suggestion).
      #
      # @param word [String] The word to process
      # @param reptable [Array<Hash>] Array of replacement pattern hashes with :regexp and :replacement
      # @yield [String, Array<String>] Each suggestion (string or array of words)
      # @return [Enumerator, nil] Enumerator when no block is given
      #
      # @example
      #   Kotoshu::Algorithms::Permutations.replchars("acces", [{regexp: /ac/, replacement: "ex"}]) do |sug|
      #     puts sug
      #   end
      def replchars(word, reptable)
        return enum_for(__method__, word, reptable) unless block_given?
        return if word.length < 2 || reptable.nil? || reptable.empty?

        str = word.to_s

        reptable.each do |pattern|
          replacement = pattern[:replacement].tr('_', ' ')
          pos = 0

          while (match_data = pattern[:regexp].match(str, pos))
            match_start = match_data.begin(0)
            match_end = match_data.end(0)

            suggestion = str[0...match_start] + replacement + str[match_end..]

            yield suggestion
            yield suggestion.split(' ', 2) if suggestion.include?(' ')

            # Move past this match to find the next occurrence. A zero-width
            # match consumes nothing, so step one character forward to
            # guarantee progress (otherwise the loop would never end).
            pos = match_end > match_start ? match_end : match_end + 1
            break if pos >= str.length
          end
        end
      end

      # Uses MAP table (sets of potentially similar chars) and tries to replace them recursively.
      #
      # Example: Assuming MAP has entry "aáã", and we have misspelling "anarchia":
      # mapchars will produce: "ánarchia", "ánárchia", "ánárchiá", etc.
      #
      # @param word [String] The word to process
      # @param maptable [Array<Set<String>>] Array of character sets for mapping
      # @yield [String] Each variant with mapped characters
      # @return [Enumerator, nil] Enumerator when no block is given
      #
      # @example
      #   Kotoshu::Algorithms::Permutations.mapchars("anarchia", [Set.new(['a', 'á', 'ã'])]) do |variant|
      #     puts variant
      #   end
      def mapchars(word, maptable, &block)
        return enum_for(__method__, word, maptable) unless block
        return if word.length < 2 || maptable.nil? || maptable.empty?

        mapchars_internal(word, 0, maptable, &block)
      end

      # Produces permutations with adjacent chars swapped.
      #
      # For short (4 or 5 letters) words also produces double swaps: ahev -> have
      #
      # @param word [String] The word to process
      # @yield [String] Each swap variant
      # @return [Enumerator, nil] Enumerator when no block is given
      def swapchar(word)
        return enum_for(__method__, word) unless block_given?
        return if word.length < 2

        chars = word.chars
        (0...(chars.length - 1)).each do |i|
          swapped = chars.dup
          swapped[i], swapped[i + 1] = swapped[i + 1], swapped[i]
          yield swapped.join
        end

        # Try double swaps for short words
        # ahev -> have, owudl -> would
        case word.length
        when 4
          yield word[1] + word[0] + word[-1] + word[-2]
        when 5
          yield word[1] + word[0] + word[2] + word[-1] + word[-2]
          yield word[0] + word[2] + word[1] + word[-1] + word[-2]
        end
      end

      # Produces permutations with non-adjacent chars swapped
      # (positions 2 up to MAX_CHAR_DISTANCE - 1 apart).
      #
      # @param word [String] The word to process
      # @yield [String] Each long swap variant
      # @return [Enumerator, nil] Enumerator when no block is given
      def longswapchar(word)
        return enum_for(__method__, word) unless block_given?

        chars = word.chars
        (0...(chars.length - 2)).each do |first|
          last_exclusive = [first + MAX_CHAR_DISTANCE, chars.length].min
          ((first + 2)...last_exclusive).each do |second|
            swapped = chars.dup
            swapped[first], swapped[second] = swapped[second], swapped[first]
            yield swapped.join
          end
        end
      end

      # Produces permutations with chars replaced by adjacent chars on keyboard layout
      # ("vat -> cat") or upcased (if the uppercase was accidentally missed).
      #
      # In the layout string "|" separates keyboard rows; a neighbour across
      # a "|" is not considered adjacent.
      #
      # @param word [String] The word to process
      # @param layout [String, nil] Keyboard layout string (KEY from aff file)
      # @yield [String] Each variant with replaced chars
      # @return [Enumerator, nil] Enumerator when no block is given
      def badcharkey(word, layout)
        return enum_for(__method__, word, layout) unless block_given?

        word.each_char.with_index do |c, i|
          before = word[0...i]
          after = word[(i + 1)..].to_s

          # Try uppercasing if not already uppercase
          upper = c.upcase
          yield before + upper + after unless c == upper

          next if layout.nil? || layout.empty?

          # Try adjacent keys on keyboard, for every occurrence of the char
          pos = layout.index(c)
          while pos
            yield before + layout[pos - 1] + after if pos.positive? && layout[pos - 1] != '|'
            yield before + layout[pos + 1] + after if pos + 1 < layout.length && layout[pos + 1] != '|'

            pos = layout.index(c, pos + 1)
          end
        end
      end

      # Produces permutations with one char removed in all possible positions.
      #
      # @param word [String] The word to process
      # @yield [String] Each variant with one char removed
      # @return [Enumerator, nil] Enumerator when no block is given
      def extrachar(word)
        return enum_for(__method__, word) unless block_given?
        return if word.length < 2

        word.length.times do |i|
          yield word[0...i] + word[(i + 1)..]
        end
      end

      # Produces permutations with one char inserted in all possible positions.
      #
      # List of chars is taken from TRY string -- if absent, tries nothing.
      # Chars are expected to be sorted in order of usage in language.
      #
      # @param word [String] The word to process
      # @param trystring [String] Characters to try inserting (from aff TRY directive)
      # @yield [String] Each variant with one char inserted
      # @return [Enumerator, nil] Enumerator when no block is given
      def forgotchar(word, trystring)
        return enum_for(__method__, word, trystring) unless block_given?
        return if trystring.nil? || trystring.empty?

        trystring.each_char do |c|
          (0..word.length).each do |i|
            yield word[0...i] + c + word[i..]
          end
        end
      end

      # Produces permutations with one character moved forward or backward by
      # at least 2 and at most MAX_CHAR_DISTANCE - 1 places
      # (not 1, because adjacent swaps are already handled by swapchar).
      #
      # @param word [String] The word to process
      # @yield [String] Each variant with moved character
      # @return [Enumerator, nil] Enumerator when no block is given
      def movechar(word)
        return enum_for(__method__, word) unless block_given?
        return if word.length < 2

        chars = word.chars

        # Move characters forward: the char is re-inserted just before +topos+
        chars.each_with_index do |char, frompos|
          last_exclusive = [chars.length, frompos + MAX_CHAR_DISTANCE + 1].min
          ((frompos + 3)...last_exclusive).each do |topos|
            moved = chars[0...frompos] + chars[(frompos + 1)...topos] + [char] + chars[topos..]
            yield moved.join
          end
        end

        # Move characters backward: the char is re-inserted at +topos+.
        # Starting at frompos - 2 skips the adjacent swap; the lower bound
        # mirrors the distance limit of the forward pass.
        (chars.length - 1).downto(0) do |frompos|
          lowest = [0, frompos - MAX_CHAR_DISTANCE + 1].max
          (frompos - 2).downto(lowest) do |topos|
            moved = chars[0...topos] + [chars[frompos]] + chars[topos...frompos] + chars[(frompos + 1)..]
            yield moved.join
          end
        end
      end

      # Produces permutations with chars replaced by chars in TRY set.
      #
      # Positions are visited from the end of the word to its start.
      #
      # @param word [String] The word to process
      # @param trystring [String] Characters to try replacing with (from aff TRY directive)
      # @yield [String] Each variant with replaced char
      # @return [Enumerator, nil] Enumerator when no block is given
      def badchar(word, trystring)
        return enum_for(__method__, word, trystring) unless block_given?
        return if trystring.nil? || trystring.empty?

        trystring.each_char do |c|
          (word.length - 1).downto(0) do |i|
            next if word[i] == c

            yield word[0...i] + c + word[(i + 1)..]
          end
        end
      end

      # Produces permutations with accidental two-letter-doubling fixed.
      # Example: "vacacation" -> "vacation"
      #
      # The same fix may be yielded more than once when the doubled pair
      # overlaps itself (e.g. "vacacation" yields "vacation" twice).
      #
      # @param word [String] The word to process
      # @yield [String] Each variant with fixed doubling
      # @return [Enumerator, nil] Enumerator when no block is given
      def doubletwochars(word)
        return enum_for(__method__, word) unless block_given?
        return if word.length < 5

        # Start at 3 so that word[i - 3] is a real position; starting earlier
        # would make the negative index wrap around to the end of the word.
        (3...word.length).each do |i|
          # word[i-3..i] has the shape "xyxy": drop the second "xy"
          next unless word[i - 2] == word[i] && word[i - 3] == word[i - 1]

          yield word[0...(i - 1)] + word[(i + 1)..]
        end
      end

      # Produces permutations of splitting word into two in all possible positions.
      #
      # @param word [String] The word to process
      # @yield [Array<String>] Each two-word split
      # @return [Enumerator, nil] Enumerator when no block is given
      def twowords(word)
        return enum_for(__method__, word) unless block_given?

        (1...word.length).each do |i|
          yield [word[0...i], word[i..]]
        end
      end

      # Internal recursive method for mapchars.
      #
      # For every option of every map entry, replaces its first occurrence at
      # or after +start+ with each sibling option, yields the result and then
      # recurses on the replaced word from the following position.
      #
      # @param word [String] Current word state
      # @param start [Integer] Starting position for search
      # @param maptable [Array<Set<String>>] Character mapping table
      # @yield [String] Each variant
      def mapchars_internal(word, start, maptable, &block)
        return if start >= word.length

        maptable.each do |options|
          options.each do |option|
            pos = word.index(option, start)
            next unless pos

            options.each do |other|
              next if other == option

              replaced = word[0...pos] + other + word[(pos + option.length)..]
              yield replaced

              # Recursively continue from this position
              mapchars_internal(replaced, pos + 1, maptable, &block)
            end
          end
        end
      end

      private_class_method :mapchars_internal
    end
  end
end