kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,575 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Algorithms
5
+ # Main suggestion orchestration for spell checking.
6
+ #
7
+ # Ported from Spylls (Python) suggest.py
8
+ #
9
+ # On a bird's-eye view level, suggest does:
10
+ # 1. Tries small word "edits" (remove letters, insert letters, swap letters)
11
+ # and checks (with the help of Lookup) if there are any valid ones
12
+ # 2. If no good suggestions found, tries "ngram-based" suggestions
13
+ # (calculating ngram-based distance to all dictionary words)
14
+ # 3. If possible, tries metaphone-based suggestions (phonetic)
15
+ #
16
+ # Note: Spylls's implementation takes one liberty vs Hunspell:
17
+ # In Hunspell, ngram suggestions and phonetic suggestions are done in the
18
+ # same cycle. Spylls does them in two separate cycles for clarity.
19
+ #
20
+ # To follow algorithm details, see Suggest.suggestions method.
21
+ module Suggest
22
+ MAXPHONSUGS = 2
23
+ MAXSUGGESTIONS = 15
24
+ GOOD_EDITS = %w[spaceword uppercase replchars].freeze
25
+
26
+ # Represents a single word suggestion.
27
+ #
28
+ # Suggestions are produced internally to store enough information to
29
+ # make sure it is a good one.
30
class Suggestion
  # @!attribute [r] text
  #   @return [String] the suggested replacement text
  # @!attribute [r] kind
  #   @return [String] name of the strategy that produced the suggestion
  attr_reader :text, :kind

  # @param text [String] the suggested replacement text
  # @param kind [String] name of the strategy that produced the suggestion
  def initialize(text, kind)
    @text = text
    @kind = kind
  end

  # Build a new suggestion of the same class with selected fields overridden.
  #
  # Only the +:text+ and +:kind+ keys are consulted; any other key is ignored.
  #
  # @param changes [Hash] fields to override
  # @return [Suggestion] a fresh instance; the receiver is left untouched
  def replace(**changes)
    merged = { text: @text, kind: @kind }.merge(changes)
    self.class.new(merged[:text], merged[:kind])
  end

  # @return [String] the suggestion text
  def to_s
    text
  end

  # @return [String] debugging representation including the kind
  def inspect
    format('Suggestion[%s](%s)', kind, text.inspect)
  end
end
67
+
68
+ # Represents suggestion to split words into several.
69
+ #
70
+ # Used when the algorithm suggests that a misspelled word should be
71
+ # split into multiple dictionary words.
72
class MultiWordSuggestion
  # @!attribute [r] words
  #   @return [Array<String>] the individual words of the suggestion
  # @!attribute [r] source
  #   @return [String] how the suggestion was produced (same role as Suggestion#kind)
  # @!attribute [r] allow_dash
  #   @return [Boolean] whether the words may also be joined with a dash
  attr_reader :words, :source, :allow_dash

  # @param words [Array<String>] the individual words
  # @param source [String] how the suggestion was produced
  # @param allow_dash [Boolean] whether a dash-joined variant is acceptable
  def initialize(words, source, allow_dash: true)
    @words = words
    @source = source
    @allow_dash = allow_dash
  end

  # Collapse the word list into a single-string Suggestion.
  #
  # @param separator [String] text placed between the words
  # @return [Suggestion] suggestion whose kind is this object's source
  def stringify(separator = ' ')
    joined = words.join(separator)
    Suggestion.new(joined, source)
  end

  # @return [String] debugging representation including the source
  def inspect
    format('Suggestion[%s](%s)', source, words.inspect)
  end
end
103
+
104
+ # Main suggestion class.
105
+ #
106
+ # Typically, you would not use this directly, but you might want to for
107
+ # experiments.
108
+ #
109
+ # Example:
110
+ # dictionary = Kotoshu::Dictionary.load('en_US')
111
+ # suggester = dictionary.suggester
112
+ #
113
+ # suggester.suggestions('spylls') do |suggestion|
114
+ # puts suggestion
115
+ # end
116
+ #
117
+ # # Output:
118
+ # # Suggestion[badchar](spell)
119
+ # # Suggestion[badchar](spill)
120
+ class Suggester
121
+ # @return [Object] Aff data structure (from aff file)
122
+ attr_reader :aff
123
+
124
+ # @return [Object] Dic data structure (from dic file)
125
+ attr_reader :dic
126
+
127
+ # @return [Object] Lookup object
128
+ attr_reader :lookup
129
+
130
def initialize(aff, dic, lookup)
  @aff = aff
  @dic = dic
  @lookup = lookup

  # Flags that disqualify a dictionary entry from ngram/phonetic matching.
  # Directives that are not set (nil) are dropped from the list.
  excluded = %i[FORBIDDENWORD NOSUGGEST ONLYINCOMPOUND].map { |directive| @aff[directive] }.compact

  # Keep only entries that carry none of the excluded flags; an entry
  # without flags is always kept.
  @words_for_ngram = @dic[:words].reject do |entry|
    overlap = (entry[:flags] || []) & excluded
    !overlap.empty?
  end
end
147
+
148
+ # Outer "public" interface: returns all valid suggestions as strings.
149
+ #
150
+ # Returns an enumerator for lazy evaluation.
151
+ #
152
+ # @param word [String] Word to check
153
+ # @return [Enumerator<String>] Suggestions as strings
154
def call(word)
  if block_given?
    # Unwrap each suggestion object down to its plain text.
    suggestions(word) { |found| yield found.text }
  else
    # Without a block, hand back a lazy enumerator over the same strings.
    enum_for(:call, word)
  end
end
161
+
162
+ # Main suggestion search loop.
163
+ #
164
+ # What it does, in general:
165
+ # 1. Generates possible misspelled word cases (capitalization variants)
166
+ # 2. Produces word edits with edits, checks them with Lookup
167
+ # 3. If needed, produces ngram-based suggestions
168
+ # 4. If needed, produces phonetically similar suggestions
169
+ #
170
+ # @param word [String] Word to check
171
+ # @yield [Suggestion, MultiWordSuggestion] Each suggestion object
172
def suggestions(word)
  return enum_for(:suggestions, word) unless block_given?

  # Final (post-processed) texts already emitted. Shared with handle_found,
  # which uses it for de-duplication.
  handled = Set.new

  # True when the exact form +w+ is accepted by Lookup. good_forms is
  # called directly to avoid ICONV and dash-breaking.
  is_good_suggestion = lambda do |w|
    @lookup.good_forms(w, capitalization: false, allow_nosuggest: false).any?
  end

  # True when the dictionary reports the FORBIDDENWORD flag for +w+.
  is_forbidden = lambda do |w|
    flag = @aff[:FORBIDDENWORD]
    flag ? (@dic[:has_flag]&.call(w, flag) || false) : false
  end

  # Capitalization type of the input plus the case variants to try.
  captype, variants = @aff[:casing].corrections(word)

  # Special case: FORCEUCASE with NO capitalization. The first good
  # capitalized form is the only suggestion produced.
  if @aff[:FORCEUCASE] && captype == Capitalization::Type::NO
    @aff[:casing].capitalize(word).each do |capitalized|
      next unless is_good_suggestion.call(capitalized)

      yield Suggestion.new(capitalized.capitalize, 'forceucase')
      return
    end
  end

  good_edits_found = false

  variants.each_with_index do |variant, idx|
    # A case variant other than the original that is itself a good word.
    if idx.positive? && is_good_suggestion.call(variant)
      handle_found(
        Suggestion.new(variant, 'case'),
        captype: captype,
        is_forbidden: is_forbidden,
        handled: handled
      ) do |suggestion|
        yield suggestion
      end
    end

    # Non-compound edits first; some edit kinds suppress the compound pass.
    nocompound = false

    edit_suggestions(variant, compounds: false, limit: MAXSUGGESTIONS) do |suggestion|
      handle_found(
        suggestion,
        captype: captype,
        is_forbidden: is_forbidden,
        handled: handled,
        check_inclusion: false
      ) do |handled_suggestion|
        yield handled_suggestion

        kind = handled_suggestion.kind
        good_edits_found = true if GOOD_EDITS.include?(kind)
        nocompound = true if %w[uppercase replchars mapchars].include?(kind)

        # A spaceword that is in the dictionary as a whole is the only
        # suggestion needed: stop the entire search.
        return if kind == 'spaceword'
      end
    end

    next if nocompound

    limit = @aff[:MAXCPDSUGS] || MAXSUGGESTIONS
    edit_suggestions(variant, compounds: true, limit: limit) do |suggestion|
      handle_found(
        suggestion,
        captype: captype,
        is_forbidden: is_forbidden,
        handled: handled,
        check_inclusion: false
      ) do |handled_suggestion|
        yield handled_suggestion
        good_edits_found = true if GOOD_EDITS.include?(handled_suggestion.kind)
      end
    end
  end

  # Skip dash/ngram/phonetic passes if we found good edits.
  return if good_edits_found

  # Try fixing words with dashes: repair the first misspelled chunk only.
  if word.include?('-') && handled.none? { |s| s.include?('-') }
    chunks = word.split('-')
    chunks.each_with_index do |chunk, idx|
      next if is_good_suggestion.call(chunk)

      call(chunk).each do |sug|
        candidate = chunks[0...idx] + [sug] + chunks[(idx + 1)..]
        candidate_str = candidate.join('-')

        # Only keep the replacement if the whole dashed word is accepted.
        if @lookup.call(candidate_str, capitalization: true, allow_nosuggest: true)
          yield Suggestion.new(candidate_str, 'dashes')
        end
      end

      # Only try one misspelled chunk
      break
    end
  end

  # Ngram-based suggestions, capped at MAXNGRAMSUGS emitted results.
  max_ngrams = @aff[:MAXNGRAMSUGS]
  if max_ngrams&.positive?
    ngrams_seen = 0
    ngram_suggestions(word, handled: handled) do |sug|
      handle_found(
        Suggestion.new(sug, 'ngram'),
        captype: captype,
        is_forbidden: is_forbidden,
        handled: handled,
        check_inclusion: true
      ) do |suggestion|
        yield suggestion
        ngrams_seen += 1
      end

      # The cap must be checked here, in the generator's block: a `break`
      # inside the handle_found block would only leave handle_found and the
      # generator would keep producing candidates.
      break if ngrams_seen >= max_ngrams
    end
  end

  # Phonetic suggestions, capped at MAXPHONSUGS emitted results.
  if @aff[:PHONE]
    phonet_seen = 0
    phonet_suggestions(word) do |sug|
      handle_found(
        Suggestion.new(sug, 'phonet'),
        captype: captype,
        is_forbidden: is_forbidden,
        handled: handled,
        check_inclusion: true
      ) do |suggestion|
        yield suggestion
        phonet_seen += 1
      end

      # Same reasoning as for the ngram cap above.
      break if phonet_seen >= MAXPHONSUGS
    end
  end
end
325
+
326
+ # Generate all possible word edits in order of priority.
327
+ #
328
+ # Order is important - it's the order user receives suggestions.
329
+ #
330
+ # @param word [String] Word to mutate
331
+ # @yield [Suggestion, MultiWordSuggestion] Each edit suggestion
332
def edits(word)
  # Like #call and #suggestions, return a lazy enumerator when no block is
  # given instead of failing on the first yield.
  return enum_for(:edits, word) unless block_given?

  # Loop-invariant: whether split words may also be joined with a dash.
  dash_ok = use_dash?

  # Uppercase suggestion (html -> HTML)
  yield Suggestion.new(@aff[:casing].upper(word), 'uppercase')

  # REP table replacements. An Array result is a multi-word replacement
  # (REP entry with underscore): emitted both as a space-joined string and
  # as a MultiWordSuggestion that must not be dash-joined.
  reptable = @aff[:REP] || []
  Permutations.replchars(word, reptable) do |suggestion|
    if suggestion.is_a?(Array)
      yield Suggestion.new(suggestion.join(' '), 'replchars')
      yield MultiWordSuggestion.new(suggestion, 'replchars', allow_dash: false)
    else
      yield Suggestion.new(suggestion, 'replchars')
    end
  end

  # Split into two words, offered as a single dictionary entry (spaceword)
  Permutations.twowords(word) do |words|
    yield Suggestion.new(words.join(' '), 'spaceword')
    yield Suggestion.new(words.join('-'), 'spaceword') if dash_ok
  end

  # MAP table (related character replacements)
  maptable = @aff[:MAP] || []
  Permutations.mapchars(word, maptable) do |suggestion|
    yield Suggestion.new(suggestion, 'mapchars')
  end

  # Swap adjacent characters
  Permutations.swapchar(word) do |suggestion|
    yield Suggestion.new(suggestion, 'swapchar')
  end

  # Long swaps (up to 4 chars distance)
  Permutations.longswapchar(word) do |suggestion|
    yield Suggestion.new(suggestion, 'longswapchar')
  end

  # Replace with keyboard-adjacent chars
  layout = @aff[:KEY] || ''
  Permutations.badcharkey(word, layout) do |suggestion|
    yield Suggestion.new(suggestion, 'badcharkey')
  end

  # Remove one character
  Permutations.extrachar(word) do |suggestion|
    yield Suggestion.new(suggestion, 'extrachar')
  end

  # Insert one character (from TRY string)
  trystring = @aff[:TRY] || ''
  Permutations.forgotchar(word, trystring) do |suggestion|
    yield Suggestion.new(suggestion, 'forgotchar')
  end

  # Move character forward/backward
  Permutations.movechar(word) do |suggestion|
    yield Suggestion.new(suggestion, 'movechar')
  end

  # Replace each character
  Permutations.badchar(word, trystring) do |suggestion|
    yield Suggestion.new(suggestion, 'badchar')
  end

  # Fix two-character doubling
  Permutations.doubletwochars(word) do |suggestion|
    yield Suggestion.new(suggestion, 'doubletwochars')
  end

  # Split by space in all positions, each half checked separately
  return if @aff[:NOSPLITSUGS]

  Permutations.twowords(word) do |words|
    yield MultiWordSuggestion.new(words, 'twowords', allow_dash: dash_ok)
  end
end
409
+
410
+ # Generate edit suggestions and filter for valid words.
411
+ #
412
+ # @param word [String] Word to generate edits for
413
+ # @param compounds [Boolean] Whether to check compound words
414
+ # @param limit [Integer] Maximum number of suggestions to yield
415
+ # @yield [Suggestion, MultiWordSuggestion] Each valid edit suggestion
416
def edit_suggestions(word, compounds:, limit:)
  # A non-positive limit means nothing may be yielded at all.
  return if limit <= 0

  emitted = 0

  edits(word) do |candidate|
    # filter_suggestion returns nil for candidates that are not valid words.
    valid = filter_suggestion(candidate, compounds)
    next unless valid

    yield valid
    emitted += 1

    # Stop as soon as the cap is reached so that at most +limit+ suggestions
    # are yielded and no further edits are generated needlessly.
    break if emitted >= limit
  end
end
430
+
431
+ # Generate ngram-based suggestions.
432
+ #
433
+ # @param word [String] Misspelled word
434
+ # @param handled [Set<String>] Already suggested words
435
+ # @yield [String] Each ngram suggestion
436
def ngram_suggestions(word, handled:)
  return unless @aff[:MAXNGRAMSUGS]&.positive?

  # Already-suggested texts, lowercased, passed to NgramSuggest as +known+.
  known_lower = handled.map(&:downcase).to_set

  # ONLYMAXDIFF defaults to true only when the directive is absent. An
  # explicit false must be honoured, which `value || true` would not do.
  only_max_diff = @aff[:ONLYMAXDIFF]
  only_max_diff = true if only_max_diff.nil?

  NgramSuggest.suggest(
    word.downcase,
    dictionary_words: @words_for_ngram,
    prefixes: @aff[:PFX] || {},
    suffixes: @aff[:SFX] || {},
    known: known_lower,
    maxdiff: @aff[:MAXDIFF] || 2,
    onlymaxdiff: only_max_diff,
    has_phonetic: !@aff[:PHONE].nil?
  ) do |suggestion|
    yield suggestion
  end
end
454
+
455
+ # Generate phonetic suggestions.
456
+ #
457
+ # @param word [String] Misspelled word
458
+ # @yield [String] Each phonetic suggestion
459
def phonet_suggestions(word)
  # No PHONE table configured: nothing to do.
  phone_table = @aff[:PHONE]
  return unless phone_table

  # Candidates come from the same filtered word list used for ngram matching.
  PhonetSuggest.suggest(word, dictionary_words: @words_for_ngram, table: phone_table) do |candidate|
    yield candidate
  end
end
470
+
471
+ # Check if dashes are allowed for joining words.
472
+ #
473
+ # Definition from Hunspell: Either dash is in TRY directive, or TRY
474
+ # indicates Latinic script (by having 'a').
475
+ #
476
+ # @return [Boolean] Whether dashes are allowed
477
def use_dash?
  # A missing TRY directive is treated as an empty character list.
  try_chars = @aff[:TRY] || ''

  # Either an explicit dash, or an 'a' in TRY, enables dash-joined words.
  ['-', 'a'].any? { |marker| try_chars.include?(marker) }
end
481
+
482
+ private
483
+
484
+ # Handle a found suggestion with proper capitalization and validation.
485
+ #
486
+ # @param suggestion [Suggestion, MultiWordSuggestion] Raw suggestion
487
+ # @param captype [Symbol] Original word's capitalization type
488
+ # @param is_forbidden [Proc] Function to check if word is forbidden
489
+ # @param handled [Set<String>] Already handled suggestions
490
+ # @param check_inclusion [Boolean] Whether to check for subsumption
491
+ # @yield [Suggestion] Processed suggestion if valid
492
def handle_found(suggestion, captype:, is_forbidden:, handled:, check_inclusion: false)
  return unless block_given?

  original = suggestion.text
  text = original

  # Coerce the suggestion to the misspelling's capitalization, unless the
  # suggestion is marked as KEEPCASE.
  unless @aff[:KEEPCASE] && suggestion_has_keepcase_flag?(suggestion)
    text = @aff[:casing].coerce(text, captype)

    # If the coerced form is forbidden, revert to the uncoerced text.
    text = original if text != original && is_forbidden.call(text)

    # Fix "aNew" -> "a New" case.
    # NOTE(review): text[pos] is always the space found by index(' '), so the
    # inner condition below cannot be true and this fix-up never fires. It
    # presumably needs the original misspelled word, which is not passed to
    # this method - confirm against the upstream Spylls implementation.
    if [Capitalization::Type::HUH, Capitalization::Type::HUHINIT].include?(captype) && text.include?(' ')
      pos = text.index(' ')
      if pos && text[pos + 1] != text[pos] && text[pos + 1]&.upcase == text[pos]
        text = text[0...pos + 1] + text[pos] + text[(pos + 2)..]
      end
    end
  end

  # Skip if forbidden
  return if is_forbidden.call(text)

  # Apply OCONV transformation if present
  text = @aff[:OCONV].call(text) if @aff[:OCONV]

  # Skip if already seen
  return if handled.include?(text)

  # Skip if an earlier suggestion is contained (case-insensitively) in this
  # one. Plain String#include? is used so no ActiveSupport extension
  # (Object#in?) is required.
  if check_inclusion
    lowered = text.downcase
    return if handled.any? { |prev| lowered.include?(prev.downcase) }
  end

  handled.add(text)
  yield suggestion.replace(text: text)
end
534
+
535
+ # Check if suggestion has KEEPCASE flag.
536
+ #
537
+ # @param suggestion [Suggestion, MultiWordSuggestion]
538
+ # @return [Boolean]
539
def suggestion_has_keepcase_flag?(suggestion)
  # Simplified check - a full implementation would consult the dictionary.
  # Here, when KEEPCASE is configured, only the presence of 'ß' in the text
  # is tested.
  if @aff[:KEEPCASE]
    suggestion.text.include?('ß')
  else
    false
  end
end
545
+
546
+ # Filter suggestion to only valid words.
547
+ #
548
+ # @param suggestion [Suggestion, MultiWordSuggestion]
549
+ # @param compounds [Boolean] Whether to check compound forms
550
+ # @return [Suggestion, nil] Filtered suggestion or nil if invalid
551
def filter_suggestion(suggestion, compounds)
  # The compound pass switches off affix forms; the non-compound pass
  # switches off compound forms. All other lookup options are shared.
  form_filter = compounds ? { affix_forms: false } : { compound_forms: false }

  valid = lambda do |candidate|
    @lookup.good_forms(candidate, capitalization: false, allow_nosuggest: false, **form_filter).any?
  end

  if suggestion.is_a?(MultiWordSuggestion)
    # Every word must be valid on its own; the result is the space-joined
    # string form.
    suggestion.stringify if suggestion.words.all? { |part| valid.call(part) }
  elsif valid.call(suggestion.text)
    suggestion
  end
end
572
+ end
573
+ end
574
+ end
575
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ # Algorithms namespace for spell checking algorithms.
5
+ #
6
+ # Contains the core algorithms ported from Spylls:
7
+ # - NgramSuggest: N-gram based suggestion algorithm
8
+ # - Lookup: Word correctness checking with affix support
9
+ # - Suggest: Main suggestion orchestration
10
+ #
11
+ # These are the core Hunspell algorithms that make spell checking work.
12
+ module Algorithms
13
+ end
14
+ end