kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,671 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require_relative "../suggestion"
5
+ require_relative "../suggestion_set"
6
+ require_relative "base_strategy"
7
+ require_relative "../../data/common_words_loader"
8
+
9
+ module Kotoshu
10
+ module Suggestions
11
+ module Strategies
12
+ # Edit distance suggestion strategy with enhanced ranking.
13
+ # Generates suggestions by finding words with small edit distance,
14
+ # ranked by word frequency, keyboard proximity, and common typo patterns.
15
+ #
16
+ # Multi-language support:
17
+ # - Automatically selects keyboard layout based on language_code
18
+ # - Loads frequency data from YAML files (Phase 1) or GitHub (Phase 2)
19
+ # - Supports language-specific typo patterns
20
+ #
21
+ # This is MORE OOP than Spylls which uses standalone functions
22
+ # for edit distance operations.
23
+ #
24
+ # Follows Open-Closed Principle: Extend by adding YAML files,
25
+ # NOT by modifying this class.
26
+ class EditDistanceStrategy < BaseStrategy
27
+ attr_reader :language_code, :keyboard_layout
28
+
29
+ # @param name [String, Symbol] Name of the strategy
30
+ # @param config [Hash] Configuration options
31
+ # @option config [String] :language_code Language code for keyboard layout (default: 'en')
32
+ # @option config [Keyboard::Layout] :keyboard_layout Custom keyboard layout (optional)
33
+ # @option config [Hash] :frequency_tiers Custom frequency tiers (optional)
34
+ # @option config [Integer] :max_distance Maximum edit distance (default: 2)
35
+ # @option config [Integer] :max_results Maximum results to return (default: 10)
36
def initialize(name: :edit_distance, language_code: 'en', keyboard_layout: nil,
               frequency_tiers: nil, **config)
  super(name: name, **config)

  # The language code must be stored first: resolve_keyboard_layout reads
  # @language_code when no explicit layout is supplied.
  @language_code = language_code
  @keyboard_layout = resolve_keyboard_layout(keyboard_layout)

  # Without caller-supplied tiers, load_frequency_data assigns @frequency_tiers.
  unless frequency_tiers
    load_frequency_data(language_code)
    return
  end

  # Caller-supplied tiers take precedence over any loaded frequency data.
  @frequency_tiers = frequency_tiers
  @common_words = Set.new
end
54
+
55
+ # Public method to get current keyboard being used
56
+ #
57
+ # @return [Keyboard::Layout] The keyboard layout instance
58
def keyboard
  # Returns the layout chosen in the constructor (same ivar as the
  # keyboard_layout reader).
  @keyboard_layout
end
61
+
62
+ # Public method to get keyboard name
63
+ #
64
+ # @return [String] Keyboard layout name
65
def keyboard_name
  # Delegates to the layout object's own #name.
  layout = @keyboard_layout
  layout.name
end
68
+
69
+ # Check if a substitution is a keyboard-adjacent typo
70
+ #
71
+ # @param char1 [String] First character
72
+ # @param char2 [String] Second character
73
+ # @return [Boolean] True if keys are adjacent
74
def adjacent_key_typo?(char1, char2)
  # char2 counts as an adjacent-key typo when the layout lists it among
  # the neighbours of char1.
  neighbours = @keyboard_layout.adjacent_keys(char1)
  neighbours.include?(char2)
end
77
+
78
+ # Get adjacent keys for a given key
79
+ #
80
+ # @param key [String] The key to find adjacent keys for
81
+ # @return [Array<String>] List of adjacent key characters
82
def adjacent_keys(key)
  # Straight delegation to the active keyboard layout.
  layout = @keyboard_layout
  layout.adjacent_keys(key)
end
85
+
86
+ # Get frequency bonus for a word
87
+ #
88
+ # @param word [String] The word to check
89
+ # @return [Integer] Frequency bonus (0-200)
90
def frequency_bonus(word)
  return 0 unless @frequency_tiers

  lowered = word.downcase

  # Tiers are probed from most to least frequent; the first tier that
  # contains the word decides the bonus. A missing tier is skipped.
  tier_bonuses = { top_50: 200, top_200: 100, top_1000: 50 }
  tier_bonuses.each do |tier, bonus|
    return bonus if @frequency_tiers[tier]&.include?(lowered)
  end

  # Word is not in any tier: no bonus.
  0
end
107
+
108
+ # Generate suggestions based on enhanced edit distance scoring.
109
+ #
110
+ # Scoring factors:
111
+ # - Edit distance (primary factor)
112
+ # - Word frequency (common words rank higher)
113
+ # - Keyboard proximity (adjacent key typos rank higher)
114
+ # - Common typo patterns (missing double letters, etc.)
115
+ #
116
+ # @param context [Context] The suggestion context
117
+ # @return [SuggestionSet] Suggestions within max_distance
118
def generate(context)
  word = context.word
  max_dist = get_config(:max_distance, 2)
  min_confidence = get_config(:min_confidence, 0.75)
  min_similarity = get_config(:min_jaro_similarity, 0.70)
  min_results = get_config(:min_results, 3)

  # Collect [word, distance, enhanced score] for every dictionary word that
  # differs from the input and lies within the allowed edit distance.
  scored = dictionary_words(context).each_with_object([]) do |candidate, acc|
    next if candidate == word

    dist = edit_distance(word, candidate)
    next unless dist <= max_dist && dist > 0

    acc << [candidate, dist, calculate_enhanced_score(word, candidate, dist)]
  end

  # Lower enhanced score means a better candidate.
  ranked = scored.sort_by { |_, _, score| score }
  return SuggestionSet.empty if ranked.empty?

  float_scores = ranked.map { |_, _, score| score.to_f }
  max_score = float_scores.max
  min_score = float_scores.min
  score_range = (max_score - min_score).abs

  suggestions = []
  ranked.each do |candidate, dist, score|
    # Map the score onto 0.0..1.0 so the best score becomes confidence 1.0.
    # When every candidate has the same score, all get full confidence.
    confidence =
      if score_range > 0
        1.0 - ((score.to_f - min_score) / score_range)
      else
        1.0
      end

    similarity = calculate_ngram_similarity(word, candidate)

    # Weak candidates are dropped only once min_results suggestions have
    # already been collected; before that they are kept as filler.
    weak = confidence < min_confidence || similarity < min_similarity
    next if weak && suggestions.size >= min_results

    suggestions << Suggestion.new(
      word: candidate,
      distance: dist,
      confidence: confidence,
      source: @name,
      original_length: word.length,
      ngram_score: similarity,
      enhanced_score: score
    )

    break if suggestions.size >= max_results
  end

  SuggestionSet.new(suggestions, max_size: max_results)
end
189
+
190
+ # Check if this strategy should handle the context.
191
+ #
192
+ # @param context [Context] The suggestion context
193
+ # @return [Boolean] True if the word needs correction
194
def handles?(context)
  if enabled?
    # Only words the dictionary does not know need suggestions.
    known = dictionary_lookup(context, context.word)
    !known
  else
    false
  end
end
200
+
201
+ private
202
+
203
+ # Get all words from the dictionary.
204
+ #
205
+ # @param context [Context] The suggestion context
206
+ # @return [Array<String>] All dictionary words
207
def dictionary_words(context)
  dictionary = context.dictionary

  # IndexedDictionary is only checked when the Core module has been loaded.
  if defined?(::Kotoshu::Core::IndexedDictionary) && dictionary.is_a?(::Kotoshu::Core::IndexedDictionary)
    dictionary.all_words
  elsif dictionary.respond_to?(:words)
    dictionary.words
  elsif dictionary.is_a?(Hash)
    dictionary.keys
  elsif dictionary.is_a?(Set)
    dictionary.to_a
  elsif dictionary.is_a?(Array)
    dictionary
  else
    # Fallback for anything else that can be coerced to an array (nil, a
    # single word, a Range, ...). Entries that are themselves collections
    # are flattened one level; scalar entries such as Strings are kept
    # as-is. Calling #to_a on every entry unconditionally would raise
    # NoMethodError for Strings.
    Array(dictionary).flat_map do |entry|
      entry.respond_to?(:to_a) ? entry.to_a : [entry]
    end
  end
end
226
+
227
+ # Check if a word exists in the dictionary.
228
+ #
229
+ # @param context [Context] The suggestion context
230
+ # @param word [String] The word to check
231
+ # @return [Boolean] True if word exists
232
def dictionary_lookup(context, word)
  dict = context.dictionary

  # Dictionary backends exposing #lookup win over every other check.
  return dict.lookup(word) if dict.respond_to?(:lookup)

  indexed = defined?(::Kotoshu::Core::IndexedDictionary) &&
            dict.is_a?(::Kotoshu::Core::IndexedDictionary)
  return dict.has_word?(word) if indexed

  # Sets and any other container answering #include? (Array, Hash, ...).
  return dict.include?(word) if dict.is_a?(Set) || dict.respond_to?(:include?)
  return dict.key?(word) if dict.is_a?(Hash)

  false
end
250
+
251
+ # Calculate Damerau-Levenshtein edit distance between two strings.
252
+ # This extends Levenshtein by treating transposition of adjacent characters as 1 operation.
253
+ #
254
+ # Examples:
255
+ # "wrold" → "world" = 1 (transposition of 'r' and 'o')
256
+ # "hello" → "hell" = 1 (deletion)
257
+ # "cat" → "cut" = 1 (substitution)
258
+ #
259
+ # @param str1 [String] First string
260
+ # @param str2 [String] Second string
261
+ # @return [Integer] Edit distance
262
def edit_distance(str1, str2)
  # With one side empty the distance is the other side's length.
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  rows = str1.length
  cols = str2.length

  # table[r][c] holds the distance between the first r characters of str1
  # and the first c characters of str2. Row 0 and column 0 are the costs
  # of building a prefix from nothing.
  table = Array.new(rows + 1) { |r| [r] + Array.new(cols, 0) }
  table[0] = (0..cols).to_a

  1.upto(rows) do |r|
    1.upto(cols) do |c|
      substitution = table[r - 1][c - 1] + (str1[r - 1] == str2[c - 1] ? 0 : 1)
      best = [
        table[r - 1][c] + 1, # deletion
        table[r][c - 1] + 1, # insertion
        substitution
      ].min

      # Adjacent transposition: the last two characters of both prefixes
      # are each other's mirror image, so one swap covers both.
      swapped = r > 1 && c > 1 &&
                str1[r - 1] == str2[c - 2] &&
                str1[r - 2] == str2[c - 1]
      best = [best, table[r - 2][c - 2] + 1].min if swapped

      table[r][c] = best
    end
  end

  table[rows][cols]
end
299
+
300
+ # Optimized edit distance with early termination.
301
+ # Returns early if distance exceeds threshold.
302
+ #
303
+ # @param str1 [String] First string
304
+ # @param str2 [String] Second string
305
+ # @param threshold [Integer] Maximum distance to calculate
306
+ # @return [Integer, nil] Distance or nil if exceeds threshold
307
def edit_distance_with_threshold(str1, str2, threshold)
  # No early termination yet: the full distance is computed and then
  # compared against the threshold.
  distance = edit_distance(str1, str2)
  return nil if distance > threshold

  distance
end
313
+
314
+ # Calculate enhanced score combining multiple factors.
315
+ #
316
+ # Lower score = better suggestion
317
+ #
318
+ # @param original [String] The original misspelled word
319
+ # @param suggestion [String] The suggested word
320
+ # @param distance [Integer] Edit distance
321
+ # @return [Float] Enhanced score (lower is better)
322
def calculate_enhanced_score(original, suggestion, distance)
  # Edit distance dominates: each edit is worth 1000 points.
  base = distance * 1000.0

  # Signed adjustments applied on top of the base score, in order.
  # Bonuses are negative (they improve the score), penalties positive.
  adjustments = [
    -frequency_bonus(suggestion),               # common words rank higher
    keyboard_penalty(original, suggestion),     # far-apart keys cost more
    -transposition_bonus(original, suggestion), # swapped adjacent characters
    -typo_pattern_bonus(original, suggestion),  # double-letter / prefix patterns
    (original.length - suggestion.length).abs * 50 # length mismatch
  ]

  adjustments.reduce(base) { |score, delta| score + delta }
end
345
+
346
+ # Calculate bonus for transposition (swap adjacent characters).
347
+ # This is the MOST common typing error, so it gets the highest bonus.
348
+ #
349
+ # @param original [String] The original word
350
+ # @param suggestion [String] The suggested word
351
+ # @return [Integer] Transposition bonus: 200 when exactly one swap is detected, otherwise 100 per detected swap (0 when none)
352
def transposition_bonus(original, suggestion)
  # Swaps cannot change the length, so differing lengths earn nothing.
  return 0 unless original.length == suggestion.length

  typed = original.downcase
  target = suggestion.downcase

  # Count mismatching positions whose typed character reappears later in
  # the target, either directly at the next position or at a position
  # where the two strings hold each other's characters.
  swaps = (0...typed.length).count do |pos|
    next false if typed[pos] == target[pos]

    found_at = target.index(typed[pos], pos + 1)
    # found_at is never below pos + 1, so "not the next position" is
    # the same as "further right".
    found_at && (found_at == pos + 1 || target[pos] == typed[found_at])
  end

  swaps == 1 ? 200 : swaps * 100
end
374
+
375
+ # Calculate keyboard proximity penalty.
376
+ #
377
+ # Substitutions between adjacent keys get lower penalty.
378
+ # Uses OOP keyboard layout for language-aware distance calculations.
379
+ #
380
+ # @param original [String] The original word
381
+ # @param suggestion [String] The suggested word
382
+ # @return [Float] Keyboard penalty (0-200)
383
def keyboard_penalty(original, suggestion)
  typed = original.chars
  intended = suggestion.chars

  # Only equal-length pairs are compared position by position; insertions
  # and deletions receive no keyboard penalty.
  return 0 unless typed.length == intended.length

  typed.zip(intended).sum do |from, to|
    next 0 if from == to

    key_dist = @keyboard_layout.distance(from, to)

    if key_dist == Float::INFINITY
      50 # symbol or key unknown to the layout: medium penalty
    elsif key_dist == 1
      10 # adjacent keys: very likely a slip
    elsif key_dist == 2
      30 # two keys apart: somewhat likely
    else
      100 # far apart: unlikely to be a typo
    end
  end
end
414
+
415
+ # Calculate bonus for common typo patterns.
416
+ #
417
+ # @param original [String] The original word
418
+ # @param suggestion [String] The suggested word
419
+ # @return [Integer] Pattern bonus (0-330: the 30-point prefix bonus can stack on the 300-point missing-double-letter bonus)
420
def typo_pattern_bonus(original, suggestion)
  bonus = 0

  # Pattern 1: missing double letter (helo -> hello). The suggestion is one
  # character longer and collapsing one of its doubled letters yields the
  # original. Weighted above the transposition bonus.
  bonus += 300 if doubled_letter_collapses_to?(suggestion, original)

  # Pattern 2: extra repeated letter (helllo -> hello). The original is one
  # character longer and collapsing one of its doubled letters yields the
  # suggestion.
  bonus += 100 if doubled_letter_collapses_to?(original, suggestion)

  # Pattern 3: the suggestion is longer and the original starts with the
  # suggestion's first three characters.
  bonus += 30 if suggestion.length > original.length && original.start_with?(suggestion[0...3])

  bonus
end

# True when +longer+ is exactly one character longer than +shorter+ and
# removing one letter of some doubled pair in +longer+ produces +shorter+.
#
# @param longer [String] The string expected to contain the doubled letter
# @param shorter [String] The string expected after collapsing the pair
# @return [Boolean]
def doubled_letter_collapses_to?(longer, shorter)
  return false unless longer.length == shorter.length + 1

  longer.chars.each_cons(2).with_index.any? do |(first, second), idx|
    # Keep everything up to and including the first letter of the pair,
    # then skip the second letter (idx + 1) and append the remainder.
    first == second && (longer[0..idx] + longer[(idx + 2)..-1]) == shorter
  end
end
462
+
463
+ private
464
+
465
+ # Resolve keyboard layout using OOP registry pattern
466
+ #
467
+ # @param keyboard_layout [Keyboard::Layout, String, nil] Layout override
468
+ # @return [Keyboard::Layout] Resolved layout
469
def resolve_keyboard_layout(keyboard_layout)
  # Loaded lazily so the keyboard registry is only required when a
  # strategy instance is actually built.
  require_relative '../../../kotoshu/keyboard/registry'

  # An explicit layout object or layout name overrides the language default.
  return keyboard_layout if keyboard_layout.is_a?(Keyboard::Layout)
  return Keyboard::Registry.layout_by_name(keyboard_layout) if keyboard_layout.is_a?(String)

  if @language_code
    Keyboard::Registry.layout_for(@language_code)
  else
    Keyboard::Registry.layout_by_name('QWERTY')
  end
end
482
+
483
+ # Load frequency data for the language.
484
+ #
485
+ # Uses a tiered approach:
486
+ # 1. First tries to load from local Kelly JSON files (frequency-list-kelly/data/)
487
+ # 2. Then tries to load from GitHub frequency.json (Phase 2)
488
+ # 3. Falls back to local YAML files (Phase 1)
489
+ # 4. Falls back to empty set if no data available
490
+ #
491
+ # This follows the Open-Closed Principle: new languages are added
492
+ # by creating new JSON/YAML files, not by modifying this class.
493
+ #
494
+ # @param language_code [String] ISO 639-1 language code
495
+ # @return [Hash{Symbol => Set}] Hash with :tiers and :metadata
496
+ # Load frequency data for the language.
497
+ #
498
+ # Uses a tiered approach following OOP cache pattern:
499
+ # 1. First tries FrequencyCache (Kelly Project from GitHub with caching)
500
+ # 2. Falls back to local YAML files (legacy)
501
+ # 3. Falls back to empty set if no data available
502
+ #
503
+ # This follows the Open-Closed Principle: new languages are added
504
+ # by creating new JSON files, not by modifying this class.
505
+ #
506
+ # @param language_code [String] ISO 639-1 language code
507
+ # @return [Hash{Symbol => Set}] The selected frequency tiers (also stored in @frequency_tiers); the empty fallback has :top_50, :top_200 and :top_1000 keys
508
def load_frequency_data(language_code)
  # Source 1: FrequencyCache. Used only when it yields a non-empty
  # :top_1000 tier.
  cached = try_load_from_frequency_cache(language_code)
  cached_tiers = cached && cached[:tiers]
  return @frequency_tiers = cached_tiers if cached_tiers && cached_tiers[:top_1000].any?

  # Source 2: bundled YAML common-word lists.
  yaml_tiers = Data::CommonWordsLoader.load(language_code)[:tiers]
  return @frequency_tiers = yaml_tiers if yaml_tiers[:top_1000].any?

  # Nothing available for this language: every tier is empty, so
  # frequency_bonus always yields 0.
  @frequency_tiers = {
    top_50: Set.new,
    top_200: Set.new,
    top_1000: Set.new
  }
end
532
+
533
+ private
534
+
535
+ # Try to load frequency data from FrequencyCache (OOP cache pattern).
536
+ #
537
+ # Uses FrequencyCache to download Kelly frequency lists from GitHub
538
+ # with automatic caching in $XDG_CACHE_HOME/kotoshu/frequency-lists/
539
+ #
540
+ # @param language_code [String] ISO 639-1 language code
541
+ # @return [Hash, nil] Frequency data or nil if not available
542
def try_load_from_frequency_cache(language_code)
  require_relative '../../../kotoshu/cache/frequency_cache'

  frequency_cache = Cache::FrequencyCache.new

  # Languages the cache does not list are skipped without a fetch attempt.
  return nil unless frequency_cache.available_languages.include?(language_code)

  begin
    # A falsy result is normalised to nil.
    frequency_cache.get(language_code) || nil
  rescue StandardError => e
    # Best effort: a failed cache read must not break suggestion generation.
    # The failure is reported only in verbose mode.
    warn "Warning: Failed to load frequency cache for #{language_code}: #{e.message}" if $VERBOSE
    nil
  end
end
560
+
561
+ # Deprecated: Use FrequencyCache instead.
562
+ # Kept for backwards compatibility during migration.
563
# No-op retained so existing callers of the old GitHub loader keep working.
# The former empty stubs for try_load_from_kelly, try_load_kelly_local and
# try_load_kelly_from_github were removed: those methods are defined in full
# later in this class, so the stubs were dead code that only produced
# "method redefined" warnings.
def try_load_from_github(language_code); end

567
+ # Kelly Project frequency lists are stored in:
568
+ # frequency-list-kelly/data/{language_code}.json
569
+ #
570
+ # @param language_code [String] ISO 639-1 language code
571
+ # @return [Hash, nil] Frequency data or nil if not available
572
def try_load_from_kelly(language_code)
  # Prefer data already on disk; the GitHub download is attempted only when
  # the local lookup comes back empty.
  try_load_kelly_local(language_code) || try_load_kelly_from_github(language_code)
end
580
+
581
+ # Try to load Kelly data from local file paths.
582
+ #
583
+ # @param language_code [String] ISO 639-1 language code
584
+ # @return [Hash, nil] Frequency data or nil if not available
585
def try_load_kelly_local(language_code)
  # KELLY_DATA_PATH is an explicit override, so it is searched first;
  # otherwise any guessed location that happens to exist would shadow it.
  # A blank value is treated as unset rather than as the filesystem root.
  override_dir = ENV['KELLY_DATA_PATH']
  override_dir = nil if override_dir&.strip&.empty?

  search_dirs = [
    override_dir,
    # Check if we're in the kotoshu/kotoshu subdirectory
    File.expand_path('../../../../frequency-list-kelly/data', __dir__),
    # Check if we're in the kotoshu repo with frequency-list-kelly sibling
    File.expand_path('../../frequency-list-kelly/data', __dir__),
    # Check if we're in the kotoshu/lib subdirectory
    File.expand_path('../../../frequency-list-kelly/data', __dir__),
    # User's local kotoshu clone
    File.expand_path('~/src/kotoshu/frequency-list-kelly/data')
  ].compact.uniq

  search_dirs.each do |dir|
    candidate = File.join(dir, "#{language_code}.json")
    next unless File.file?(candidate)

    begin
      return Data::CommonWordsLoader.load_from_frequency_file(candidate)
    rescue StandardError => e
      # An unreadable file is reported (verbose only) and the search
      # continues with the next directory.
      warn "Warning: Failed to load local Kelly data for #{language_code}: #{e.message}" if $VERBOSE
    end
  end

  nil
end
612
+
613
+ # Try to download Kelly data from GitHub.
614
+ #
615
+ # Kelly data is cached in $XDG_CACHE_HOME/kotoshu/frequency-lists/
616
+ #
617
+ # @param language_code [String] ISO 639-1 language code
618
+ # @return [Hash, nil] Frequency data or nil if not available
619
def try_load_kelly_from_github(language_code)
  require 'net/http'
  require 'fileutils'
  require 'json' # JSON.parse is used below to validate the download

  # The allow-list also keeps arbitrary input out of the file path and URL.
  kelly_languages = %w[ar zh en el it no ru sv]
  return nil unless kelly_languages.include?(language_code)

  # Cache in $XDG_CACHE_HOME/kotoshu/frequency-lists/ (same pattern as dictionaries)
  cache_dir = File.join(Kotoshu::Paths.cache_path, 'frequency-lists')
  cached_file = File.join(cache_dir, "#{language_code}.json")
  cache_ttl = 604_800 # 7 days

  # Use the cached file if it exists and is recent.
  if File.exist?(cached_file) && (Time.now - File.mtime(cached_file)) < cache_ttl
    begin
      # NOTE(review): only the :tiers part is returned here, whereas
      # try_load_kelly_local returns the loader's full result. Kept as-is
      # for backward compatibility; confirm which shape callers expect.
      return Data::CommonWordsLoader.load_from_frequency_file(cached_file)[:tiers]
    rescue StandardError => e
      # An unreadable cache entry falls through to a fresh download.
      warn "Warning: Failed to load cached Kelly data for #{language_code}: #{e.message}" if $VERBOSE
    end
  end

  # Download from GitHub (kotoshu/frequency-list-kelly repository)
  url = "https://raw.githubusercontent.com/kotoshu/frequency-list-kelly/main/data/#{language_code}.json"

  begin
    warn "Downloading Kelly frequency data for #{language_code} from GitHub..." if $VERBOSE

    uri = URI(url)
    # Explicit timeouts so a stalled connection cannot block for long.
    response = Net::HTTP.start(uri.host, uri.port,
                               use_ssl: uri.scheme == 'https',
                               open_timeout: 5, read_timeout: 10) do |http|
      http.get(uri.request_uri)
    end

    # Reject error pages explicitly instead of relying on JSON parsing to fail.
    raise "HTTP #{response.code} #{response.message}" unless response.is_a?(Net::HTTPSuccess)

    body = response.body
    JSON.parse(body) # Validate it's valid JSON before caching it

    # Directory creation happens inside the rescue so an unwritable cache
    # location yields nil, like every other failure of this method.
    FileUtils.mkdir_p(cache_dir)
    File.write(cached_file, body)

    Data::CommonWordsLoader.load_from_frequency_file(cached_file)[:tiers]
  rescue StandardError => e
    warn "Warning: Failed to download Kelly data for #{language_code}: #{e.message}" if $VERBOSE
    nil
  end
end
668
+ end
669
+ end
670
+ end
671
+ end