kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,228 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Suggestions
5
+ module Strategies
6
+ # Keyboard proximity suggestion strategy.
7
+ #
8
+ # Generates suggestions by finding words that can be formed by
9
+ # substituting adjacent keys on a QWERTY keyboard.
10
+ #
11
+ # @example Creating a keyboard proximity strategy
12
+ # strategy = KeyboardProximityStrategy.new
13
+ # result = strategy.generate(context)
14
+ class KeyboardProximityStrategy < BaseStrategy
15
# QWERTY keyboard layout (US).
#
# Maps each printable key to the printable keys that sit next to it.
# Every neighbor listed here is spliced verbatim into candidate words, so
# only single-character keys may appear: non-printing keys (Tab, Caps Lock,
# Shift, Enter) are deliberately left out, otherwise variants containing
# the literal text "tab", "shift", ... would be generated.
#
# Both the hash and each neighbor list are frozen.
KEYBOARD_LAYOUT = {
  # Number row
  "`" => ["1"],
  "1" => ["`", "2", "q"],
  "2" => %w[1 3 w q],
  "3" => %w[2 4 e w],
  "4" => %w[3 5 r e],
  "5" => %w[4 6 t r],
  "6" => %w[5 7 y t],
  "7" => %w[6 8 u y],
  "8" => %w[7 9 i u],
  "9" => %w[8 0 o i],
  "0" => %w[9 p o],
  "-" => ["0", "="],
  "=" => ["-"],
  # Top letter row
  "q" => %w[w a 1],
  "w" => %w[q e a s 2],
  "e" => %w[w r s d 3],
  "r" => %w[e t d f 4],
  "t" => %w[r y f g 5],
  "y" => %w[t u g h 6],
  "u" => %w[y i h j 7],
  "i" => %w[u o j k 8],
  "o" => %w[i p k l 9],
  "p" => ["o", "l", ";", "0"],
  "[" => ["p", "'"],
  "]" => ["\\"],
  "\\" => [], # Only non-printing neighbors
  # Home row
  "a" => %w[s z q],
  "s" => %w[a d z x w],
  "d" => %w[s f x c e],
  "f" => %w[d g c v r],
  "g" => %w[f h v b t],
  "h" => %w[g j b n y],
  "j" => %w[h k n m u],
  "k" => ["j", "l", "m", ",", "i"],
  "l" => ["k", ";", ",", ".", "o"],
  ";" => ["l", "'", ".", "p"],
  "'" => [";"],
  # Bottom row
  "z" => %w[s x a],
  "x" => %w[z c s d],
  "c" => %w[x v d f],
  "v" => %w[c b f g],
  "b" => %w[v n g h],
  "n" => %w[b m h j],
  "m" => ["n", ",", "j", "k"],
  "," => ["m", ".", "k", "l"],
  "." => [",", "/", "l", ";"],
  "/" => ["."],
  " " => [] # Space has no neighbors
}.each_value(&:freeze).freeze
68
+
69
# Build a keyboard proximity strategy.
#
# The name and every option are handed to the base strategy unchanged.
#
# @param name [String, Symbol] Name of the strategy
# @param config [Hash] Configuration options
# @option config [Integer] max_distance Maximum keyboard distance (read by #generate)
# @option config [Float] min_similarity Minimum similarity a candidate must reach (read by #generate)
# @option config [Integer] max_results Maximum results to return (presumably applied by the base strategy)
def initialize(name: :keyboard_proximity, **config)
  # Bare `super` forwards the current `name:` and `**config` as-is.
  super
end
78
+
79
# Generate suggestions based on keyboard proximity.
#
# Expands the misspelled word into keyboard variants, keeps the variants
# that are dictionary words, and drops candidates whose edit distance from
# the original exceeds +max_distance+ (default 2) or whose similarity score
# is below +min_similarity+ (default 0.70).
#
# @param context [Context] The suggestion context
# @return [SuggestionSet] Suggestions within keyboard distance
def generate(context)
  word = context.word
  max_dist = get_config(:max_distance, 2)
  min_similarity = get_config(:min_similarity, 0.70)

  # Index the dictionary once by lowercase form so every variant is resolved
  # with a hash lookup instead of a scan over the whole word list. An entry
  # that is already lowercase wins; otherwise the first entry seen is kept.
  lookup = {}
  dictionary_words(context).each do |entry|
    key = entry.downcase
    lookup[key] = entry if entry == key || !lookup.key?(key)
  end

  results_with_distances = {}
  keyboard_variants(word, max_dist).each do |variant|
    next if variant.empty?

    dict_word = lookup[variant.downcase]
    next if dict_word.nil? || dict_word == word

    # Distance is measured against the original word, not the variant.
    dist = edit_distance(word, dict_word)
    next if dist > max_dist

    similarity = calculate_ngram_similarity(word, dict_word)
    next if similarity < min_similarity

    results_with_distances[dict_word] = dist
  end

  # Closest candidates first.
  sorted_words = results_with_distances.sort_by { |_, dist| dist }.map(&:first)
  create_suggestion_set(sorted_words, distances: results_with_distances, original_word: word)
end
116
+
117
# Decide whether this strategy applies to the given context.
#
# A disabled strategy never applies; an enabled one applies exactly when
# the dictionary lookup for the context's word comes back falsy.
#
# @param context [Context] The suggestion context
# @return [Boolean] True if the word needs correction
def handles?(context)
  enabled? ? !dictionary_lookup(context, context.word) : false
end
126
+
127
+ private
128
+
129
# Calculate edit distance between two strings.
# Uses Levenshtein distance (substitution, insertion, deletion), each edit
# costing 1. Comparison is case-sensitive.
#
# Only two rows of the dynamic-programming table are kept alive at a time,
# so a call allocates one row per source character rather than a full
# (len1 + 1) x (len2 + 1) matrix.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Integer] Edit distance
def edit_distance(str1, str2)
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  target = str2.chars
  # Row 0: turning the empty prefix of str1 into each prefix of str2.
  previous = (0..target.length).to_a

  str1.each_char.with_index(1) do |source_char, row|
    current = [row] # deleting the first `row` characters of str1

    target.each_with_index do |target_char, col|
      cost = source_char == target_char ? 0 : 1
      current << [
        previous[col + 1] + 1, # deletion
        current[col] + 1,      # insertion
        previous[col] + cost   # substitution
      ].min
    end

    previous = current
  end

  previous.last
end
164
+
165
# Look up the keys adjacent to a character.
#
# The lookup is case-insensitive; characters missing from the layout
# table yield an empty list.
#
# @param char [String] The character
# @return [Array<String>] Neighbor keys
def neighbors(char)
  KEYBOARD_LAYOUT.fetch(char.downcase) { [] }
end
172
+
173
# Generate keyboard variants of a word.
#
# Starting from the lowercased word, applies up to +max_distance+ rounds
# of single-key edits and returns every distinct string seen along the
# way, including the lowercased word itself. Each round only expands the
# strings discovered in the previous round, so variants from earlier
# rounds are kept rather than replaced.
#
# @param word [String] The word
# @param max_distance [Integer] Maximum number of single-key edits
# @return [Array<String>] Keyboard variants
def keyboard_variants(word, max_distance)
  return [] if word.nil? || word.empty?

  start = word.downcase
  seen = Set.new([start])
  frontier = [start]

  max_distance.times do
    next_frontier = []

    frontier.each do |variant|
      single_key_edits(variant).each do |candidate|
        # Set#add? returns nil when the candidate was already known.
        next_frontier << candidate if seen.add?(candidate)
      end
    end

    break if next_frontier.empty?

    frontier = next_frontier
  end

  seen.to_a
end

# List every string one keyboard edit away from +variant+.
#
# For each position: the character deleted (regardless of whether the key
# has neighbors), the character replaced by each neighboring key, and each
# neighboring key inserted in front of it. Neighbors of the final character
# are additionally appended after it.
#
# @param variant [String] The string to edit
# @return [Array<String>] Edited strings (may contain duplicates)
def single_key_edits(variant)
  edits = []
  last_index = variant.length - 1

  variant.each_char.with_index do |char, i|
    head = variant[0...i]
    tail = variant[(i + 1)..]

    edits << head + tail # Delete

    neighbors(char).each do |neighbor|
      edits << head + neighbor + tail        # Substitute
      edits << head + neighbor + char + tail # Insert before
      edits << variant + neighbor if i == last_index # Insert after the last key
    end
  end

  edits
end
206
+
207
# Find a word in the dictionary (case-insensitive).
#
# The lowercase form is tried first; when the dictionary holds it, that
# lowercase entry is returned (not the caller's casing, which may not be a
# dictionary entry at all). Otherwise the first entry whose lowercase form
# matches is returned as stored.
#
# @param all_words [Array<String>] All dictionary words
# @param word [String] The word to find
# @return [String, nil] The dictionary word or nil
def find_word(all_words, word)
  return nil if word.nil? || word.empty?

  needle = word.downcase
  return needle if all_words.include?(needle)

  all_words.find { |candidate| candidate.downcase == needle }
end
223
+ end
224
+ end
225
+ end
226
+ end
227
+
228
# Set is only autoloaded from Ruby 3.2 onwards, so it is required
# unconditionally; the call is a no-op when the library is already loaded.
require "set"
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Suggestions
5
+ module Strategies
6
+ # N-gram suggestion strategy.
7
+ #
8
+ # Generates suggestions by finding words with high n-gram similarity.
9
+ # N-grams are contiguous sequences of n characters.
10
+ #
11
+ # @example Creating an n-gram strategy
12
+ # strategy = NgramStrategy.new(n: 3)
13
+ # result = strategy.generate(context)
14
+ class NgramStrategy < BaseStrategy
15
# Build an n-gram strategy.
#
# The name and every option are handed to the base strategy unchanged.
#
# @param name [String, Symbol] Name of the strategy
# @param config [Hash] Configuration options
# @option config [Integer] n N-gram size (default: 3)
# @option config [Float] min_similarity Minimum similarity threshold (0-1)
# @option config [Float] min_typo_similarity Minimum typo-correction similarity (read by #generate)
# @option config [Integer] max_results Maximum results to return (presumably applied by the base strategy)
def initialize(name: :ngram, **config)
  # Bare `super` forwards the current `name:` and `**config` as-is.
  super
end
25
+
26
# Generate suggestions based on n-gram similarity.
#
# Words shorter than the n-gram size produce no suggestions. Every other
# dictionary word of sufficient length is kept when its n-gram similarity
# reaches +min_similarity+ (default 0.3) and its typo-correction
# similarity reaches +min_typo_similarity+ (default 0.70).
#
# @param context [Context] The suggestion context
# @return [SuggestionSet] Suggestions with high n-gram similarity
def generate(context)
  word = context.word
  n = get_config(:n, 3)
  min_sim = get_config(:min_similarity, 0.3)
  min_typo_similarity = get_config(:min_typo_similarity, 0.70)

  return create_suggestion_set([]) if word.length < n

  word_ngrams = extract_ngrams(word, n)
  results = {}

  dictionary_words(context).each do |dict_word|
    next if dict_word == word || dict_word.length < n

    similarity = ngram_similarity(word_ngrams, dict_word, n)
    next if similarity < min_sim

    typo_sim = calculate_ngram_similarity(word, dict_word)
    next if typo_sim < min_typo_similarity

    # Map similarity onto a 1..10 distance scale (higher similarity =
    # lower distance). The floor of 1 keeps candidates with similarity
    # above 0.9: they would otherwise truncate to 0 and the closest
    # matches would be thrown away.
    dist = [((1 - similarity) * 10).to_i, 1].max

    results[dict_word] = dist if !results.key?(dict_word) || dist < results[dict_word]
  end

  # Closest candidates first.
  sorted_words = results.sort_by { |_, dist| dist }.map(&:first)
  create_suggestion_set(sorted_words, distances: results, original_word: word)
end
68
+
69
# Decide whether this strategy applies to the given context.
#
# A disabled strategy never applies; an enabled one applies exactly when
# the dictionary lookup for the context's word comes back falsy.
#
# @param context [Context] The suggestion context
# @return [Boolean] True if the word needs correction
def handles?(context)
  enabled? ? !dictionary_lookup(context, context.word) : false
end
78
+
79
+ private
80
+
81
# Count the n-grams of a word.
#
# Slides a window of +n+ characters over the word and tallies each slice.
# A word shorter than +n+ yields an empty hash. The returned hash has a
# default of 0, so looking up an absent n-gram returns 0 rather than nil.
#
# @param word [String] The word
# @param n [Integer] N-gram size
# @return [Hash] N-gram to count mapping
def extract_ngrams(word, n)
  window_count = word.length - n + 1

  window_count.times.each_with_object(Hash.new(0)) do |start, counts|
    counts[word[start...start + n]] += 1
  end
end
96
+
97
# Score how similar two words are by their n-grams.
#
# Computes a Jaccard-style coefficient over n-gram counts:
#   similarity = |intersection| / |union|
# where the intersection adds up the smaller count of each n-gram and the
# union adds up the larger count.
#
# @param word_ngrams [Hash] N-grams for the first word
# @param other_word [String] The second word
# @param n [Integer] N-gram size
# @return [Float] Similarity score (0-1)
def ngram_similarity(word_ngrams, other_word, n)
  other_ngrams = extract_ngrams(other_word, n)

  # Shared mass: the smaller of the two counts for every n-gram of the first word.
  shared = word_ngrams.sum do |ngram, count|
    other_count = other_ngrams[ngram]
    other_count ? [count, other_count].min : 0
  end

  # Total mass: the larger of the two counts for every n-gram seen in either word.
  total = (word_ngrams.keys | other_ngrams.keys).sum do |ngram|
    [word_ngrams[ngram] || 0, other_ngrams[ngram] || 0].max
  end

  total.zero? ? 0.0 : shared.to_f / total
end
127
+ end
128
+ end
129
+ end
130
+ end
@@ -0,0 +1,329 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Suggestions
5
+ module Strategies
6
+ # Phonetic suggestion strategy.
7
+ #
8
+ # Generates suggestions by finding words with similar phonetic codes
9
+ # using algorithms like Soundex and Metaphone.
10
+ #
11
+ # @example Creating a phonetic strategy
12
+ # strategy = PhoneticStrategy.new(algorithm: :soundex)
13
+ # result = strategy.generate(context)
14
+ class PhoneticStrategy < BaseStrategy
15
# Phonetic algorithms this strategy knows how to run.
ALGORITHMS = [:soundex, :metaphone].freeze
17
+
18
# Build a phonetic strategy.
#
# The name and every option are handed to the base strategy unchanged.
#
# @param name [String, Symbol] Name of the strategy
# @param config [Hash] Configuration options
# @option config [Symbol] algorithm The algorithm to use (:soundex or :metaphone)
# @option config [Integer] max_results Maximum results to return (presumably applied by the base strategy)
def initialize(name: :phonetic, **config)
  # Bare `super` forwards the current `name:` and `**config` as-is.
  super
end
27
+
28
# Generate suggestions based on phonetic similarity.
#
# Keeps dictionary words whose phonetic code equals the input word's code
# and whose edit distance from it is between 1 and +max_distance+
# (default 2). The algorithm is taken from the +algorithm+ option
# (default :soundex).
#
# @param context [Context] The suggestion context
# @return [SuggestionSet] Suggestions with same phonetic code
def generate(context)
  word = context.word
  algorithm = get_config(:algorithm, :soundex)
  max_dist = get_config(:max_distance, 2)

  word_code = phonetic_code(word, algorithm)

  # Keyed by word so a dictionary entry that appears twice is suggested once.
  distances = {}
  dictionary_words(context).each do |dict_word|
    next if dict_word == word
    next unless phonetic_code(dict_word, algorithm) == word_code

    dist = edit_distance(word, dict_word)
    next if dist > max_dist || dist.zero?

    distances[dict_word] = dist
  end

  # Closest candidates first; distances and the original word are passed
  # along in the same way the other strategies in this file do.
  sorted_words = distances.sort_by { |_, dist| dist }.map(&:first)
  create_suggestion_set(sorted_words, distances: distances, original_word: word)
end
60
+
61
# Decide whether this strategy applies to the given context.
#
# A disabled strategy never applies; an enabled one applies exactly when
# the dictionary lookup for the context's word comes back falsy.
#
# @param context [Context] The suggestion context
# @return [Boolean] True if the word needs correction
def handles?(context)
  enabled? ? !dictionary_lookup(context, context.word) : false
end
70
+
71
+ private
72
+
73
# Compute the phonetic code of a word with the chosen algorithm.
#
# :metaphone selects Metaphone; :soundex and any other value select
# Soundex.
#
# @param word [String] The word
# @param algorithm [Symbol] The algorithm to use
# @return [String] The phonetic code
def phonetic_code(word, algorithm = :soundex)
  return metaphone_code(word) if algorithm == :metaphone

  soundex_code(word)
end
88
+
89
# Calculate Soundex code for a word.
#
# Soundex is a phonetic algorithm developed by Robert C. Russell
# and Margaret King Odell in the early 1900s.
#
# Rules applied after stripping everything but A-Z:
# * the first letter is kept as-is;
# * following consonants are replaced by their digit;
# * a consonant with the same digit as the previous coded letter is
#   skipped, also when only H or W stands between them;
# * a vowel (A, E, I, O, U, Y) between two such consonants ends the run,
#   so the second consonant is coded again;
# * the result is padded with zeros / cut to four characters.
#
# @param word [String] The word
# @return [String] The Soundex code (letter + 3 digits), or "" when the
#   word is nil or contains no A-Z letters
#
# @example
#   soundex_code("Robert")   # => "R163"
#   soundex_code("Rupert")   # => "R163"
#   soundex_code("Ashcraft") # => "A261"
#   soundex_code("Tymczak")  # => "T522"
def soundex_code(word)
  return "" if word.nil? || word.empty?

  letters = word.upcase.gsub(/[^A-Z]/, "")
  return "" if letters.empty?

  code = letters[0]
  prev_digit = soundex_encode(letters[0])

  letters[1..].each_char do |char|
    break if code.length >= 4

    digit = soundex_encode(char)

    if digit == "0"
      # H and W are transparent; any other uncoded letter is a vowel and
      # separates equal digits on either side of it.
      prev_digit = "0" unless %w[H W].include?(char)
      next
    end

    code += digit unless digit == prev_digit
    prev_digit = digit
  end

  # Pad with zeros if needed
  code.ljust(4, "0")[0...4]
end

# Soundex encoding table.
#
# @param char [String] The character
# @return [String] The encoded digit or "0" for no code
def soundex_encode(char)
  case char.upcase
  when "B", "P", "F", "V"
    "1"
  when "C", "S", "K", "G", "J", "Q", "X", "Z"
    "2"
  when "D", "T"
    "3"
  when "L"
    "4"
  when "M", "N"
    "5"
  when "R"
    "6"
  else
    "0"
  end
end
154
+
155
# Calculate Metaphone code for a word.
#
# Metaphone is an improved phonetic algorithm developed by
# Lawrence Philips in 1990. This is a simplified rule set: the word is
# upcased, stripped to A-Z, and scanned left to right until four code
# characters have been produced.
#
# @param word [String] The word
# @return [String] The Metaphone code (at most 4 characters), or "" when
#   the word is nil or contains no A-Z letters
#
# @example
#   metaphone_code("Schmidt") # => "SKMT"
#   metaphone_code("Smith")   # => "SM0"
#   metaphone_code("Knight")  # => "NT"
def metaphone_code(word)
  return "" if word.nil? || word.empty?

  letters = word.upcase.gsub(/[^A-Z]/, "")
  return "" if letters.empty?

  vowels = %w[A E I O U]
  code = +""
  length = letters.length
  i = 0

  while i < length && code.length < 4
    char = letters[i]
    next_char = letters[i + 1] || ""
    after_next = letters[i + 2] # nil past the end of the word

    case char
    when "A", "E", "I", "O", "U", "W", "Y"
      # Vowels and the semi-vowels W/Y are only encoded at the beginning
      code << char if i.zero?
    when "B", "F", "J", "L", "M", "N", "R"
      code << char
    when "C"
      if next_char == "H" && vowels.include?(after_next)
        # "CH" followed by vowel => "X"
        code << "X"
        i += 1
      elsif next_char == "I" && after_next == "A"
        # "CIA" => "X"
        code << "X"
        i += 2
      elsif %w[S G].include?(next_char)
        # "CS", "CG" => "X"
        code << "X"
        i += 1
      else
        code << "K"
      end
    when "D"
      if next_char == "G" && %w[I E Y].include?(after_next)
        # "DG" followed by I, E, Y => "J"
        code << "J"
        i += 1
      else
        code << "T"
      end
    when "G"
      if next_char == "H"
        # "GH" => "K" at the beginning, silent elsewhere
        if i.zero?
          code << "K"
          i += 1
        end
      elsif next_char != "N"
        code << "K"
      end
      # "GN" (including "GNED"): the G is silent and nothing is skipped,
      # so the N is encoded on the next pass.
    when "H"
      # H is silent unless at beginning
      code << "H" if i.zero?
    when "K"
      # "KN" => "N": the K is silent and the N is encoded on the next pass
      code << "K" unless next_char == "N"
    when "P"
      if next_char == "H"
        # "PH" => "F"
        code << "F"
        i += 1
      else
        code << "P"
      end
    when "Q"
      code << "K"
    when "S"
      if next_char == "H"
        # "SH" => "X"
        code << "X"
        i += 1
      elsif next_char == "I" && %w[O A].include?(after_next)
        # "SIO" or "SIA" => "X"
        code << "X"
        i += 2
      else
        code << "S"
      end
    when "T"
      if next_char == "I" && %w[O A].include?(after_next)
        # "TIO" or "TIA" => "X"
        code << "X"
        i += 2
      elsif next_char == "H"
        # "TH" => "0"
        code << "0"
        i += 1
      else
        code << "T"
      end
    when "V"
      code << "F"
    when "X"
      code << "KS"
    when "Z"
      code << "S"
    end

    i += 1
  end

  code[0...4] # Max 4 characters ("KS" can overshoot)
end
295
+
296
# Levenshtein edit distance between two strings.
#
# Insertions, deletions and substitutions each cost 1. The table is built
# one row at a time, with the shorter string laid out along the row.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Integer] Edit distance
def edit_distance(str1, str2)
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  # Keep the shorter string on the inner axis so rows stay small.
  shorter, longer = str1.length > str2.length ? [str2, str1] : [str1, str2]
  row_chars = shorter.chars
  first_row = (0..row_chars.length).to_a

  final_row = longer.each_char.each_with_index.inject(first_row) do |above, (outer_char, outer_index)|
    row_chars.each_with_index.inject([outer_index + 1]) do |row, (inner_char, col)|
      mismatch = inner_char == outer_char ? 0 : 1
      row << [row[col] + 1, above[col + 1] + 1, above[col] + mismatch].min
    end
  end

  final_row.last
end
326
+ end
327
+ end
328
+ end
329
+ end