kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,316 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'base_strategy'
4
+ require_relative '../suggestion'
5
+ require_relative '../suggestion_set'
6
+ require_relative '../../embeddings'
7
+
8
+ module Kotoshu
9
+ module Suggestions
10
+ module Strategies
11
+ # Semantic strategy using FastText ONNX embeddings.
12
+ #
13
+ # Provides embedding-based spell correction for:
14
+ # - Typos: Re-ranks edit-distance candidates by semantic similarity
15
+ # - Real-word errors: Detects when valid words are used incorrectly in context
16
+ #
17
+ # This strategy works alongside other strategies (EditDistance, Phonetic, etc.)
18
+ # to provide comprehensive spell checking with semantic awareness.
19
+ #
20
+ # @example Basic usage
21
+ # strategy = SemanticStrategy.new(language_code: 'en')
22
+ # suggestions = strategy.generate(context)
23
+ #
24
+ # @example With preloaded embeddings (faster)
25
+ # strategy = SemanticStrategy.new(
26
+ # language_code: 'en',
27
+ # preload_embeddings: true
28
+ # )
29
+ # suggestions = strategy.generate(context)
30
+ class SemanticStrategy < BaseStrategy
31
+ # @return [String] Language code (ISO 639-1)
32
+ attr_reader :language_code
33
+
34
+ # @return [Embeddings::Vocabulary] The vocabulary
35
+ attr_reader :vocabulary
36
+
37
+ # @return [Embeddings::OnnxRuntimeModel] The ONNX model
38
+ attr_reader :model
39
+
40
+ # @return [Embeddings::SimilaritySearch] The similarity search
41
+ attr_reader :search
42
+
43
+ # Create a new semantic strategy.
44
+ #
45
+ # @param language_code [String] ISO 639-1 language code
46
+ # @param cache [Cache::ModelCache, nil] Optional cache instance
47
+ # @param preload_embeddings [Boolean] Whether to preload embeddings
48
+ # @param max_context_window [Integer] Words to consider for context
49
+ # @param min_semantic_similarity [Float] Minimum similarity for semantic suggestions
50
+ # @param semantic_boost_weight [Float] Weight for semantic similarity in re-ranking
51
+ # @param config [Hash] Additional configuration
52
def initialize(language_code:, cache: nil, preload_embeddings: false,
               max_context_window: 5, min_semantic_similarity: 0.5,
               semantic_boost_weight: 0.3, **config)
  # The strategy always registers under the :semantic name; any extra
  # options are handed to the base strategy unchanged.
  super(name: :semantic, **config)

  # Tuning values are stored first; the language code must be in place
  # before the embeddings are resolved because the lookup reads it.
  @semantic_boost_weight = semantic_boost_weight
  @min_semantic_similarity = min_semantic_similarity
  @max_context_window = max_context_window
  @language_code = language_code

  # Populates @search, @vocabulary and @model (all nil when loading fails).
  initialize_embeddings(cache, preload_embeddings)
end
64
+
65
+ # Generate suggestions using semantic similarity.
66
+ #
67
+ # Handles two cases:
68
+ # 1. Word not in vocabulary (typo): Re-ranks edit-distance candidates
69
+ # 2. Word in vocabulary (real-word error): Finds semantically similar alternatives
70
+ #
71
+ # @param context [Context] The suggestion context
72
+ # @return [SuggestionSet] Generated suggestions
73
def generate(context)
  word = context.word

  # Without a loaded similarity search there is nothing to rank against.
  return SuggestionSet.empty unless @search

  # The previous `max_results = context.max_results || max_results` local
  # was dropped: it shadowed itself (the right-hand side read the new,
  # still-nil local) and its value was never used here. The helpers below
  # resolve their own limit from the context.
  if @vocabulary.include?(word)
    # Known word: look for alternatives that may fit better (real-word error).
    generate_for_real_word_error(context)
  else
    # Unknown word: treat it as a typo.
    generate_for_typo(context)
  end
end
89
+
90
+ # Check if this strategy should handle the context.
91
+ #
92
+ # Semantic strategy handles:
93
+ # - Words not in vocabulary (for typo re-ranking)
94
+ # - Words in vocabulary (for real-word error detection)
95
+ #
96
+ # @param context [Context] The suggestion context
97
+ # @return [Boolean] True if the strategy should handle this context
98
def handles?(context)
  # Every word is accepted as long as the strategy is enabled and both the
  # similarity search and the vocabulary are available; the typo versus
  # real-word split happens later in #generate.
  ready = enabled? && @search && @vocabulary
  ready ? true : false
end
105
+
106
+ # Get embedding for a word.
107
+ #
108
+ # @param word [String] The word
109
+ # @return [Array<Float>, nil] Embedding vector or nil if not found
110
def embedding_for(word)
  # The lookup goes through #send, presumably because get_embedding is not
  # part of the search object's public interface (confirm against
  # Embeddings::SimilaritySearch).
  @search ? @search.send(:get_embedding, word) : nil
end
115
+
116
+ # Compute semantic similarity between two words.
117
+ #
118
+ # @param word1 [String] First word
119
+ # @param word2 [String] Second word
120
+ # @return [Float, nil] Cosine similarity or nil if either word not found
121
def semantic_similarity(word1, word2)
  # Falls through to nil when no similarity search has been loaded.
  if @search
    @search.similarity(word1, word2)
  end
end
126
+
127
+ # Find semantically similar words.
128
+ #
129
+ # @param word [String] The query word
130
+ # @param k [Integer] Number of neighbors
131
+ # @return [Array<Hash>] Array of {word, similarity} hashes
132
def find_similar_words(word, k: 10)
  # The query word itself is not excluded from the neighbour list here.
  @search ? @search.find_nearest(word, k: k, exclude_self: false) : []
end
137
+
138
+ # String representation.
139
+ #
140
+ # @return [String] String representation
141
def to_s
  vocab_size = @vocabulary&.size || 0
  # Coerce to a real boolean: `@search && true` evaluates to nil when the
  # search is missing, and nil interpolates as an empty string, which used
  # to print "loaded: " with no value.
  loaded = @search ? true : false
  "SemanticStrategy(language: #{@language_code}, vocab_size: #{vocab_size}, loaded: #{loaded})"
end
144
+ alias inspect to_s
145
+
146
+ private
147
+
148
+ # Initialize embedding components.
149
+ #
150
+ # @param cache [Cache::ModelCache, nil] Cache instance
151
+ # @param preload [Boolean] Whether to preload embeddings
152
def initialize_embeddings(cache, preload)
  # Ask the similarity search to build itself from the cache for this language.
  @search = Embeddings::SimilaritySearch.from_cache(@language_code, cache: cache, preload: preload)

  unless @search
    # Nothing could be loaded: clear the derived references and, in verbose
    # mode only, tell the user the strategy is effectively off.
    @vocabulary = nil
    @model = nil
    return unless $VERBOSE

    return warn("Warning: Could not load ONNX model for language '#{@language_code}'. Semantic strategy will be disabled.")
  end

  # Vocabulary and model are taken from the loaded search object.
  @vocabulary = @search.vocabulary
  @model = @search.model
end
171
+
172
+ # Generate suggestions for a typo (word not in vocabulary).
173
+ #
174
+ # Uses semantic similarity to re-rank candidates from other strategies.
175
+ #
176
+ # @param context [Context] The suggestion context
177
+ # @return [SuggestionSet] Re-ranked suggestions
178
def generate_for_typo(context)
  # The limit is deliberately NOT stored in a local called `max_results`:
  # `max_results = context.max_results || max_results` makes the right-hand
  # side read the new (nil) local, so a context without a limit produced
  # nil and `nil * 2` raised NoMethodError. With a different local name the
  # bare `max_results` is a method call on the strategy.
  # NOTE(review): assumes the base strategy exposes a `max_results` reader,
  # as the original fallback expression intended - confirm.
  limit = context.max_results || max_results

  # Over-fetch (2x) so there is room for filtering before the set is capped.
  neighbors = @search.find_nearest(
    context.word,
    k: limit * 2,
    exclude_self: true,
    min_similarity: @min_semantic_similarity
  )
  return SuggestionSet.empty if neighbors.empty?

  suggestions = neighbors.map do |neighbor|
    similarity = neighbor[:similarity]

    # Confidence and pseudo-distance are both derived from the similarity:
    # the more similar the neighbour, the higher the confidence and the
    # lower the distance.
    create_suggestion(
      neighbor[:word],
      distance: similarity_to_distance(similarity),
      confidence: normalize_similarity(similarity),
      semantic_similarity: similarity
    )
  end

  SuggestionSet.new(suggestions, max_size: limit)
end
214
+
215
+ # Generate suggestions for a real-word error.
216
+ #
217
+ # Finds semantically similar words that might be correct in context.
218
+ #
219
+ # @param context [Context] The suggestion context
220
+ # @return [SuggestionSet] Alternative suggestions
221
def generate_for_real_word_error(context)
  # A local named `max_results` would shadow the strategy-level fallback
  # (`x = y || x` reads the new nil local), which made `nil * 3` raise when
  # the context carried no limit; hence the different local name.
  # NOTE(review): assumes the base strategy exposes a `max_results` reader,
  # as the original fallback expression intended - confirm.
  limit = context.max_results || max_results

  context_words = get_context_words(context)

  # Over-fetch (3x) so re-ranking has enough candidates to choose from.
  neighbors = @search.find_nearest(
    context.word,
    k: limit * 3,
    exclude_self: true,
    min_similarity: @min_semantic_similarity
  )
  return SuggestionSet.empty if neighbors.empty?

  scored = neighbors.map do |neighbor|
    candidate_word = neighbor[:word]
    similarity = neighbor[:similarity]
    context_score = compute_context_fit(candidate_word, context_words)

    # Blend: 70% semantic similarity to the original word, 30% context fit.
    combined_score = (similarity * 0.7) + (context_score * 0.3)

    suggestion = create_suggestion(
      candidate_word,
      distance: similarity_to_distance(combined_score),
      confidence: normalize_similarity(combined_score),
      semantic_similarity: similarity,
      context_score: context_score
    )
    [combined_score, suggestion]
  end

  # Rank by the combined score, best first. Ordering used to key on the
  # context score alone, which ignored semantic similarity entirely and was
  # the same value for every candidate whenever no context words existed.
  ranked = scored.sort_by { |combined_score, _| -combined_score }.map(&:last)
  SuggestionSet.new(ranked, max_size: limit)
end
264
+
265
+ # Get context words for semantic analysis.
266
+ #
267
+ # @param context [Context] The suggestion context
268
+ # @return [Array<String>] Context words
269
def get_context_words(context)
  # Placeholder: the surrounding text is not consulted yet, so the list of
  # context words is always empty regardless of the context passed in.
  Array.new
end
274
+
275
+ # Compute how well a word fits in context.
276
+ #
277
+ # @param candidate [String] Candidate word
278
+ # @param context_words [Array<String>] Context words
279
+ # @return [Float] Context fit score (0.0 to 1.0)
280
def compute_context_fit(candidate, context_words)
  # Collect the similarity of the candidate to each context word, skipping
  # pairs for which the search reports nil.
  scores = []
  context_words.each do |ctx_word|
    score = @search.similarity(candidate, ctx_word)
    scores << score unless score.nil?
  end

  # 0.5 is the neutral score used when there is nothing to average.
  scores.empty? ? 0.5 : scores.sum / scores.size
end
292
+
293
+ # Normalize similarity to confidence (0.0 to 1.0).
294
+ #
295
+ # @param similarity [Float] Cosine similarity (-1.0 to 1.0)
296
+ # @return [Float] Normalized confidence (0.0 to 1.0)
297
def normalize_similarity(similarity)
  # Shift [-1, 1] up to [0, 2], halve it, then pin anything outside [0, 1].
  halved = (similarity + 1) / 2.0
  halved.clamp(0.0, 1.0)
end
301
+
302
+ # Convert similarity to "distance" for ranking.
303
+ #
304
+ # @param similarity [Float] Cosine similarity (-1.0 to 1.0)
305
+ # @return [Integer] Pseudo-distance (lower = better)
306
def similarity_to_distance(similarity)
  # 1.0 -> 0, 0.0 -> 2, -1.0 -> 4; the fractional part is truncated and the
  # result is pinned to the 0..5 range.
  scaled = (1.0 - similarity) * 2
  scaled.to_i.clamp(0, 5)
end
313
+ end
314
+ end
315
+ end
316
+ end
@@ -0,0 +1,275 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Suggestions
5
+ module Strategies
6
+ # SymSpell suggestion strategy.
7
+ #
8
+ # Uses deletion distance algorithm for fast approximate string matching.
9
+ # Pre-computes deletion variants for all dictionary words, enabling O(1)
10
+ # lookup for common misspellings.
11
+ #
12
+ # This is 10-100x faster than EditDistanceStrategy for large dictionaries.
13
+ #
14
+ # The algorithm works by:
15
+ # 1. Pre-computing single deletion variants for each dictionary word
16
+ # 2. Looking up input word's deletion variants in the pre-computed map
17
+ # 3. Distance is inferred from the deletion level
18
+ #
19
+ # @see https://github.com/wolfgarbe/SymSpell Original SymSpell implementation
20
+ class SymSpellStrategy < BaseStrategy
21
+ # Maximum deletion distance to consider
22
+ DEFAULT_MAX_DELETION_DISTANCE = 2
23
+ # Maximum dictionary words to process (increased for better coverage)
24
+ DEFAULT_MAX_DICTIONARY_SIZE = 500_000
25
+ # Enable transposition handling (slower pre-computation, better accuracy)
26
+ DEFAULT_HANDLE_TRANSPOSITIONS = true
27
+
28
+ # Create a new SymSpell strategy.
29
+ #
30
+ # @param dictionary [Object] Dictionary to use for suggestions
31
+ # @param name [String, Symbol] Strategy name
32
+ # @param config [Hash] Configuration options
33
+ # @option config [Integer] max_deletion_distance Maximum deletion distance (default: 2)
34
+ # @option config [Integer] max_results Maximum results to return (default: 10)
35
+ # @option config [Integer] max_dictionary_size Maximum words to process (default: 500_000)
36
+ # @option config [Boolean] handle_transpositions Generate transposition variants (default: true)
37
def initialize(dictionary:, name: :symspell, **config)
  super(name: name, **config)

  @dictionary = dictionary

  # Each option falls back to the class-level default when not supplied.
  @handle_transpositions = config.fetch(:handle_transpositions, DEFAULT_HANDLE_TRANSPOSITIONS)
  @max_dictionary_size = config.fetch(:max_dictionary_size, DEFAULT_MAX_DICTIONARY_SIZE)
  @max_deletion_distance = config.fetch(:max_deletion_distance, DEFAULT_MAX_DELETION_DISTANCE)

  # @words holds the lower-cased dictionary words; @deletes maps a variant
  # string to the dictionary words it was derived from.
  @words = Set.new
  @deletes = Hash.new { |index, variant| index[variant] = [] }

  # Build the index eagerly so lookups are ready right after construction.
  precompute!
end
47
+
48
+ # Generate suggestions using deletion distance.
49
+ #
50
+ # @param context [Context] The suggestion context
51
+ # @return [SuggestionSet] Generated suggestions
52
def generate(context)
  word = context.word
  max_dist = get_config(:max_deletion_distance, @max_deletion_distance)

  # Matching is case-insensitive: the index only stores lower-cased words.
  word_lower = word.downcase

  # A word that is already in the dictionary needs no suggestions.
  return SuggestionSet.empty if @words.include?(word_lower)

  # @deletes is built with an inserting default block, so reading a missing
  # key through [] would add an empty list to the index on every probe and
  # grow it with each query. Hash#fetch with a default leaves it untouched.
  no_matches = [].freeze
  indexed = ->(variant) { @deletes.fetch(variant, no_matches) }

  candidates = {} # dictionary word -> inferred distance
  checked = Set.new([word_lower])

  # The input itself is a stored variant (one deletion or one adjacent swap
  # of a dictionary word): a single edit away.
  indexed.call(word_lower).each { |dict_word| candidates[dict_word] ||= 1 }

  if @handle_transpositions
    # One swap on the input plus the one edit stored in the index makes two
    # edits; anything reachable in one edit was already recorded above.
    generate_transpositions(word_lower).each do |transposed|
      indexed.call(transposed).each { |dict_word| candidates[dict_word] ||= 2 }
    end
  end

  # Delete characters from the input round by round; round `dist` yields
  # variants that are dist + 1 deletions away from the input.
  max_dist.times do |dist|
    generate_deletions_from_set(checked).each do |variant|
      next unless checked.add?(variant) # nil when the variant was seen before

      # A direct hit may undercut a distance recorded via the index, so it
      # is assigned unconditionally.
      candidates[variant] = dist + 1 if @words.include?(variant)

      # Deletions from the input plus the one edit stored in the index.
      indexed.call(variant).each { |dict_word| candidates[dict_word] ||= dist + 2 }
    end
  end

  sorted_words = candidates.sort_by { |_, dist| dist }.map(&:first)
  create_suggestion_set(sorted_words, distances: candidates, original_word: context.word)
end
103
+
104
+ # Pre-compute deletion variants for all dictionary words.
105
+ #
106
+ # This is called during initialization and builds the index.
107
def precompute!
  dictionary_words(@dictionary).first(@max_dictionary_size).each do |word|
    next if word.nil? || word.empty?

    word_lower = word.downcase

    # Set#add? returns nil when the word is already present. Skipping those
    # keeps case-duplicates ("The"/"the") and repeated calls from appending
    # the same word to the index lists again.
    next unless @words.add?(word_lower)

    # Only single deletions are stored; deeper deletions are produced from
    # the input at lookup time.
    variants = generate_single_deletions(word_lower)
    variants += generate_transpositions(word_lower) if @handle_transpositions

    # Words with repeated letters yield the same variant more than once
    # ("aab" -> "ab" twice); index each variant a single time.
    variants.uniq.each { |variant| @deletes[variant] << word_lower }
  end
end
130
+
131
+ # Generate all adjacent transposition variants of a word.
132
+ #
133
+ # For example, "world" → ["owrld", "wrold", "wolrd", "wordl"]
134
+ #
135
+ # @param word [String] The word
136
+ # @return [Array<String>] Array of variants with adjacent characters swapped
137
def generate_transpositions(word)
  # Walk every adjacent pair (the last character has no right neighbour)
  # and keep the swapped copy unless the swap left the word unchanged.
  (0...(word.length - 1)).each_with_object([]) do |idx, swapped|
    candidate = word.dup
    candidate[idx], candidate[idx + 1] = candidate[idx + 1], candidate[idx]
    swapped << candidate unless candidate == word
  end
end
148
+
149
+ # Calculate deletion distance between two words.
150
+ #
151
+ # For SymSpell, this is the minimum number of characters that must be
152
+ # deleted (from either word) to make them equal, derived from their LCS.
153
+ #
154
+ # @param str1 [String] First word
155
+ # @param str2 [String] Second word
156
+ # @return [Integer] Deletion distance
157
def deletion_distance(str1, str2)
  # Trivial cases: against an empty string every character must go.
  return str2.length if str1.empty?
  return str1.length if str2.empty?
  return 0 if str1 == str2

  # When one word is a subsequence of the other, only the longer one needs
  # deletions, and the count is simply the length difference.
  return str2.length - str1.length if is_subsequence?(str1, str2)
  return str1.length - str2.length if is_subsequence?(str2, str1)

  # General case: everything outside the longest common subsequence has to
  # be deleted from both words.
  shared = longest_common_subsequence_length(str1, str2)
  str1.length + str2.length - (2 * shared)
end
175
+
176
+ private
177
+
178
+ # Generate all single-deletion variants of a word.
179
+ #
180
+ # @param word [String] The word
181
+ # @return [Array<String>] Array of variants with one character deleted
182
def generate_single_deletions(word)
  # Drop the character at each position in turn; the empty string (from a
  # one-character word) is never reported as a variant.
  (0...word.length).each_with_object([]) do |idx, shortened|
    candidate = word[0...idx] + word[(idx + 1)..].to_s
    shortened << candidate unless candidate.empty? || candidate == word
  end
end
190
+
191
+ # Generate deletion variants from a set of words.
192
+ #
193
+ # @param words_set [Set<String>] Set of words to process
194
+ # @return [Set<String>] New set with all single deletions
195
def generate_deletions_from_set(words_set)
  # Union of the single-deletion variants of every word in the input set.
  words_set.each_with_object(Set.new) do |word, collected|
    collected.merge(generate_single_deletions(word))
  end
end
204
+
205
+ # Check if str1 is a subsequence of str2.
206
+ #
207
+ # @param str1 [String] Potential subsequence
208
+ # @param str2 [String] String to check against
209
+ # @return [Boolean] True if str1 is subsequence of str2
210
def is_subsequence?(str1, str2)
  return true if str1.empty?
  return false if str1.length > str2.length

  # Greedy scan: each character of str1 must be found in str2 at or after
  # the position where the previous one matched.
  cursor = 0
  str1.each_char do |needed|
    hit = str2.index(needed, cursor)
    return false if hit.nil?

    cursor = hit + 1
  end
  true
end
221
+
222
+ # Calculate the length of the longest common subsequence.
223
+ #
224
+ # Uses dynamic programming for efficiency.
225
+ #
226
+ # @param str1 [String] First string
227
+ # @param str2 [String] Second string
228
+ # @return [Integer] LCS length
229
def longest_common_subsequence_length(str1, str2)
  return 0 if str1.empty? || str2.empty?

  # The shorter string defines the row width, the longer one drives the
  # outer loop.
  shorter, longer = str1.length <= str2.length ? [str1, str2] : [str2, str1]
  columns = shorter.chars

  # Single DP row updated in place: before a cell is overwritten it still
  # holds the previous row's value ("above"); `diagonal` remembers the
  # previous row's value one column to the left.
  row = Array.new(columns.length + 1, 0)

  longer.each_char do |outer|
    diagonal = 0
    columns.each_with_index do |inner, col|
      above = row[col + 1]
      row[col + 1] = inner == outer ? diagonal + 1 : [row[col], above].max
      diagonal = above
    end
  end

  row.last
end
254
+
255
+ # Get all words from the dictionary.
256
+ #
257
+ # @param dictionary [Object] Dictionary object
258
+ # @return [Array<String>] All words
259
def dictionary_words(dictionary)
  # A #words reader takes priority over everything else.
  return dictionary.words if dictionary.respond_to?(:words)

  case dictionary
  when Array then dictionary       # already a word list
  when Hash then dictionary.keys   # words are the keys
  else
    # Last resort: an #all_words reader, otherwise nothing to index.
    dictionary.respond_to?(:all_words) ? dictionary.all_words : []
  end
end
272
+ end
273
+ end
274
+ end
275
+ end