kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,295 @@
1
+ # frozen_string_literal: true
2
+
3
require 'digest'

require_relative '../models/embedding_model'
require_relative '../models/semantic_error'
require_relative '../models/context'
require_relative '../documents/document'
7
+
8
module Kotoshu
  module Analyzers
    # Unified semantic error analyzer.
    #
    # Uses word embeddings for context-aware error detection and suggestions.
    # Provides unified semantic analysis without artificial spelling/grammar split.
    #
    # @example Analyzing a document
    #   model = FastTextModel.from_github('en')
    #   analyzer = SemanticAnalyzer.new(model)
    #   errors = analyzer.analyze(document)
    #
    # @example Checking a single word
    #   suggestions = analyzer.suggest_corrections('helo', context: context)
    class SemanticAnalyzer
      # Similarity threshold for high-confidence suggestions
      HIGH_CONFIDENCE_THRESHOLD = 0.85

      # Similarity threshold for medium-confidence suggestions
      MEDIUM_CONFIDENCE_THRESHOLD = 0.70

      # Minimum similarity for suggestions
      MIN_SIMILARITY = 0.50

      # Default number of suggestions to generate
      DEFAULT_MAX_SUGGESTIONS = 5

      # A word is a run of Unicode letters, optionally joined by an
      # apostrophe (straight or curly) or a hyphen: "don't", "über-cool".
      WORD_PATTERN = /\p{L}+(?:['’-]\p{L}+)*/.freeze

      # Whole-string match for purely numeric tokens.
      NUMERIC_PATTERN = /\A\d+\z/.freeze

      # Multi-letter transliterations applied before generic accent stripping.
      GERMAN_TRANSLITERATIONS = {
        'ä' => 'ae',
        'ö' => 'oe',
        'ü' => 'ue',
        'ß' => 'ss'
      }.freeze

      private_constant :WORD_PATTERN, :NUMERIC_PATTERN, :GERMAN_TRANSLITERATIONS

      attr_reader :model, :max_suggestions

      # Create a new semantic analyzer.
      #
      # @param model [EmbeddingModel] The embedding model to use
      # @param max_suggestions [Integer] Maximum suggestions per error
      # @param min_similarity [Float] Minimum similarity threshold
      # @raise [ArgumentError] if +model+ is not a Models::EmbeddingModel
      def initialize(model, max_suggestions: DEFAULT_MAX_SUGGESTIONS, min_similarity: MIN_SIMILARITY)
        raise ArgumentError, "Model must be an EmbeddingModel" unless model.is_a?(Models::EmbeddingModel)

        @model = model
        @max_suggestions = max_suggestions
        @min_similarity = min_similarity
      end

      # Analyze a document for semantic errors.
      #
      # Every text node is tokenized and each word that fails {#valid_word?}
      # is turned into an error. All errors of a node share that node's
      # location.
      #
      # @param document [Document] The document to analyze
      # @return [Array<Models::SemanticError>] Errors found, in the order
      #   produced by Array#sort (i.e. by the errors' own <=>)
      def analyze(document)
        errors = document.text_nodes.flat_map do |text_node|
          misspelled = tokenize_words(text_node.text).reject { |word| valid_word?(word) }

          misspelled.map do |word|
            detect_error(
              word: word,
              location: text_node.location,
              context: document.context_for(text_node.location)
            )
          end
        end

        errors.compact.sort
      end

      # Detect semantic error for a single word.
      #
      # @param word [String] The word to check
      # @param location [Location] Error location
      # @param context [Models::Context, nil] Context around the word
      # @return [Models::SemanticError, nil] Error object or nil if valid
      def detect_error(word:, location:, context: nil)
        return nil if valid_word?(word)

        suggestions = suggest_corrections(word, context: context)

        Models::SemanticError.new(
          id: generate_error_id(word, location),
          location: location,
          original: word,
          suggestions: suggestions,
          error_type: classify_error(word, suggestions, context),
          confidence: calculate_confidence(suggestions),
          context: context
        )
      end

      # Suggest corrections for a word.
      #
      # Asks the model for three times as many neighbours as will be
      # returned, so that similarity filtering and context re-ranking still
      # leave enough candidates.
      #
      # @param word [String] The misspelled word
      # @param context [Models::Context, nil] Context for context-aware suggestions
      # @return [Array<Models::Suggestion>] Suggested corrections
      def suggest_corrections(word, context: nil)
        return [] if word.nil? || word.empty?

        candidates = @model.nearest_neighbors(word, k: @max_suggestions * 3)
        candidates = candidates.select { |neighbor| neighbor.similarity >= @min_similarity }

        # nil does not respond to surrounding_words, so this also covers
        # the "no context given" case.
        candidates = rank_by_context(candidates, context) if context.respond_to?(:surrounding_words)

        candidates.first(@max_suggestions).map do |neighbor|
          Models::Suggestion.new(
            word: neighbor.word,
            confidence: neighbor.similarity,
            source: :semantic,
            metadata: {
              distance: neighbor.distance,
              similarity: neighbor.similarity
            }
          )
        end
      end

      # Check if a word is valid (exists in vocabulary).
      #
      # Purely numeric tokens and single characters are always accepted.
      #
      # @param word [String] The word to check
      # @return [Boolean] True if word is valid
      def valid_word?(word)
        return false if word.nil? || word.empty?

        # Skip numbers; \A/\z anchor the whole string, not a single line.
        return true if word.match?(NUMERIC_PATTERN)

        # Skip single characters (likely abbreviations)
        return true if word.length == 1

        @model.has_word?(word)
      end

      # Calculate confidence score for suggestions.
      #
      # Only the first suggestion is considered.
      #
      # @param suggestions [Array<Models::Suggestion>, nil] List of suggestions
      # @return [Float] 0.0 without suggestions, otherwise 1.0, 0.7 or 0.5
      def calculate_confidence(suggestions)
        return 0.0 unless suggestions&.any?

        top_confidence = suggestions.first.confidence

        if top_confidence > HIGH_CONFIDENCE_THRESHOLD
          1.0
        elsif top_confidence > MEDIUM_CONFIDENCE_THRESHOLD
          0.7
        else
          0.5
        end
      end

      private

      # Tokenize text into lower-cased words.
      #
      # Letters are matched with \p{L}, so accented and non-Latin words stay
      # in one piece instead of being split at the first non-ASCII letter.
      #
      # @param text [String, nil] Text to tokenize
      # @return [Array<String>] Words
      def tokenize_words(text)
        return [] unless text

        text.downcase.scan(WORD_PATTERN)
      end

      # Classify error type based on word and suggestions.
      #
      # @param word [String] The error word
      # @param suggestions [Array<Models::Suggestion>, nil] Suggestions
      # @param _context [Models::Context, nil] Context (currently unused)
      # @return [Symbol] :capitalization, :orthographic or :word_choice
      def classify_error(word, suggestions, _context)
        return :orthographic if suggestions.nil? || suggestions.empty?

        top_suggestion = suggestions.first

        # Same letters, different case
        return :capitalization if word.downcase == top_suggestion.word.downcase

        # Same letters once diacritics are removed
        return :orthographic if similar_without_diacritics?(word, top_suggestion.word)

        # Different word proposed by the embedding model
        return :word_choice if top_suggestion.source == :semantic

        :orthographic
      end

      # Check if two words are similar ignoring diacritics.
      #
      # @param word1 [String] First word
      # @param word2 [String] Second word
      # @return [Boolean] True if similar without diacritics
      def similar_without_diacritics?(word1, word2)
        normalize_diacritics(word1) == normalize_diacritics(word2)
      end

      # Normalize diacritics from a word.
      #
      # The word is composed (NFC) and lower-cased, the German letters
      # ä/ö/ü/ß are transliterated to ae/oe/ue/ss, and every remaining
      # accent is removed by decomposing (NFD) and dropping the combining
      # marks. Letters with no decomposition are left untouched.
      #
      # @param word [String] Word with diacritics
      # @return [String] Lower-cased word without diacritics
      def normalize_diacritics(word)
        composed = word.encode(Encoding::UTF_8).unicode_normalize(:nfc).downcase

        composed
          .gsub(/[äöüß]/, GERMAN_TRANSLITERATIONS)
          .unicode_normalize(:nfd)
          .gsub(/\p{Mn}/, '')
      end

      # Rank neighbors by contextual relevance.
      #
      # Each neighbor is copied with its similarity raised by
      # {#context_boost} (capped at 1.0); the copies are returned in
      # descending order of boosted similarity. The input objects are not
      # mutated.
      #
      # @param neighbors [Array<Models::NearestNeighbor>] Neighbors to rank
      # @param context [Models::Context] Context for ranking
      # @return [Array<Models::NearestNeighbor>] Ranked neighbors
      def rank_by_context(neighbors, context)
        surrounding = context.surrounding_words(3)
        return neighbors unless surrounding&.any?

        boosted = neighbors.map do |neighbor|
          Models::NearestNeighbor.new(
            word: neighbor.word,
            similarity: [neighbor.similarity + context_boost(neighbor.word, surrounding), 1.0].min,
            embedding: neighbor.embedding
          )
        end

        # Sort on the similarity itself rather than on NearestNeighbor#<=>.
        boosted.sort_by { |neighbor| -neighbor.similarity }
      end

      # Calculate context boost for a word.
      #
      # Each surrounding word contributes 0.02 times its similarity to
      # +word+; a nil similarity counts as 0.0. The total is not clamped
      # here.
      #
      # @param word [String] Word to boost
      # @param surrounding [Array<String>, nil] Surrounding words
      # @return [Float] Boost amount
      def context_boost(word, surrounding)
        return 0.0 unless surrounding&.any?

        surrounding.sum(0.0) do |surrounding_word|
          (@model.similarity(word, surrounding_word) || 0.0) * 0.02
        end
      end

      # Generate an error ID from the word and its location.
      #
      # The ID is the first 16 hex characters of the SHA-256 digest of
      # "word-location", so the same word at the same location always
      # yields the same ID.
      #
      # @param word [String] The error word
      # @param location [Location] Error location
      # @return [String] 16-character hexadecimal ID
      def generate_error_id(word, location)
        Digest::SHA256.hexdigest("#{word}-#{location}")[0...16]
      end
    end
  end
end