kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,331 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'vocabulary'
4
+ require_relative 'onnx_runtime_model'
5
+
6
+ module Kotoshu
7
+ module Embeddings
8
+ # Similarity search for embedding-based nearest neighbor lookup.
9
+ #
10
+ # Efficiently finds semantically similar words using cosine similarity.
11
+ # Supports both on-the-fly computation and pre-computed embedding matrices.
12
+ #
13
+ # @example Basic usage
14
+ # search = SimilaritySearch.new(
15
+ # vocabulary: vocab,
16
+ # model: model
17
+ # )
18
+ # neighbors = search.find_nearest('hello', k: 10)
19
+ #
20
+ # @example With pre-loaded embedding matrix (faster)
21
+ # search = SimilaritySearch.new(
22
+ # vocabulary: vocab,
23
+ # model: model,
24
+ # preload_embeddings: true
25
+ # )
26
+ # neighbors = search.find_nearest('hello', k: 10)
27
class SimilaritySearch
  # @return [Vocabulary] The vocabulary
  attr_reader :vocabulary

  # @return [OnnxRuntimeModel] The ONNX model
  attr_reader :model

  # @return [Boolean] Whether the full embedding matrix is currently loaded
  attr_reader :embeddings_loaded

  # Build a search instance from cached vocabulary and model files.
  #
  # Defined above the `private` marker on purpose: `private` does not apply
  # to `def self.` methods, so this has always been a public factory.
  #
  # NOTE(review): relies on `Vocabulary.from_cache` and
  # `OnnxRuntimeModel.from_cache`, neither of which is defined in this file —
  # confirm both exist.
  #
  # @param language_code [String] ISO 639-1 language code
  # @param cache [Cache::ModelCache, nil] Optional cache instance
  # @param preload [Boolean] Whether to preload embeddings
  # @return [SimilaritySearch, nil] New search instance or nil if not available
  def self.from_cache(language_code, cache: nil, preload: false)
    vocab = Vocabulary.from_cache(language_code, cache: cache)
    model = OnnxRuntimeModel.from_cache(language_code, cache: cache)
    return nil unless vocab && model

    new(vocabulary: vocab, model: model, preload_embeddings: preload)
  end

  # Create a new similarity search instance.
  #
  # @param vocabulary [Vocabulary] Word vocabulary (must respond to
  #   `size`, `lookup`, `words` and `common_words(n:)`)
  # @param model [OnnxRuntimeModel] Model (must respond to `get_embedding`
  #   and `get_embeddings`)
  # @param preload_embeddings [Boolean] Whether to preload all embeddings
  # @param max_cache_size [Integer] Maximum embeddings to cache while the
  #   matrix is not loaded; zero or less disables caching
  def initialize(vocabulary:, model:, preload_embeddings: false, max_cache_size: 1000)
    @vocabulary = vocabulary
    @model = model
    @preload_embeddings = preload_embeddings
    @max_cache_size = max_cache_size

    # word -> vector, evicted in insertion (FIFO) order
    @embedding_cache = {}
    # index -> vector, populated by #preload_embeddings!
    @embedding_matrix = nil
    @embeddings_loaded = false

    # Counters exist from the start so no `defined?` checks are needed later.
    @cache_hits = 0
    @cache_misses = 0

    preload_embeddings! if preload_embeddings
  end

  # Find k nearest neighbors for a word.
  #
  # With the matrix loaded the whole vocabulary is scanned; otherwise only a
  # sample of roughly `k * 10` words is scored (see #sample_vocabulary), so
  # results are approximate and may vary between calls.
  #
  # @param query_word [String] The query word
  # @param k [Integer] Number of neighbors to return (non-positive => [])
  # @param exclude_self [Boolean] Whether to exclude the query word itself.
  #   Other words are kept even when their vector is identical to the query's.
  # @param min_similarity [Float] Minimum similarity threshold (0.0 to 1.0)
  # @return [Array<Hash>] Array of {word:, similarity:} hashes, best first
  def find_nearest(query_word, k: 10, exclude_self: true, min_similarity: 0.0)
    return [] unless k.positive?

    query_vec = get_embedding(query_word)
    return [] unless query_vec

    candidates = @embedding_matrix ? @vocabulary.words : sample_vocabulary(k * 10)

    scored = candidates.each_with_object([]) do |word, acc|
      # Exclude by identity, not by a similarity threshold, so that distinct
      # words sharing the query's vector remain valid neighbors.
      next if exclude_self && word == query_word

      vec = candidate_vector(word)
      next unless vec

      sim = cosine_similarity(query_vec, vec)
      next if sim < min_similarity

      acc << { word: word, similarity: sim }
    end

    scored.max_by(k) { |entry| entry[:similarity] }
  end

  # Find k nearest neighbors for multiple words.
  #
  # @param query_words [Array<String>] Query words
  # @param k [Integer] Number of neighbors per word
  # @return [Hash<String, Array<Hash>>] Word to neighbors mapping
  def find_nearest_batch(query_words, k: 10)
    query_words.each_with_object({}) do |word, result|
      result[word] = find_nearest(word, k: k)
    end
  end

  # Compute similarity between two words.
  #
  # @param word1 [String] First word
  # @param word2 [String] Second word
  # @return [Float, nil] Cosine similarity, or nil if either word has no
  #   embedding
  def similarity(word1, word2)
    vec1 = get_embedding(word1)
    vec2 = get_embedding(word2)
    return nil unless vec1 && vec2

    cosine_similarity(vec1, vec2)
  end

  # Compute similarity between two embedding vectors.
  #
  # @param vec1 [Array<Float>] First vector
  # @param vec2 [Array<Float>] Second vector (expected to have the same
  #   length as vec1)
  # @return [Float] Cosine similarity; 0.0 when either vector is nil or has
  #   zero magnitude
  def cosine_similarity(vec1, vec2)
    return 0.0 if vec1.nil? || vec2.nil?

    dot = vec1.zip(vec2).sum { |a, b| a * b }
    norm1 = Math.sqrt(vec1.sum { |x| x * x })
    norm2 = Math.sqrt(vec2.sum { |x| x * x })
    return 0.0 if norm1.zero? || norm2.zero?

    dot / (norm1 * norm2)
  end

  # Preload all embeddings into memory for faster search.
  #
  # Requests vectors for indices 0...vocabulary.size in one batch. Any
  # StandardError is reported via `warn` and turned into a false return.
  #
  # @return [Boolean] True if this call loaded the matrix; false if it was
  #   already loaded, the model returned nothing, or loading failed
  def preload_embeddings!
    return false if @embedding_matrix

    all_indices = (0...@vocabulary.size).to_a
    vectors = @model.get_embeddings(all_indices)
    return false if vectors.nil? || vectors.empty?

    # Stored as a Hash for now (could use Numo::SFloat for efficiency)
    @embedding_matrix = all_indices.zip(vectors).to_h
    @embeddings_loaded = true
    true
  rescue StandardError => e
    warn "Failed to preload embeddings: #{e.message}"
    false
  end

  # Drop cached vectors and the preloaded matrix. Hit/miss counters are kept.
  #
  # @return [self] Self for chaining
  def clear_cache
    @embedding_cache.clear
    @embedding_matrix = nil
    @embeddings_loaded = false
    self
  end

  # Get cache statistics.
  #
  # @return [Hash] `:size` and `:max_size`, plus `:hit_rate` once at least
  #   one lookup has been made
  def cache_stats
    stats = {
      size: @embedding_cache.size,
      max_size: @max_cache_size
    }
    lookups = @cache_hits + @cache_misses
    stats[:hit_rate] = @cache_hits.to_f / lookups if lookups.positive?
    stats
  end

  # String representation.
  #
  # @return [String] String representation
  def to_s
    "SimilaritySearch(vocab_size: #{@vocabulary.size}, loaded: #{@embeddings_loaded})"
  end
  alias inspect to_s

  private

  # Get embedding for a word (with caching).
  #
  # @param word [String] The word
  # @return [Array<Float>, nil] Embedding vector or nil if not found
  def get_embedding(word)
    if @embedding_cache.key?(word)
      @cache_hits += 1
      return @embedding_cache[word]
    end

    @cache_misses += 1

    index = @vocabulary.lookup(word)
    return nil unless index

    vec = @embedding_matrix ? @embedding_matrix[index] : @model.get_embedding(index)
    return nil unless vec

    # Cache whenever the matrix is absent. Keying this on the matrix rather
    # than on the constructor flag keeps caching alive after a failed preload
    # or after #clear_cache.
    remember(word, vec) unless @embedding_matrix
    vec
  end

  # Store a vector in the word cache, evicting oldest entries first.
  #
  # @param word [String] Cache key
  # @param vec [Array<Float>] Embedding vector
  # @return [void]
  def remember(word, vec)
    return unless @max_cache_size.positive?

    @embedding_cache.shift while @embedding_cache.size >= @max_cache_size
    @embedding_cache[word] = vec
  end

  # Fetch the vector used to score a candidate word.
  #
  # Reads straight from the matrix when it is loaded (leaving the cache
  # counters untouched); otherwise goes through #get_embedding.
  #
  # @param word [String] Candidate word
  # @return [Array<Float>, nil] Vector or nil if unavailable
  def candidate_vector(word)
    return get_embedding(word) unless @embedding_matrix

    index = @vocabulary.lookup(word)
    index && @embedding_matrix[index]
  end

  # Sample words from vocabulary for search.
  #
  # Takes up to min(n / 2, 100) words from the front of the vocabulary
  # (FastText orders by frequency) plus a random sample of the remainder.
  #
  # @param n [Integer] Number of words to sample
  # @return [Array<String>] Sampled words
  def sample_vocabulary(n)
    common_size = [n / 2, 100].min
    random_size = n - common_size

    common = @vocabulary.common_words(n: common_size)
    return common unless @vocabulary.size > common_size

    common + @vocabulary.words.drop(common_size).sample(random_size)
  end
end
330
+ end
331
+ end
@@ -0,0 +1,257 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require_relative 'protocol'
5
+
6
+ # Vocabulary - Word to index mapping
7
+ #
8
+ # Provides efficient lookup from words to integer indices for embedding retrieval.
9
+ # Supports JSON file loading and saving.
10
+ #
11
+ # @example Creating a vocabulary
12
+ # vocab = Kotoshu::Embeddings::Vocabulary.new(
13
+ # language_code: 'en',
14
+ # word_to_index: { 'hello' => 0, 'world' => 1 }
15
+ # )
16
+ #
17
+ # @example Loading from file
18
+ # vocab = Kotoshu::Embeddings::Vocabulary.from_file('/path/to/vocab.json', language_code: 'en')
19
+ #
20
class Vocabulary
  include VocabularyProtocol

  # @return [String] ISO 639-1 language code
  attr_reader :language_code

  # @return [Hash{String => Integer}] Word to index mapping (frozen)
  attr_reader :word_to_index

  # @return [Array<String, nil>] Index to word mapping (frozen); slots with no
  #   word are nil when indices are not contiguous
  attr_reader :index_to_word

  # Load vocabulary from JSON file
  #
  # Accepts either a JSON object (word => index) or a JSON array (position is
  # the index). `null` array entries are skipped so that files written by
  # {#save_to_file} with `format: :array` keep their indices.
  #
  # @param path [String] Path to JSON file
  # @param language_code [String] Language code (auto-detected from filename if nil)
  # @return [Vocabulary] New vocabulary instance
  #
  # @raise [ArgumentError] If file doesn't exist, the JSON is neither an
  #   object nor an array, or it contains no words
  # @raise [JSON::ParserError] If file is not valid JSON
  #
  def self.from_file(path, language_code: nil)
    raise ArgumentError, "File not found: #{path}" unless File.exist?(path)

    language_code ||= detect_language_from_path(path)
    data = JSON.parse(File.read(path))

    word_to_index =
      case data
      when Hash then data.transform_keys(&:freeze)
      when Array then index_by_position(data)
      else raise ArgumentError, 'Invalid vocabulary format: expected Hash or Array'
      end

    new(language_code: language_code, word_to_index: word_to_index)
  end

  # Create vocabulary from Array of words
  #
  # @param words [Array<String>] Array of words; position becomes the index
  # @param language_code [String] Language code
  # @return [Vocabulary] New vocabulary instance
  #
  # @raise [ArgumentError] If words contains no non-nil entries
  #
  def self.from_words(words, language_code: 'en')
    new(language_code: language_code, word_to_index: index_by_position(words))
  end

  # Map each non-nil word to its position in the list.
  #
  # @param words [Array<String, nil>] Words in index order
  # @return [Hash{String => Integer}] Word to index mapping
  #
  def self.index_by_position(words)
    words.each_with_index.with_object({}) do |(word, index), mapping|
      mapping[word.freeze] = index unless word.nil?
    end
  end

  # Detect language code from file path
  #
  # Takes the word characters immediately before `.vocab.json`, so both
  # `en.vocab.json` and `fasttext.en.vocab.json` yield `en`.
  #
  # @param path [String] File path
  # @return [String] Detected language code, or 'unknown'
  #
  def self.detect_language_from_path(path)
    match = /(\w+)\.vocab\.json\z/.match(File.basename(path))
    match ? match[1] : 'unknown'
  end

  # A bare `private_class_method` call hides nothing; the helpers must be
  # named explicitly.
  private_class_method :index_by_position, :detect_language_from_path

  # Create a new vocabulary
  #
  # @param language_code [String] ISO 639-1 language code
  # @param word_to_index [Hash{String => Integer}] Word to index mapping
  #
  # @raise [ArgumentError] If word_to_index is nil or empty
  #
  def initialize(language_code:, word_to_index:)
    raise ArgumentError, 'word_to_index cannot be empty' if word_to_index.nil? || word_to_index.empty?

    @language_code = language_code
    @word_to_index = word_to_index.dup.freeze

    # Size the reverse index by the largest index, not by the word count, so
    # non-contiguous mappings (e.g. from #sub_vocabulary) stay resolvable.
    # Entries whose index is not a non-negative Integer are left out.
    @index_to_word = @word_to_index.each_with_object([]) do |(word, index), reverse|
      reverse[index] = word if index.is_a?(Integer) && index >= 0
    end.freeze
  end

  # Look up word index
  #
  # @param word [String] The word to look up
  # @return [Integer, nil] Index of the word, or nil if not found
  #
  def lookup(word)
    @word_to_index[word]
  end

  # Get word by index
  #
  # @param index [Integer] The index to look up
  # @return [String, nil] Word at the index, or nil if not found (including
  #   negative or non-Integer indices)
  #
  def get_word(index)
    return nil unless index.is_a?(Integer) && index >= 0

    @index_to_word[index]
  end

  # Check if word exists in vocabulary
  #
  # @param word [String] Word to check
  # @return [Boolean] True if word exists
  #
  def include?(word)
    @word_to_index.key?(word)
  end

  # Get vocabulary size
  #
  # @return [Integer] Number of words in vocabulary
  #
  def size
    @word_to_index.size
  end

  # Check if index is valid
  #
  # @param index [Integer] Index to check
  # @return [Boolean] True if a word is stored at the index
  #
  def valid_index?(index)
    !get_word(index).nil?
  end

  # Get common/most frequent words
  #
  # Returns the first n words in mapping order.
  # NOTE(review): this is only "most frequent" if the source mapping is
  # ordered by frequency — confirm against the vocabulary producer.
  #
  # @param n [Integer] Number of words to return
  # @return [Array<String>] Array of common words
  #
  def common_words(n: 10)
    @word_to_index.keys.first(n)
  end

  # Convert to Hash
  #
  # @return [Hash{String => Integer}] Copy of word_to_index mapping
  #
  def to_h
    @word_to_index.dup
  end

  # Get all words as enumerator
  #
  # @return [Enumerator<String>] Enumerator of all words
  #
  def words
    @word_to_index.each_key
  end

  # Save vocabulary to JSON file
  #
  # The :array format writes the reverse index as-is: unused slots become
  # `null` so every word keeps its original position. {.from_file} skips
  # those nulls when reading the file back.
  #
  # @param path [String] Path to save file
  # @param format [Symbol] Format: :hash or :array
  # @return [Integer] Number of bytes written
  #
  # @raise [ArgumentError] If format is neither :hash nor :array
  #
  def save_to_file(path, format: :hash)
    data =
      case format
      when :hash then @word_to_index
      when :array then @index_to_word
      else raise ArgumentError, "Unknown format: #{format}"
      end

    File.write(path, JSON.pretty_generate(data))
  end

  # Check if vocabulary is empty
  #
  # @return [Boolean] True if empty
  #
  def empty?
    @word_to_index.empty?
  end

  # Get a sample of words
  #
  # @param n [Integer] Number of words to sample
  # @return [Array<String>] Random sample of words
  #
  def sample(n: 10)
    @word_to_index.keys.sample(n)
  end

  # Create a sub-vocabulary containing only specified words
  #
  # Words keep their original indices and their original relative order.
  #
  # @param words [Array<String>] Words to include
  # @return [Vocabulary] New vocabulary with subset of words
  #
  # @raise [ArgumentError] If none of the words are in this vocabulary
  #
  def sub_vocabulary(words)
    # Hash membership keeps the filter linear instead of O(words * vocabulary).
    wanted = words.each_with_object({}) { |word, seen| seen[word] = true }
    filtered = @word_to_index.select { |word, _| wanted.key?(word) }
    self.class.new(language_code: @language_code, word_to_index: filtered)
  end

  # Find words starting with a prefix
  #
  # @param prefix [String] Prefix to match (literal, not a pattern)
  # @return [Array<String>] Matching words
  #
  def words_starting_with(prefix)
    @word_to_index.each_key.select { |word| word.start_with?(prefix) }
  end

  # String representation
  #
  # @return [String]
  #
  def to_s
    "Vocabulary(language: #{@language_code}, size: #{@word_to_index.size})"
  end
  alias inspect to_s
end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Load all components
4
+ require_relative 'embeddings/protocol'
5
+ require_relative 'embeddings/lru_cache'
6
+ require_relative 'embeddings/vocabulary'
7
+ require_relative 'embeddings/onnx_runtime_model'
8
+ require_relative 'embeddings/similarity_engine'
9
+ require_relative 'embeddings/search'
10
+ require_relative 'embeddings/embedding_pipeline'
11
+ require_relative 'embeddings/registry'
12
+
13
+ # Embeddings module for FastText ONNX integration.
14
+ #
15
+ # Provides semantic spell checking using FastText word embeddings.
16
+ # Supports 157 languages through pre-converted ONNX models.
17
+ #
18
+ # @example Simple usage (recommended)
19
+ # pipeline = Kotoshu::Embeddings.from_cache(language: 'en')
20
+ # neighbors = pipeline.find_nearest('semantic', k: 5)
21
+ #
22
+ # @example Advanced usage
23
+ # vocab = Kotoshu::Embeddings::Vocabulary.from_file('vocab.json')
24
+ # model = Kotoshu::Embeddings::OnnxRuntimeModel.from_file('model.onnx')
25
+ # engine = Kotoshu::Embeddings::SimilarityEngine.new(pre_normalize: true)
26
+ #
27
module Kotoshu
  module Embeddings
    # Constants
    DEFAULT_DIMENSION = 300
    MAX_VOCABULARY_SIZE = 100_000
    VERSION = '2.0.0'

    # Expose the top-level component classes under this namespace
    Vocabulary = ::Vocabulary
    OnnxRuntimeModel = ::OnnxRuntimeModel
    SimilarityEngine = ::SimilarityEngine
    Search = ::Search
    EmbeddingPipeline = ::EmbeddingPipeline
    LruCache = ::LruCache
    Registry = ::EmbeddingRegistry

    # Protocols namespace
    module Protocols
      EmbeddingModel = ::EmbeddingModelProtocol
      SimilarityEngine = ::SimilarityEngineProtocol
      Vocabulary = ::VocabularyProtocol
    end

    # Create an EmbeddingPipeline from cache
    #
    # All arguments are forwarded unchanged to EmbeddingPipeline.from_cache.
    #
    # @param language [String] ISO 639-1 language code
    # @param preload [Boolean] Preload embeddings into memory
    # @param index [Symbol] Index type passed through to the pipeline
    #   (defaults to :exact)
    # @return [EmbeddingPipeline]
    #
    def self.from_cache(language:, preload: false, index: :exact)
      EmbeddingPipeline.from_cache(language: language, preload: preload, index: index)
    end

    # Check if a language is supported
    #
    # @param language [String] ISO 639-1 language code
    # @return [Boolean] True if the model cache lists an :onnx model for it
    #
    def self.language_supported?(language)
      model_cache.available_models_for(language.to_sym).include?(:onnx)
    end

    # List all supported languages
    #
    # @return [Array<String>] Language codes with an ONNX model; empty when
    #   the cache reports no :onnx entry
    #
    def self.supported_languages
      # NOTE(review): assumes all_available_models is keyed by model type and
      # maps :onnx to a Hash keyed by language — confirm against ModelCache.
      onnx_models = model_cache.all_available_models[:onnx] || {}
      onnx_models.keys.map(&:to_s)
    end

    # Create a custom embedding pipeline
    #
    # @param vocabulary [Vocabulary] Vocabulary instance
    # @param model [EmbeddingModel] Model instance
    # @param preload [Boolean] Preload embeddings
    # @param pre_normalize [Boolean] Passed through to EmbeddingPipeline.new
    # @return [EmbeddingPipeline]
    #
    def self.create_pipeline(vocabulary:, model:, preload: false, pre_normalize: false)
      EmbeddingPipeline.new(
        vocabulary: vocabulary,
        model: model,
        preload: preload,
        pre_normalize: pre_normalize
      )
    end

    # Lazily load the model cache and build a fresh instance.
    #
    # require_relative resolves against this file's own directory
    # (lib/kotoshu), so the path is 'cache/model_cache'; the previous
    # '../cache/model_cache' pointed at lib/cache, outside the gem's namespace
    # directory.
    #
    # @return [Cache::ModelCache]
    #
    def self.model_cache
      require_relative 'cache/model_cache'
      Cache::ModelCache.new
    end
    private_class_method :model_cache
  end
end