kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,192 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Search - Brute force nearest neighbor search
4
+ #
5
+ # Performs exhaustive search over all vocabulary entries.
6
+ # Uses min-heap for efficient top-k selection (O(n log k) instead of O(n log n)).
7
+ #
8
+ # @example
9
+ # search = ExactSearch.new(
10
+ # vocabulary: vocab,
11
+ # model: model,
12
+ # similarity_engine: engine
13
+ # )
14
+ # neighbors = search.find_nearest('hello', k: 5)
15
+ #
16
class Search
  # Bounded top-k collector.
  #
  # Retains at most +max_size+ entries with the highest +:similarity+ values.
  # Entries are stored in an array kept sorted ascending by similarity, so the
  # weakest retained entry is always first and is the one evicted on overflow.
  class MinHeap
    # @param max_size [Integer] Maximum number of entries to retain
    def initialize(max_size)
      @max_size = max_size
      @heap = []
    end

    # Offer an entry to the collector.
    #
    # A full collector rejects entries that do not beat its current minimum
    # without touching the buffer; otherwise the entry is placed at its sorted
    # position using a binary search instead of re-sorting the whole buffer.
    #
    # @param item [Hash] Entry carrying a numeric +:similarity+ value
    # @return [Hash, nil] The entry dropped to stay within capacity (this may
    #   be +item+ itself), or nil when nothing was dropped
    def push(item)
      # A non-positive capacity can never retain anything.
      return item unless @max_size.positive?

      score = item[:similarity]
      evicted = nil

      if @heap.size >= @max_size
        # The first element is the weakest retained entry.
        return item if score <= @heap.first[:similarity]

        evicted = @heap.shift
      end

      position = @heap.bsearch_index { |entry| entry[:similarity] >= score } || @heap.size
      @heap.insert(position, item)
      evicted
    end

    # @return [Boolean] True when no entries are retained
    def empty?
      @heap.empty?
    end

    # @return [Integer] Number of retained entries
    def size
      @heap.size
    end

    # Iterate retained entries in ascending similarity order.
    def each(&block)
      @heap.each(&block)
    end

    # @return [Array<Hash>] Copy of the retained entries, ascending by similarity
    def to_a
      @heap.dup
    end
  end

  # @return [Vocabulary]
  attr_reader :vocabulary

  # @return [EmbeddingModel]
  attr_reader :model

  # @return [SimilarityEngine]
  attr_reader :similarity_engine

  # @return [Boolean] Whether embeddings are preloaded
  attr_reader :embeddings_loaded

  # Create a new exact search
  #
  # @param vocabulary [Vocabulary] Word vocabulary (must respond to +words+,
  #   +lookup+ and +size+)
  # @param model [EmbeddingModel] Embedding provider (must respond to
  #   +get_embedding+ and +get_embeddings+)
  # @param similarity_engine [SimilarityEngine] Similarity calculator (must
  #   respond to +cosine+)
  # @param pre_normalize [Boolean] Pre-normalize vectors for faster similarity.
  #   NOTE(review): the flag is stored but no method in this class reads it.
  #
  def initialize(vocabulary:, model:, similarity_engine:, pre_normalize: false)
    @vocabulary = vocabulary
    @model = model
    @similarity_engine = similarity_engine
    @pre_normalize = pre_normalize

    @embedding_cache = {}
    @embeddings_loaded = false
  end

  # Find k nearest neighbors for a word
  #
  # Scores every vocabulary word against the query with the similarity
  # engine's cosine measure and keeps the k best in a bounded collector.
  #
  # @param query_word [String] Query word
  # @param k [Integer] Number of neighbors to return
  # @param exclude_self [Boolean] Exclude query word from results
  # @param min_similarity [Float] Minimum similarity threshold; candidates
  #   scoring strictly below it are dropped
  # @return [Array<Hash>] Array of {word, similarity, index}, sorted by
  #   similarity descending; empty when the query word has no embedding or
  #   when k is not positive
  #
  def find_nearest(query_word, k: 10, exclude_self: true, min_similarity: 0.0)
    query_vec = get_embedding_for_word(query_word)
    return [] unless query_vec
    # Nothing can be retained, so skip the full vocabulary scan.
    return [] unless k.positive?

    heap = MinHeap.new(k)

    @vocabulary.words.each do |word|
      next if exclude_self && word == query_word

      vec = get_embedding_for_word(word)
      next unless vec

      similarity = @similarity_engine.cosine(query_vec, vec)
      next if similarity < min_similarity

      heap.push(word: word, similarity: similarity, index: @vocabulary.lookup(word))
    end

    # The collector yields ascending order; callers expect best match first.
    heap.to_a.reverse
  end

  # Find nearest neighbors for multiple words
  #
  # @param query_words [Array<String>] Query words
  # @param k [Integer] Number of neighbors per word
  # @return [Hash<String, Array<Hash>>] Word to results mapping
  #
  def find_nearest_batch(query_words, k: 10)
    query_words.each_with_object({}) do |word, results|
      results[word] = find_nearest(word, k: k)
    end
  end

  # Compute similarity between two words
  #
  # @param word1 [String] First word
  # @param word2 [String] Second word
  # @return [Float, nil] Similarity or nil if either word not found
  #
  def similarity(word1, word2)
    vec1 = get_embedding_for_word(word1)
    vec2 = get_embedding_for_word(word2)
    return nil unless vec1 && vec2

    @similarity_engine.cosine(vec1, vec2)
  end

  # Preload all embeddings into memory
  #
  # Requests embeddings for indices 0...vocabulary.size in one call and
  # stores them in the cache keyed by word.
  #
  # NOTE(review): this pairs the i-th entry of +vocabulary.words+ with model
  # index i, whereas the lazy path uses +vocabulary.lookup(word)+; confirm the
  # two orderings agree for every vocabulary implementation.
  #
  # @return [self]
  #
  def preload_embeddings!
    all_indices = (0...@vocabulary.size).to_a
    embeddings = @model.get_embeddings(all_indices)

    @vocabulary.words.each_with_index do |word, i|
      @embedding_cache[word] = embeddings[i]
    end

    @embeddings_loaded = true
    self
  end

  # Clear embedding cache
  #
  # @return [self]
  #
  def clear_cache
    @embedding_cache.clear
    @embeddings_loaded = false
    self
  end

  # String representation
  #
  # @return [String]
  #
  def to_s
    "ExactSearch(vocab: #{@vocabulary.size}, loaded: #{@embeddings_loaded})"
  end
  alias inspect to_s

  private

  # Get embedding for a word (with caching)
  #
  # @param word [String] Word
  # @return [Array<Float>, nil] Embedding, or nil when the word is not in the
  #   vocabulary or the model returns no vector for it
  #
  def get_embedding_for_word(word)
    return @embedding_cache[word] if @embedding_cache.key?(word)

    index = @vocabulary.lookup(word)
    return nil unless index

    vec = @model.get_embedding(index)
    return nil unless vec

    # Only the lazy (non-preloaded) mode adds entries here.
    @embedding_cache[word] = vec unless @embeddings_loaded

    vec
  end
end
@@ -0,0 +1,248 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'protocol'
4
+
5
+ # SimilarityEngine - Compute similarity between embedding vectors
6
+ #
7
+ # Provides various similarity/distance metrics with optimizations like
8
+ # norm caching and pre-normalized vector support.
9
+ #
10
+ # @example Basic usage
11
+ # engine = SimilarityEngine.new
12
+ # engine.cosine([1.0, 0.0], [1.0, 0.0]) # => 1.0
13
+ #
14
+ # @example Pre-normalized vectors (faster)
15
+ # engine = SimilarityEngine.new(pre_normalize: true)
16
+ # engine.pre_normalize([1.0, 0.0]) # => [1.0, 0.0]
17
+ #
18
class SimilarityEngine
  include SimilarityEngineProtocol

  # NOTE(review): not referenced by any code in this class (the norm cache is
  # bounded by MAX_NORM_CACHE_SIZE); kept so existing references to the
  # constant keep resolving.
  DEFAULT_CACHE_SIZE = 10_000

  # Maximum number of norms kept in the norm cache; once reached, the oldest
  # inserted entry is evicted before a new one is stored.
  MAX_NORM_CACHE_SIZE = 100_000

  # Default absolute tolerance used by #is_normalized?
  NORMALIZATION_TOLERANCE = Float::EPSILON * 10

  # @return [Integer] Number of cache hits
  attr_reader :cache_hits

  # @return [Integer] Number of cache misses
  attr_reader :cache_misses

  # Create a new similarity engine
  #
  # @param pre_normalize [Boolean] Whether input vectors are treated as
  #   already unit-length (see #normalize_and_compute)
  # @param cache_norms [Boolean] Whether to cache vector norms
  #
  def initialize(pre_normalize: false, cache_norms: true)
    @pre_normalize = pre_normalize
    @cache_norms = cache_norms
    @norm_cache = cache_norms ? {} : nil
    @cache_hits = 0
    @cache_misses = 0
  end

  # Whether the engine was configured for pre-normalized vectors.
  #
  # The name +pre_normalize+ itself is taken by the one-argument vector
  # method below (a same-named attribute reader would be overwritten by it),
  # so the flag is exposed through this predicate.
  #
  # @return [Boolean]
  #
  def pre_normalize?
    @pre_normalize ? true : false
  end

  # Compute cosine similarity between two vectors
  #
  # Cosine similarity = dot(v1, v2) / (||v1|| * ||v2||)
  # Range: -1.0 (opposite) to 1.0 (identical). Results that overshoot the
  # range through floating-point rounding are clamped back into it.
  #
  # @param vec1 [Array<Float>] First vector
  # @param vec2 [Array<Float>] Second vector
  # @return [Float] Cosine similarity, or 0.0 if either vector is nil/empty
  #   or has zero magnitude
  #
  def cosine(vec1, vec2)
    return 0.0 if vec1.nil? || vec2.nil? || vec1.empty? || vec2.empty?

    norm1 = get_norm(vec1)
    norm2 = get_norm(vec2)

    return 0.0 if norm1.zero? || norm2.zero?

    similarity = dot_product(vec1, vec2) / (norm1 * norm2)

    # Explicit comparisons rather than Comparable#clamp so that a NaN result
    # (e.g. from NaN components) is passed through instead of raising.
    return 1.0 if similarity > 1.0
    return -1.0 if similarity < -1.0

    similarity
  end

  # Compute dot product between two vectors
  #
  # Elements are paired with Array#zip driven by +vec1+: surplus elements of
  # a longer +vec2+ are ignored, while a shorter +vec2+ pairs the remaining
  # elements of +vec1+ with nil. Callers should pass equal-length vectors.
  #
  # @param vec1 [Array<Float>] First vector
  # @param vec2 [Array<Float>] Second vector
  # @return [Float] Dot product, or 0.0 if either vector is nil/empty
  #
  def dot_product(vec1, vec2)
    return 0.0 if vec1.nil? || vec2.nil? || vec1.empty? || vec2.empty?

    vec1.zip(vec2).sum { |a, b| a * b }
  end

  # Compute Euclidean distance between two vectors
  #
  # @param vec1 [Array<Float>] First vector
  # @param vec2 [Array<Float>] Second vector
  # @return [Float] Euclidean distance, or 0.0 if either vector is nil/empty
  #
  def euclidean(vec1, vec2)
    return 0.0 if vec1.nil? || vec2.nil? || vec1.empty? || vec2.empty?
    # Same object on both sides: distance is zero without any arithmetic.
    return 0.0 if vec1.equal?(vec2)

    sum = 0.0
    vec1.zip(vec2) do |a, b|
      diff = a - b
      sum += diff * diff
    end
    Math.sqrt(sum)
  end

  # Compute Manhattan (L1) distance between two vectors
  #
  # @param vec1 [Array<Float>] First vector
  # @param vec2 [Array<Float>] Second vector
  # @return [Float] Manhattan distance, or 0.0 if either vector is nil/empty
  #
  def manhattan(vec1, vec2)
    return 0.0 if vec1.nil? || vec2.nil? || vec1.empty? || vec2.empty?

    vec1.zip(vec2).sum { |a, b| (a - b).abs }
  end

  # Pre-normalize a vector to unit length
  #
  # @param vec [Array<Float>] Vector to normalize
  # @return [Array<Float>] New normalized vector; a copy of the input when it
  #   is nil, empty, or has zero magnitude
  #
  def pre_normalize(vec)
    return vec.dup if vec.nil? || vec.empty?

    norm = get_norm(vec)
    return vec.dup if norm.zero?

    vec.map { |x| x / norm }
  end

  # Compute similarity, skipping normalization for pre-normalized engines
  #
  # For pre-normalized vectors, this is just dot product (much faster).
  #
  # @param vec1 [Array<Float>] First vector
  # @param vec2 [Array<Float>] Second vector
  # @return [Float] Cosine similarity
  #
  def normalize_and_compute(vec1, vec2)
    return 0.0 if vec1.nil? || vec2.nil? || vec1.empty? || vec2.empty?

    if @pre_normalize
      # For normalized vectors, cosine similarity = dot product
      dot_product(vec1, vec2)
    else
      cosine(vec1, vec2)
    end
  end

  # Check if a vector is normalized (unit length)
  #
  # @param vec [Array<Float>] Vector to check
  # @param tolerance [Float] Maximum allowed absolute deviation of the norm
  #   from 1.0
  # @return [Boolean] True if vector is normalized; nil/empty vectors are
  #   reported as normalized
  #
  def is_normalized?(vec, tolerance: NORMALIZATION_TOLERANCE)
    return true if vec.nil? || vec.empty?

    (get_norm(vec) - 1.0).abs < tolerance
  end

  # Check if normalization is required for accurate similarity
  #
  # @return [Boolean] True if normalization should be applied
  #
  def normalization_required?
    !@pre_normalize
  end

  # Clear the norm cache and reset the hit/miss counters
  #
  # @return [self]
  #
  def clear_cache
    @norm_cache&.clear
    @cache_hits = 0
    @cache_misses = 0
    self
  end

  # Get cache statistics
  #
  # @return [Hash] Cache statistics (:hits, :misses, :hit_rate, :cache_size)
  #
  def cache_stats
    total = @cache_hits + @cache_misses
    {
      hits: @cache_hits,
      misses: @cache_misses,
      hit_rate: total.zero? ? 0.0 : @cache_hits.to_f / total,
      cache_size: @norm_cache&.size || 0
    }
  end

  # Compute similarity for a batch of vector pairs
  #
  # Convenience wrapper that applies #cosine to each pair in turn.
  #
  # @param pairs [Array<Array<Array<Float>>>] Array of [vec1, vec2] pairs
  # @return [Array<Float>] Array of similarities
  #
  def cosine_batch(pairs)
    pairs.map { |v1, v2| cosine(v1, v2) }
  end

  # Compute all pairwise similarities for a set of vectors
  #
  # Only the upper triangle is computed; results are mirrored. The diagonal
  # is set to 1.0 unconditionally (also for vectors for which #cosine would
  # return 0.0, such as empty or zero vectors).
  #
  # @param vectors [Array<Array<Float>>] Array of vectors
  # @return [Array<Array<Float>>] Similarity matrix
  #
  def compute_all_pairs(vectors)
    n = vectors.length
    matrix = Array.new(n) { Array.new(n, 0.0) }

    (0...n).each do |i|
      matrix[i][i] = 1.0
      ((i + 1)...n).each do |j|
        sim = cosine(vectors[i], vectors[j])
        matrix[i][j] = sim
        matrix[j][i] = sim
      end
    end

    matrix
  end

  private

  # Get norm with caching
  #
  # Norms are cached per vector object (keyed by +object_id+), not by
  # content. A vector mutated in place after its norm was cached keeps
  # returning the stale value until #clear_cache is called.
  #
  # @param vec [Array<Float>] Vector
  # @return [Float] Vector norm (magnitude)
  #
  def get_norm(vec)
    return 0.0 if vec.nil? || vec.empty?

    key = vec.object_id

    if @norm_cache && @norm_cache.key?(key)
      @cache_hits += 1
      return @norm_cache[key]
    end

    @cache_misses += 1 if @norm_cache

    norm = Math.sqrt(vec.sum { |x| x * x })

    if @norm_cache
      # Bound memory use: drop the oldest inserted entry once the cache is full.
      @norm_cache.shift if @norm_cache.size >= MAX_NORM_CACHE_SIZE
      @norm_cache[key] = norm
    end

    norm
  end
end