kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Search - Brute force nearest neighbor search
#
# Performs an exhaustive scan over every word in the vocabulary and keeps
# only the best k candidates in a small bounded buffer, so the full
# vocabulary is never sorted.
#
# @example
#   search = Search.new(
#     vocabulary: vocab,
#     model: model,
#     similarity_engine: engine
#   )
#   neighbors = search.find_nearest('hello', k: 5)
#
class Search
  # Bounded top-k buffer ordered by ascending :similarity.
  #
  # The weakest retained item is always first, so deciding whether a new
  # candidate belongs in the top k is a single comparison. New items are
  # placed with a binary search instead of re-sorting the buffer on every
  # push.
  class MinHeap
    # @param max_size [Integer] Maximum number of items to retain
    def initialize(max_size)
      @max_size = max_size
      @heap = []
    end

    # Offer an item to the buffer.
    #
    # @param item [Hash] Candidate; must respond to [:similarity]
    # @return [Hash, nil] The item that was dropped (possibly the candidate
    #   itself), or nil when nothing had to be dropped
    def push(item)
      # A non-positive capacity can never retain anything.
      return item if @max_size <= 0

      score = item[:similarity]
      evicted = nil

      if @heap.size >= @max_size
        # Not better than the current weakest entry: reject immediately.
        return item if score <= @heap.first[:similarity]

        evicted = @heap.shift
      end

      # First position whose similarity is strictly greater keeps the
      # buffer in ascending order.
      position = @heap.bsearch_index { |entry| entry[:similarity] > score } || @heap.size
      @heap.insert(position, item)
      evicted
    end

    # @return [Boolean] True when no items are retained
    def empty?
      @heap.empty?
    end

    # @return [Integer] Number of retained items
    def size
      @heap.size
    end

    # Iterate retained items from lowest to highest similarity.
    def each(&block)
      @heap.each(&block)
    end

    # @return [Array<Hash>] Copy of the retained items, ascending by similarity
    def to_a
      @heap.dup
    end
  end

  # @return [Vocabulary]
  attr_reader :vocabulary

  # @return [EmbeddingModel]
  attr_reader :model

  # @return [SimilarityEngine]
  attr_reader :similarity_engine

  # @return [Boolean] Whether embeddings are preloaded
  attr_reader :embeddings_loaded

  # Create a new exact search
  #
  # @param vocabulary [Vocabulary] Word vocabulary (must respond to
  #   #words, #lookup and #size)
  # @param model [EmbeddingModel] Embedding provider (must respond to
  #   #get_embedding and #get_embeddings)
  # @param similarity_engine [SimilarityEngine] Similarity calculator
  #   (must respond to #cosine)
  # @param pre_normalize [Boolean] Stored on the instance; no method in this
  #   class currently consults it
  #
  def initialize(vocabulary:, model:, similarity_engine:, pre_normalize: false)
    @vocabulary = vocabulary
    @model = model
    @similarity_engine = similarity_engine
    @pre_normalize = pre_normalize

    @embedding_cache = {}
    @embeddings_loaded = false
  end

  # Find k nearest neighbors for a word
  #
  # @param query_word [String] Query word
  # @param k [Integer] Number of neighbors to return
  # @param exclude_self [Boolean] Exclude query word from results
  # @param min_similarity [Float] Minimum similarity threshold (candidates
  #   strictly below it are skipped)
  # @return [Array<Hash>] Array of {word, similarity, index}, highest
  #   similarity first; empty when the query word has no embedding
  #
  def find_nearest(query_word, k: 10, exclude_self: true, min_similarity: 0.0)
    query_vec = get_embedding_for_word(query_word)
    return [] unless query_vec

    best = MinHeap.new(k)

    @vocabulary.words.each do |word|
      next if exclude_self && word == query_word

      vec = get_embedding_for_word(word)
      next unless vec

      similarity = @similarity_engine.cosine(query_vec, vec)
      next if similarity < min_similarity

      best.push(word: word, similarity: similarity)
    end

    # The buffer is ascending, so reversing yields descending similarity.
    # The vocabulary index is resolved only for the k survivors rather than
    # for every candidate scanned.
    best.to_a.reverse.map do |entry|
      { word: entry[:word], similarity: entry[:similarity], index: @vocabulary.lookup(entry[:word]) }
    end
  end

  # Find nearest neighbors for multiple words
  #
  # @param query_words [Array<String>] Query words
  # @param k [Integer] Number of neighbors per word
  # @param exclude_self [Boolean] Exclude each query word from its own results
  # @param min_similarity [Float] Minimum similarity threshold
  # @return [Hash<String, Array<Hash>>] Word to results mapping
  #
  def find_nearest_batch(query_words, k: 10, exclude_self: true, min_similarity: 0.0)
    query_words.each_with_object({}) do |word, results|
      results[word] = find_nearest(word, k: k, exclude_self: exclude_self, min_similarity: min_similarity)
    end
  end

  # Compute similarity between two words
  #
  # @param word1 [String] First word
  # @param word2 [String] Second word
  # @return [Float, nil] Similarity or nil if either word not found
  #
  def similarity(word1, word2)
    vec1 = get_embedding_for_word(word1)
    vec2 = get_embedding_for_word(word2)
    return nil unless vec1 && vec2

    @similarity_engine.cosine(vec1, vec2)
  end

  # Preload all embeddings into memory
  #
  # Requests embeddings for indices 0...vocabulary.size in one call and
  # stores them by word.
  # NOTE(review): this pairs the i-th word of vocabulary.words with the
  # i-th returned embedding, i.e. it assumes word order matches the model's
  # index order - confirm against Vocabulary.
  #
  # @return [self]
  #
  def preload_embeddings!
    all_indices = (0...@vocabulary.size).to_a
    embeddings = @model.get_embeddings(all_indices)

    @vocabulary.words.each_with_index do |word, i|
      @embedding_cache[word] = embeddings[i]
    end

    @embeddings_loaded = true
    self
  end

  # Clear embedding cache
  #
  # @return [self]
  #
  def clear_cache
    @embedding_cache.clear
    @embeddings_loaded = false
    self
  end

  # String representation
  #
  # NOTE(review): the label reads "ExactSearch" although the class is named
  # Search; kept as-is because callers or tests may match on it.
  #
  # @return [String]
  #
  def to_s
    "ExactSearch(vocab: #{@vocabulary.size}, loaded: #{@embeddings_loaded})"
  end
  alias inspect to_s

  private

  # Get embedding for a word (with caching)
  #
  # @param word [String] Word
  # @return [Array<Float>, nil] Embedding, or nil when the word is unknown
  #   to the vocabulary or the model has no vector for it
  #
  def get_embedding_for_word(word)
    return @embedding_cache[word] if @embedding_cache.key?(word)

    index = @vocabulary.lookup(word)
    return nil unless index

    vec = @model.get_embedding(index)
    return nil unless vec

    # Once a bulk preload has run, the cache is left exactly as preloaded.
    @embedding_cache[word] = vec unless @embeddings_loaded

    vec
  end
end
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'protocol'
|
|
4
|
+
|
|
5
|
+
# SimilarityEngine - Compute similarity between embedding vectors
#
# Provides various similarity/distance metrics with optimizations like
# norm caching and pre-normalized vector support.
#
# @example Basic usage
#   engine = SimilarityEngine.new
#   engine.cosine([1.0, 0.0], [1.0, 0.0])  # => 1.0
#
# @example Pre-normalized vectors (faster)
#   engine = SimilarityEngine.new(pre_normalize: true)
#   engine.pre_normalize([1.0, 0.0])  # => [1.0, 0.0]
#   engine.pre_normalize?             # => true
#
class SimilarityEngine
  include SimilarityEngineProtocol

  # Retained for compatibility; not referenced anywhere in this class.
  DEFAULT_CACHE_SIZE = 10_000

  # Maximum number of norms kept in the norm cache before the oldest entry
  # is evicted.
  MAX_NORM_CACHE_SIZE = 100_000

  # Default tolerance used by #is_normalized?.
  NORMALIZATION_TOLERANCE = Float::EPSILON * 10

  # Sentinel distinguishing "no argument" from an explicit nil vector.
  NO_VECTOR = Object.new.freeze
  private_constant :NO_VECTOR

  # @return [Integer] Number of cache hits
  attr_reader :cache_hits

  # @return [Integer] Number of cache misses
  attr_reader :cache_misses

  # Create a new similarity engine
  #
  # @param pre_normalize [Boolean] Whether input vectors are treated as
  #   already unit-length (see #normalize_and_compute)
  # @param cache_norms [Boolean] Whether to cache vector norms
  #
  def initialize(pre_normalize: false, cache_norms: true)
    @pre_normalize = pre_normalize
    @cache_norms = cache_norms
    @norm_cache = cache_norms ? {} : nil
    @cache_hits = 0
    @cache_misses = 0
  end

  # @return [Boolean] Whether vectors are pre-normalized
  def pre_normalize?
    @pre_normalize
  end

  # Compute cosine similarity between two vectors
  #
  # Cosine similarity = dot(v1, v2) / (||v1|| * ||v2||)
  # Range: -1.0 (opposite) to 1.0 (identical)
  #
  # @param vec1 [Array<Float>] First vector
  # @param vec2 [Array<Float>] Second vector
  # @return [Float] Cosine similarity, or 0.0 if either vector is
  #   nil/empty or has zero norm
  #
  def cosine(vec1, vec2)
    return 0.0 if vec1.nil? || vec2.nil? || vec1.empty? || vec2.empty?

    norm1 = get_norm(vec1)
    norm2 = get_norm(vec2)

    return 0.0 if norm1.zero? || norm2.zero?

    similarity = dot_product(vec1, vec2) / (norm1 * norm2)

    # Rounding can push the quotient a hair outside the documented range
    # (e.g. 1.0000000000000002 for identical vectors). Plain comparisons are
    # used instead of #clamp so a NaN result passes through unchanged.
    return 1.0 if similarity > 1.0
    return -1.0 if similarity < -1.0

    similarity
  end

  # Compute dot product between two vectors
  #
  # Elements are paired positionally with Array#zip, so vec1 determines how
  # many pairs are formed.
  #
  # @param vec1 [Array<Float>] First vector
  # @param vec2 [Array<Float>] Second vector
  # @return [Float] Dot product, or 0.0 if either vector is nil/empty
  #
  def dot_product(vec1, vec2)
    return 0.0 if vec1.nil? || vec2.nil? || vec1.empty? || vec2.empty?

    vec1.zip(vec2).sum { |a, b| a * b }
  end

  # Compute Euclidean distance between two vectors
  #
  # @param vec1 [Array<Float>] First vector
  # @param vec2 [Array<Float>] Second vector
  # @return [Float] Euclidean distance; 0.0 if either vector is nil/empty
  #   or both arguments are the same object
  #
  def euclidean(vec1, vec2)
    return 0.0 if vec1.nil? || vec2.nil? || vec1.empty? || vec2.empty?
    return 0.0 if vec1.equal?(vec2)

    squared = vec1.zip(vec2).sum(0.0) do |a, b|
      diff = a - b
      diff * diff
    end
    Math.sqrt(squared)
  end

  # Compute Manhattan (L1) distance between two vectors
  #
  # @param vec1 [Array<Float>] First vector
  # @param vec2 [Array<Float>] Second vector
  # @return [Float] Manhattan distance, or 0.0 if either vector is nil/empty
  #
  def manhattan(vec1, vec2)
    return 0.0 if vec1.nil? || vec2.nil? || vec1.empty? || vec2.empty?

    vec1.zip(vec2).sum { |a, b| (a - b).abs }
  end

  # Pre-normalize a vector to unit length
  #
  # Called with no argument, this returns the pre_normalize flag given to
  # the constructor. A plain attribute reader of the same name cannot
  # coexist with this method (the later definition replaces it), so both
  # uses are served here.
  #
  # @param vec [Array<Float>] Vector to normalize
  # @return [Array<Float>, Boolean] Normalized copy of the vector (a plain
  #   copy when it is nil, empty or has zero norm); the pre_normalize flag
  #   when no argument is given
  #
  def pre_normalize(vec = NO_VECTOR)
    return @pre_normalize if NO_VECTOR.equal?(vec)
    return vec.dup if vec.nil? || vec.empty?

    norm = get_norm(vec)
    return vec.dup if norm.zero?

    vec.map { |x| x / norm }
  end

  # Compute similarity, taking the pre-normalized fast path when enabled
  #
  # When the engine was created with pre_normalize: true the vectors are
  # trusted to be unit-length and the dot product is returned directly;
  # no normalization is performed here. Otherwise this is #cosine.
  #
  # @param vec1 [Array<Float>] First vector
  # @param vec2 [Array<Float>] Second vector
  # @return [Float] Cosine similarity
  #
  def normalize_and_compute(vec1, vec2)
    return 0.0 if vec1.nil? || vec2.nil? || vec1.empty? || vec2.empty?

    return dot_product(vec1, vec2) if @pre_normalize

    cosine(vec1, vec2)
  end

  # Check if a vector is normalized (unit length)
  #
  # @param vec [Array<Float>] Vector to check
  # @param tolerance [Float] Maximum allowed |norm - 1.0|. The default is
  #   very tight; vectors normalized at lower precision may need a looser
  #   value.
  # @return [Boolean] True if vector is normalized (nil/empty count as true)
  #
  def is_normalized?(vec, tolerance: NORMALIZATION_TOLERANCE)
    return true if vec.nil? || vec.empty?

    (get_norm(vec) - 1.0).abs < tolerance
  end

  # Check if normalization is required for accurate similarity
  #
  # @return [Boolean] True if normalization should be applied
  #
  def normalization_required?
    !@pre_normalize
  end

  # Clear the norm cache and reset the hit/miss counters
  #
  # @return [self]
  #
  def clear_cache
    @norm_cache&.clear
    @cache_hits = 0
    @cache_misses = 0
    self
  end

  # Get cache statistics
  #
  # @return [Hash] :hits, :misses, :hit_rate (0.0 when nothing was looked
  #   up) and :cache_size (0 when caching is disabled)
  #
  def cache_stats
    lookups = @cache_hits + @cache_misses
    {
      hits: @cache_hits,
      misses: @cache_misses,
      hit_rate: lookups.zero? ? 0.0 : @cache_hits.to_f / lookups,
      cache_size: @norm_cache&.size || 0
    }
  end

  # Compute similarity for a batch of vector pairs
  #
  # Convenience wrapper that applies #cosine to each pair in order.
  #
  # @param pairs [Array<Array<Array<Float>>>] Array of [vec1, vec2] pairs
  # @return [Array<Float>] Array of similarities
  #
  def cosine_batch(pairs)
    pairs.map { |v1, v2| cosine(v1, v2) }
  end

  # Compute all pairwise similarities for a set of vectors
  #
  # Only the upper triangle is computed; each value is mirrored. The
  # diagonal is set to 1.0 without calling #cosine.
  # NOTE(review): that means a nil/empty/zero vector still gets 1.0 on the
  # diagonal, whereas #cosine would report 0.0 for it - confirm intended.
  #
  # @param vectors [Array<Array<Float>>] Array of vectors
  # @return [Array<Array<Float>>] Similarity matrix
  #
  def compute_all_pairs(vectors)
    n = vectors.length
    matrix = Array.new(n) { Array.new(n, 0.0) }

    n.times do |i|
      matrix[i][i] = 1.0
      ((i + 1)...n).each do |j|
        sim = cosine(vectors[i], vectors[j])
        matrix[i][j] = sim
        matrix[j][i] = sim
      end
    end

    matrix
  end

  private

  # Get norm with caching
  #
  # Norms are cached per vector object (keyed by object_id), not by
  # content. NOTE(review): a vector mutated in place after its norm was
  # cached will keep returning the old norm until #clear_cache is called.
  #
  # @param vec [Array<Float>] Vector
  # @return [Float] Vector norm (magnitude); 0.0 for nil/empty input
  #
  def get_norm(vec)
    return 0.0 if vec.nil? || vec.empty?
    return Math.sqrt(vec.sum { |x| x * x }) unless @norm_cache

    key = vec.object_id
    if @norm_cache.key?(key)
      @cache_hits += 1
      return @norm_cache[key]
    end

    @cache_misses += 1
    norm = Math.sqrt(vec.sum { |x| x * x })

    # Bound memory use: drop the oldest entry once the cache is full.
    @norm_cache.shift if @norm_cache.size >= MAX_NORM_CACHE_SIZE
    @norm_cache[key] = norm
  end
end
|