kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'vocabulary'
|
|
4
|
+
require_relative 'onnx_runtime_model'
|
|
5
|
+
|
|
6
|
+
module Kotoshu
|
|
7
|
+
module Embeddings
|
|
8
|
+
# Similarity search for embedding-based nearest neighbor lookup.
|
|
9
|
+
#
|
|
10
|
+
# Efficiently finds semantically similar words using cosine similarity.
|
|
11
|
+
# Supports both on-the-fly computation and pre-computed embedding matrices.
|
|
12
|
+
#
|
|
13
|
+
# @example Basic usage
|
|
14
|
+
# search = SimilaritySearch.new(
|
|
15
|
+
# vocabulary: vocab,
|
|
16
|
+
# model: model
|
|
17
|
+
# )
|
|
18
|
+
# neighbors = search.find_nearest('hello', k: 10)
|
|
19
|
+
#
|
|
20
|
+
# @example With pre-loaded embedding matrix (faster)
|
|
21
|
+
# search = SimilaritySearch.new(
|
|
22
|
+
# vocabulary: vocab,
|
|
23
|
+
# model: model,
|
|
24
|
+
# preload_embeddings: true
|
|
25
|
+
# )
|
|
26
|
+
# neighbors = search.find_nearest('hello', k: 10)
|
|
27
|
+
class SimilaritySearch
|
|
28
|
+
# @return [Vocabulary] The vocabulary
|
|
29
|
+
attr_reader :vocabulary
|
|
30
|
+
|
|
31
|
+
# @return [OnnxRuntimeModel] The ONNX model
|
|
32
|
+
attr_reader :model
|
|
33
|
+
|
|
34
|
+
# @return [Boolean] Whether embeddings are pre-loaded
|
|
35
|
+
attr_reader :embeddings_loaded
|
|
36
|
+
|
|
37
|
+
# Build a searcher over +vocabulary+ whose vectors are supplied by +model+.
#
# @param vocabulary [Vocabulary] Word vocabulary
# @param model [OnnxRuntimeModel] ONNX model used to fetch embeddings
# @param preload_embeddings [Boolean] When true, preload_embeddings! is run
#   before the constructor returns
# @param max_cache_size [Integer] Upper bound for the per-word embedding cache
def initialize(vocabulary:, model:, preload_embeddings: false, max_cache_size: 1000)
  # Collaborators and settings are stored verbatim.
  @model = model
  @vocabulary = vocabulary
  @max_cache_size = max_cache_size
  @preload_embeddings = preload_embeddings

  # Lookup state: per-word cache (word -> vector), the optional preloaded
  # matrix, and the flag exposed through #embeddings_loaded.
  @embeddings_loaded = false
  @embedding_matrix = nil
  @embedding_cache = {}

  return unless preload_embeddings

  preload_embeddings!
end
|
|
61
|
+
|
|
62
|
+
# Return up to +k+ vocabulary words most similar to +query_word+.
#
# The search runs against the preloaded matrix when one is present and
# otherwise against a sample of the vocabulary. In both helpers the
# "exclude self" filter drops every candidate whose similarity to the query
# is >= 0.9999 rather than comparing words.
#
# @param query_word [String] The query word
# @param k [Integer] Number of neighbors to return
# @param exclude_self [Boolean] Whether to drop (near-)exact matches
# @param min_similarity [Float] Minimum similarity a neighbor must reach
# @return [Array<Hash>] Array of {word, similarity} hashes, best first;
#   empty when no embedding is available for the query word
def find_nearest(query_word, k: 10, exclude_self: true, min_similarity: 0.0)
  query_vec = get_embedding(query_word)
  return [] unless query_vec

  # Pick the search strategy from the presence of the preloaded matrix.
  strategy = @embedding_matrix ? :nearest_from_matrix : :nearest_from_cache
  send(strategy, query_vec, k, exclude_self, min_similarity)
end
|
|
81
|
+
|
|
82
|
+
# Run find_nearest for every word in +query_words+.
#
# Only +k+ is forwarded; the other find_nearest options keep their defaults.
#
# @param query_words [Array<String>] Query words
# @param k [Integer] Number of neighbors per word
# @return [Hash<String, Array<Hash>>] Word to neighbors mapping
def find_nearest_batch(query_words, k: 10)
  pairs = query_words.map { |word| [word, find_nearest(word, k: k)] }
  pairs.to_h
end
|
|
92
|
+
|
|
93
|
+
# Cosine similarity between the embeddings of two words.
#
# Both embeddings are fetched before either is checked.
#
# @param word1 [String] First word
# @param word2 [String] Second word
# @return [Float, nil] Cosine similarity, or nil when either word has no
#   embedding
def similarity(word1, word2)
  first_vec, second_vec = [word1, word2].map { |word| get_embedding(word) }

  first_vec && second_vec ? cosine_similarity(first_vec, second_vec) : nil
end
|
|
106
|
+
|
|
107
|
+
# Compute the cosine similarity between two embedding vectors.
#
# @param vec1 [Array<Float>, nil] First vector
# @param vec2 [Array<Float>, nil] Second vector
# @return [Float] Cosine similarity in [-1.0, 1.0]; 0.0 when either vector
#   is nil or has zero magnitude
# @raise [ArgumentError] If the vectors have different lengths
def cosine_similarity(vec1, vec2)
  return 0.0 if vec1.nil? || vec2.nil?

  # zip would silently truncate (or pad with nil) on a length mismatch, which
  # yields either a wrong value or an opaque TypeError; fail clearly instead.
  unless vec1.size == vec2.size
    raise ArgumentError, "vector dimension mismatch: #{vec1.size} vs #{vec2.size}"
  end

  dot = vec1.zip(vec2).sum { |a, b| a * b }
  norm1 = Math.sqrt(vec1.sum { |x| x * x })
  norm2 = Math.sqrt(vec2.sum { |x| x * x })

  return 0.0 if norm1.zero? || norm2.zero?

  # Floating-point rounding can push the quotient marginally past +/-1.
  (dot / (norm1 * norm2)).clamp(-1.0, 1.0)
end
|
|
126
|
+
|
|
127
|
+
# Fetch the embedding of every index in 0...vocabulary.size from the model
# in one batch and keep the result as an index -> vector hash.
#
# Any StandardError raised while loading is reported via +warn+ and turned
# into a +false+ return value.
#
# @return [Boolean] true when the matrix was loaded by this call; false when
#   a matrix is already present, the model returned nothing, or loading
#   raised
def preload_embeddings!
  return false if @embedding_matrix

  indices = (0...@vocabulary.size).to_a
  vectors = @model.get_embeddings(indices)
  return false if vectors.nil? || vectors.empty?

  # Plain Hash for now (could use Numo::SFloat for efficiency).
  @embedding_matrix = indices.zip(vectors).to_h
  @embeddings_loaded = true
  true
rescue StandardError => error
  warn "Failed to preload embeddings: #{error.message}"
  false
end
|
|
152
|
+
|
|
153
|
+
# Drop all cached vectors.
#
# Besides emptying the per-word cache this also discards the preloaded
# matrix and resets the embeddings_loaded flag.
#
# @return [self] Self for chaining
def clear_cache
  @embeddings_loaded = false
  @embedding_matrix = nil
  @embedding_cache.clear
  self
end
|
|
162
|
+
|
|
163
|
+
# Report the state of the per-word embedding cache.
#
# :hit_rate is only included once the hit counter exists.
#
# @return [Hash] :size and :max_size, plus :hit_rate when available
def cache_stats
  report = { size: @embedding_cache.size, max_size: @max_cache_size }
  return report unless instance_variable_defined?(:@cache_hits)

  lookups = @cache_hits + @cache_misses
  report[:hit_rate] = @cache_hits.to_f / lookups
  report
end
|
|
174
|
+
|
|
175
|
+
# String representation.
|
|
176
|
+
#
|
|
177
|
+
# @return [String] String representation
|
|
178
|
+
def to_s
|
|
179
|
+
"SimilaritySearch(vocab_size: #{@vocabulary.size}, loaded: #{@embeddings_loaded})"
|
|
180
|
+
end
|
|
181
|
+
alias inspect to_s
|
|
182
|
+
|
|
183
|
+
private
|
|
184
|
+
|
|
185
|
+
# Get the embedding for a word, consulting the per-word cache first.
#
# Hit/miss counters are created lazily on the first call. A vector is read
# from the preloaded matrix when one exists, otherwise from the model. Only
# model-backed lookups are cached: the decision is based on whether a matrix
# is actually present, so caching still works when a requested preload
# failed or the matrix was cleared.
#
# @param word [String] The word
# @return [Array<Float>, nil] Embedding vector, or nil if the word is not in
#   the vocabulary or no vector is available for its index
def get_embedding(word)
  @cache_hits ||= 0
  @cache_misses ||= 0

  if @embedding_cache.key?(word)
    @cache_hits += 1
    return @embedding_cache[word]
  end
  @cache_misses += 1

  index = @vocabulary.lookup(word)
  return nil unless index

  vec = @embedding_matrix ? @embedding_matrix[index] : @model.get_embedding(index)
  return nil unless vec

  # The matrix already holds every vector in memory, so caching is only
  # useful without it. A non-positive limit disables caching entirely.
  if @embedding_matrix.nil? && @max_cache_size.positive?
    # Hash#shift removes the oldest inserted entry (FIFO eviction).
    @embedding_cache.shift if @embedding_cache.size >= @max_cache_size
    @embedding_cache[word] = vec
  end

  vec
end
|
|
223
|
+
|
|
224
|
+
# Score every vocabulary word against +query_vec+ using the preloaded matrix.
#
# Words whose index has no vector in the matrix are skipped.
#
# @param query_vec [Array<Float>] Query embedding
# @param k [Integer] Number of neighbors
# @param exclude_self [Boolean] When true, candidates with similarity
#   >= 0.9999 are dropped
# @param min_similarity [Float] Candidates scoring below this are dropped
# @return [Array<Hash>] Up to k {word:, similarity:} hashes, best first
def nearest_from_matrix(query_vec, k, exclude_self, min_similarity)
  scored = @vocabulary.words.each_with_object([]) do |candidate, kept|
    candidate_vec = @embedding_matrix[@vocabulary.lookup(candidate)]
    next unless candidate_vec

    score = cosine_similarity(query_vec, candidate_vec)
    near_exact = exclude_self && score >= 0.9999
    next if near_exact || score < min_similarity

    kept << { word: candidate, similarity: score }
  end

  # Highest similarity first, then cut to k.
  scored.sort_by { |entry| -entry[:similarity] }.first(k)
end
|
|
254
|
+
|
|
255
|
+
# Score a sample of the vocabulary against +query_vec+ without a preloaded
# matrix.
#
# Candidates come from sample_vocabulary(k * 10), so the result is limited
# to that sample. Each candidate vector is fetched through get_embedding;
# candidates without a vector are skipped.
#
# @param query_vec [Array<Float>] Query embedding
# @param k [Integer] Number of neighbors
# @param exclude_self [Boolean] When true, candidates with similarity
#   >= 0.9999 are dropped
# @param min_similarity [Float] Candidates scoring below this are dropped
# @return [Array<Hash>] Up to k {word:, similarity:} hashes, best first
def nearest_from_cache(query_vec, k, exclude_self, min_similarity)
  candidates = sample_vocabulary(k * 10)

  scored = candidates.each_with_object([]) do |candidate, kept|
    candidate_vec = get_embedding(candidate)
    next unless candidate_vec

    score = cosine_similarity(query_vec, candidate_vec)
    near_exact = exclude_self && score >= 0.9999
    next if near_exact || score < min_similarity

    kept << { word: candidate, similarity: score }
  end

  # Highest similarity first, then cut to k.
  scored.sort_by { |entry| -entry[:similarity] }.first(k)
end
|
|
286
|
+
|
|
287
|
+
# Pick candidate words for a search without a preloaded matrix.
#
# The head of the sample is vocabulary.common_words(n: min(n / 2, 100));
# the remainder is drawn at random from the words after that head
# (FastText orders by frequency). When the vocabulary is no larger than the
# head, only the head is returned.
#
# @param n [Integer] Number of words to sample
# @return [Array<String>] Sampled words
def sample_vocabulary(n)
  head_count = [100, n / 2].min
  head = @vocabulary.common_words(n: head_count)
  return head unless @vocabulary.size > head_count

  tail = @vocabulary.words.drop(head_count)
  head + tail.sample(n - head_count)
end
|
|
310
|
+
|
|
311
|
+
# Build a searcher from cached vocabulary and model files.
#
# Both Vocabulary.from_cache and OnnxRuntimeModel.from_cache are called
# before either result is checked. Although this definition sits below the
# +private+ marker, a bare +private+ does not apply to +def self.+ methods.
#
# @param language_code [String] ISO 639-1 language code
# @param cache [Cache::ModelCache, nil] Optional cache instance
# @param preload [Boolean] Whether to preload embeddings
# @return [SimilaritySearch, nil] New search instance or nil if either the
#   vocabulary or the model is not available
def self.from_cache(language_code, cache: nil, preload: false)
  vocab, model = [Vocabulary, OnnxRuntimeModel].map do |source|
    source.from_cache(language_code, cache: cache)
  end
  return nil unless vocab && model

  new(vocabulary: vocab, model: model, preload_embeddings: preload)
end
|
|
329
|
+
end
|
|
330
|
+
end
|
|
331
|
+
end
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require_relative 'protocol'
|
|
5
|
+
|
|
6
|
+
# Vocabulary - Word to index mapping
|
|
7
|
+
#
|
|
8
|
+
# Provides efficient lookup from words to integer indices for embedding retrieval.
|
|
9
|
+
# Supports JSON file loading and saving.
|
|
10
|
+
#
|
|
11
|
+
# @example Creating a vocabulary
|
|
12
|
+
# vocab = Kotoshu::Embeddings::Vocabulary.new(
|
|
13
|
+
# language_code: 'en',
|
|
14
|
+
# word_to_index: { 'hello' => 0, 'world' => 1 }
|
|
15
|
+
# )
|
|
16
|
+
#
|
|
17
|
+
# @example Loading from file
|
|
18
|
+
# vocab = Kotoshu::Embeddings::Vocabulary.from_file('/path/to/vocab.json', language_code: 'en')
|
|
19
|
+
#
|
|
20
|
+
class Vocabulary
|
|
21
|
+
include VocabularyProtocol
|
|
22
|
+
|
|
23
|
+
# @return [String] ISO 639-1 language code
|
|
24
|
+
attr_reader :language_code
|
|
25
|
+
|
|
26
|
+
# @return [Hash{String => Integer}] Word to index mapping
|
|
27
|
+
attr_reader :word_to_index
|
|
28
|
+
|
|
29
|
+
# @return [Array<String>] Index to word mapping (sparse array)
|
|
30
|
+
attr_reader :index_to_word
|
|
31
|
+
|
|
32
|
+
# Create a new vocabulary
#
# The mapping is copied and frozen. The reverse index (index -> word) is a
# sparse Array long enough to hold the largest index, so words whose index
# is >= the word count (e.g. in a sub-vocabulary that keeps the original
# indices) remain reachable through it. Entries whose index is not a
# non-negative Integer are left out of the reverse index.
#
# NOTE(review): the reverse Array grows to the largest index + 1, so a very
# large index in the input allocates a correspondingly large array.
#
# @param language_code [String] ISO 639-1 language code
# @param word_to_index [Hash{String => Integer}] Word to index mapping
#
# @raise [ArgumentError] If word_to_index is nil or empty
#
def initialize(language_code:, word_to_index:)
  raise ArgumentError, 'word_to_index cannot be empty' if word_to_index.nil? || word_to_index.empty?

  @language_code = language_code
  @word_to_index = word_to_index.dup.freeze

  # Build reverse index (index -> word); Array#[]= pads gaps with nil.
  reverse = []
  @word_to_index.each do |word, index|
    reverse[index] = word if index.is_a?(Integer) && index >= 0
  end
  @index_to_word = reverse.freeze
end
|
|
52
|
+
|
|
53
|
+
# Map a word to its embedding index.
#
# @param word [String] The word to look up
# @return [Integer, nil] The index stored for the word, or nil when the
#   word is not part of the vocabulary
#
def lookup(word)
  @word_to_index[word]
end
|
|
61
|
+
|
|
62
|
+
# Get word by index
#
# Only non-negative Integer indices are looked up. Without this guard,
# Array#[] would count negative indices from the end of the reverse index
# and would return a sub-array for a Range.
#
# @param index [Integer] The index to look up
# @return [String, nil] Word at the index, or nil if not found
#
def get_word(index)
  return nil unless index.is_a?(Integer) && index >= 0

  @index_to_word[index]
end
|
|
70
|
+
|
|
71
|
+
# Membership test for a word.
#
# @param word [String] Word to check
# @return [Boolean] True when the word is a key of the word-to-index mapping
#
def include?(word)
  @word_to_index.include?(word)
end
|
|
79
|
+
|
|
80
|
+
# Number of entries in the word-to-index mapping.
#
# @return [Integer] Number of words in vocabulary
#
def size
  @word_to_index.length
end
|
|
87
|
+
|
|
88
|
+
# Whether +index+ is an Integer within 0...(number of words).
#
# @param index [Integer] Index to check
# @return [Boolean] True if index is valid
#
def valid_index?(index)
  return false unless index.is_a?(Integer)

  index.between?(0, @word_to_index.size - 1)
end
|
|
96
|
+
|
|
97
|
+
# Return the first +n+ words of the mapping in insertion order.
#
# No frequency data is consulted here; the method simply takes the leading
# keys of the word-to-index hash.
#
# @param n [Integer] Number of words to return
# @return [Array<String>] Array of common words
#
def common_words(n: 10)
  @word_to_index.empty? ? [] : @word_to_index.each_key.first(n)
end
|
|
107
|
+
|
|
108
|
+
# Export the mapping as a plain Hash.
#
# @return [Hash{String => Integer}] Unfrozen copy of the word_to_index
#   mapping; changing it does not affect the vocabulary
#
def to_h
  @word_to_index.clone(freeze: false)
end
|
|
115
|
+
|
|
116
|
+
# Enumerate every word of the vocabulary.
#
# @return [Enumerator<String>] Enumerator over the keys of the
#   word-to-index mapping, in insertion order
#
def words
  @word_to_index.each_key
end
|
|
123
|
+
|
|
124
|
+
# Load vocabulary from JSON file
#
# The file may contain either an object (word => index) or an array of
# words, in which case each word's position becomes its index. The file is
# always read as UTF-8 so that non-ASCII vocabularies decode the same way
# regardless of the process's default external encoding.
#
# @param path [String] Path to JSON file
# @param language_code [String] Language code (auto-detected from filename if nil)
# @return [Vocabulary] New vocabulary instance
#
# @raise [ArgumentError] If file doesn't exist or the top-level JSON value
#   is neither an object nor an array
# @raise [JSON::ParserError] If file is not valid JSON
#
def self.from_file(path, language_code: nil)
  raise ArgumentError, "File not found: #{path}" unless File.exist?(path)

  language_code ||= detect_language_from_path(path)

  data = JSON.parse(File.read(path, encoding: 'UTF-8'))

  word_to_index =
    case data
    when Hash
      data
    when Array
      # each_with_index yields [word, index] pairs, which to_h turns into the mapping.
      data.each_with_index.to_h
    else
      raise ArgumentError, 'Invalid vocabulary format: expected Hash or Array'
    end

  new(language_code: language_code, word_to_index: word_to_index)
end
|
|
155
|
+
|
|
156
|
+
# Create vocabulary from Array of words
#
# Each word's position in +words+ becomes its index; when a word occurs more
# than once, its last position wins. The caller's strings are left
# untouched: Hash stores its own frozen copies of unfrozen String keys, so
# freezing the input objects is unnecessary.
#
# @param words [Array<String>] Array of words
# @param language_code [String] Language code
# @return [Vocabulary] New vocabulary instance
#
def self.from_words(words, language_code: 'en')
  new(language_code: language_code, word_to_index: words.each_with_index.to_h)
end
|
|
171
|
+
|
|
172
|
+
# Write the vocabulary to +path+ as pretty-printed JSON.
#
# With :hash the word => index mapping is written. With :array the reverse
# index is written with its nil gaps removed, so positions in the output
# only equal the original indices when the reverse index has no gaps.
#
# @param path [String] Path to save file
# @param format [Symbol] Format: :hash or :array
# @raise [ArgumentError] If format is neither :hash nor :array
#
def save_to_file(path, format: :hash)
  payload =
    if format == :hash
      @word_to_index.dup
    elsif format == :array
      @index_to_word.compact
    else
      raise ArgumentError, "Unknown format: #{format}"
    end

  File.write(path, JSON.pretty_generate(payload))
end
|
|
190
|
+
|
|
191
|
+
# Whether the word-to-index mapping has no entries.
#
# @return [Boolean] True if empty
#
def empty?
  @word_to_index.size.zero?
end
|
|
198
|
+
|
|
199
|
+
# Draw random words from the vocabulary without repetition.
#
# @param n [Integer] Number of words to sample
# @return [Array<String>] At most n distinct words (fewer when the
#   vocabulary is smaller than n)
#
def sample(n: 10)
  all_words = @word_to_index.keys
  all_words.sample(n)
end
|
|
207
|
+
|
|
208
|
+
# Create a sub-vocabulary containing only specified words
#
# Words keep their original indices and the order they have in this
# vocabulary. Requested words that are not in the vocabulary are ignored.
# Membership is checked through a Hash built once from +words+, so the
# filter is linear in the vocabulary size instead of scanning +words+ for
# every entry.
#
# @param words [Array<String>] Words to include
# @return [Vocabulary] New vocabulary with subset of words
# @raise [ArgumentError] If none of the words are in this vocabulary (the
#   constructor rejects an empty mapping)
#
def sub_vocabulary(words)
  wanted = words.each_with_object({}) { |word, seen| seen[word] = true }
  filtered = @word_to_index.select { |word, _| wanted.key?(word) }
  self.class.new(language_code: @language_code, word_to_index: filtered)
end
|
|
217
|
+
|
|
218
|
+
# Find words starting with a prefix
#
# Uses String#start_with?, which only ever looks at the beginning of the
# word. A +^+ regex anchor would also match right after a newline inside a
# word.
#
# @param prefix [String] Prefix to match (taken literally)
# @return [Array<String>] Matching words, in vocabulary order
#
def words_starting_with(prefix)
  literal = prefix.to_s
  @word_to_index.each_key.select { |word| word.start_with?(literal) }
end
|
|
227
|
+
|
|
228
|
+
# String representation
|
|
229
|
+
#
|
|
230
|
+
# @return [String]
|
|
231
|
+
#
|
|
232
|
+
def to_s
|
|
233
|
+
"Vocabulary(language: #{@language_code}, size: #{@word_to_index.size})"
|
|
234
|
+
end
|
|
235
|
+
alias inspect to_s
|
|
236
|
+
|
|
237
|
+
private_class_method
|
|
238
|
+
|
|
239
|
+
# Detect language code from file path
|
|
240
|
+
#
|
|
241
|
+
# @param path [String] File path
|
|
242
|
+
# @return [String] Detected language code
|
|
243
|
+
#
|
|
244
|
+
def self.detect_language_from_path(path)
  # The language code is the run of word characters immediately before the
  # ".vocab.json" suffix of the file name. This one pattern also covers names
  # with extra dot-separated segments in front ("model.en.vocab.json" -> "en"),
  # which is why no separate check for a leading dot is needed.
  match = File.basename(path).match(/(\w+)\.vocab\.json\z/)

  # Fall back to 'unknown' when the file name does not follow the convention.
  match ? match[1] : 'unknown'
end
|
|
257
|
+
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Load all components
|
|
4
|
+
require_relative 'embeddings/protocol'
|
|
5
|
+
require_relative 'embeddings/lru_cache'
|
|
6
|
+
require_relative 'embeddings/vocabulary'
|
|
7
|
+
require_relative 'embeddings/onnx_runtime_model'
|
|
8
|
+
require_relative 'embeddings/similarity_engine'
|
|
9
|
+
require_relative 'embeddings/search'
|
|
10
|
+
require_relative 'embeddings/embedding_pipeline'
|
|
11
|
+
require_relative 'embeddings/registry'
|
|
12
|
+
|
|
13
|
+
# Embeddings module for FastText ONNX integration.
|
|
14
|
+
#
|
|
15
|
+
# Provides semantic spell checking using FastText word embeddings.
|
|
16
|
+
# Supports 157 languages through pre-converted ONNX models.
|
|
17
|
+
#
|
|
18
|
+
# @example Simple usage (recommended)
|
|
19
|
+
# pipeline = Kotoshu::Embeddings.from_cache(language: 'en')
|
|
20
|
+
# neighbors = pipeline.find_nearest('semantic', k: 5)
|
|
21
|
+
#
|
|
22
|
+
# @example Advanced usage
|
|
23
|
+
# vocab = Kotoshu::Embeddings::Vocabulary.from_file('vocab.json')
|
|
24
|
+
# model = Kotoshu::Embeddings::OnnxRuntimeModel.from_file('model.onnx')
|
|
25
|
+
# engine = Kotoshu::Embeddings::SimilarityEngine.new(pre_normalize: true)
|
|
26
|
+
#
|
|
27
|
+
module Kotoshu
  module Embeddings
    # Constants
    DEFAULT_DIMENSION = 300
    MAX_VOCABULARY_SIZE = 100_000
    VERSION = '2.0.0'

    # Expose the top-level component classes under Kotoshu::Embeddings
    Vocabulary = ::Vocabulary
    OnnxRuntimeModel = ::OnnxRuntimeModel
    SimilarityEngine = ::SimilarityEngine
    Search = ::Search
    EmbeddingPipeline = ::EmbeddingPipeline
    LruCache = ::LruCache
    Registry = ::EmbeddingRegistry

    # Protocols namespace
    module Protocols
      EmbeddingModel = ::EmbeddingModelProtocol
      SimilarityEngine = ::SimilarityEngineProtocol
      Vocabulary = ::VocabularyProtocol
    end

    # Create an EmbeddingPipeline from cache
    #
    # @param language [String] ISO 639-1 language code
    # @param preload [Boolean] Preload embeddings into memory
    # @param index [Symbol] Index option, passed through unchanged to
    #   EmbeddingPipeline.from_cache (defaults to :exact)
    # @return [EmbeddingPipeline]
    #
    def self.from_cache(language:, preload: false, index: :exact)
      EmbeddingPipeline.from_cache(language: language, preload: preload, index: index)
    end

    # Check if a language is supported
    #
    # A language counts as supported when the model cache lists :onnx among
    # the models available for it.
    #
    # @param language [String] ISO 639-1 language code
    # @return [Boolean]
    #
    def self.language_supported?(language)
      build_model_cache.available_models_for(language.to_sym).include?(:onnx)
    end

    # List all supported languages
    #
    # @return [Array<String>] Language codes that have an :onnx model entry;
    #   empty when the model cache reports no :onnx entry at all
    #
    def self.supported_languages
      # Guard against a missing :onnx entry so the call returns an empty list
      # instead of raising NoMethodError on nil.
      onnx_models = build_model_cache.all_available_models[:onnx] || {}
      onnx_models.keys.map(&:to_s)
    end

    # Create a custom embedding pipeline
    #
    # @param vocabulary [Vocabulary] Vocabulary instance
    # @param model [EmbeddingModel] Model instance
    # @param preload [Boolean] Preload embeddings
    # @param pre_normalize [Boolean] Passed through unchanged to
    #   EmbeddingPipeline.new (defaults to false)
    # @return [EmbeddingPipeline]
    #
    def self.create_pipeline(vocabulary:, model:, preload: false, pre_normalize: false)
      EmbeddingPipeline.new(
        vocabulary: vocabulary,
        model: model,
        preload: preload,
        pre_normalize: pre_normalize
      )
    end

    # Load the model cache implementation on first use and build a new
    # instance; shared by the language-support queries above.
    #
    # @return [Cache::ModelCache]
    #
    def self.build_model_cache
      # NOTE(review): require_relative resolves against this file's directory;
      # confirm '../cache/model_cache' points at the intended model_cache.rb.
      require_relative '../cache/model_cache'
      Cache::ModelCache.new
    end
    private_class_method :build_model_cache
  end
end
|