kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'vocabulary'
|
|
4
|
+
require_relative 'onnx_runtime_model'
|
|
5
|
+
require_relative 'similarity_engine'
|
|
6
|
+
require_relative 'search'
|
|
7
|
+
require_relative 'lru_cache'
|
|
8
|
+
|
|
9
|
+
# EmbeddingPipeline - Unified API for embedding-based similarity search
|
|
10
|
+
#
|
|
11
|
+
# Provides a simple, unified interface for loading vocabulary and models,
|
|
12
|
+
# and performing similarity search. This is the recommended entry point.
|
|
13
|
+
#
|
|
14
|
+
# @example Simple usage (one line)
|
|
15
|
+
# pipeline = EmbeddingPipeline.from_cache(language: 'en')
|
|
16
|
+
#
|
|
17
|
+
# @example Full configuration
|
|
18
|
+
# pipeline = EmbeddingPipeline.new(
|
|
19
|
+
# vocabulary: vocab,
|
|
20
|
+
# model: model,
|
|
21
|
+
# preload: true
|
|
22
|
+
# )
|
|
23
|
+
#
|
|
24
|
+
# @example Finding similar words
|
|
25
|
+
# neighbors = pipeline.find_nearest('semantic', k: 5)
|
|
26
|
+
# neighbors.each { |r| puts "#{r[:word]}: #{r[:similarity].round(4)}" }
|
|
27
|
+
#
|
|
28
|
+
class EmbeddingPipeline
  # @return [Vocabulary] vocabulary passed to the constructor
  attr_reader :vocabulary

  # @return [EmbeddingModel] model passed to the constructor
  attr_reader :model

  # @return [SimilarityEngine] engine built by the constructor
  attr_reader :similarity_engine

  # @return [Search] search object every query method delegates to
  attr_reader :search

  # Create a pipeline from cached artifacts (one-line initialization).
  #
  # Asks the cache for the vocabulary path and the ONNX model path of
  # +language+ and hands both to {.from_files}.
  #
  # @param language [String] ISO 639-1 language code
  # @param cache [Cache::ModelCache, nil] Object responding to +find_vocab+ and
  #   +find_model+; when nil a default Cache::ModelCache is loaded and built
  # @param preload [Boolean] Preload embeddings into memory
  # @param index [Symbol] Search index type (forwarded to {.from_files})
  # @return [EmbeddingPipeline]
  #
  # @raise [ArgumentError] If the cache has no vocabulary or no model for language
  #
  def self.from_cache(language:, cache: nil, preload: false, index: :exact)
    # Only load the cache implementation when the caller did not inject one.
    cache ||= default_model_cache

    vocab_path = cache.find_vocab(language)
    model_path = cache.find_model(language, :onnx)

    unless vocab_path && model_path
      raise ArgumentError, "No cached model for language: #{language}. " \
                           "Run: ruby scripts/extract_vocabularies.rb --languages=#{language}"
    end

    from_files(
      vocab_path: vocab_path,
      model_path: model_path,
      language: language,
      preload: preload,
      index: index
    )
  end

  # Create a pipeline from files on disk.
  #
  # @param vocab_path [String] Path to vocabulary JSON file
  # @param model_path [String] Path to ONNX model file
  # @param language [String] Language code passed to both loaders
  # @param preload [Boolean] Preload embeddings
  # @param index [Symbol] Search index type (forwarded to {#initialize})
  # @return [EmbeddingPipeline]
  #
  def self.from_files(vocab_path:, model_path:, language:, preload: false, index: :exact)
    vocab = Vocabulary.from_file(vocab_path, language_code: language)
    model = OnnxRuntimeModel.from_file(model_path, language_code: language)

    new(
      vocabulary: vocab,
      model: model,
      preload: preload,
      index: index
    )
  end

  # Build the default model cache.
  #
  # This file lives in the embeddings/ directory while the cache
  # implementation lives in the sibling cache/ directory, hence the "../".
  # NOTE(review): assumes model_cache.rb defines the constant as
  # Cache::ModelCache reachable from top level — confirm the namespace.
  #
  # @return [Cache::ModelCache]
  #
  def self.default_model_cache
    require_relative '../cache/model_cache'

    Cache::ModelCache.new
  end
  private_class_method :default_model_cache

  # Create pipeline with full configuration.
  #
  # @param vocabulary [Vocabulary] Vocabulary instance
  # @param model [EmbeddingModel] Model instance
  # @param preload [Boolean] Call {#preload_embeddings!} before returning
  # @param index [Symbol] Search index type; accepted for API compatibility
  #   but not consumed by this constructor
  # @param pre_normalize [Boolean] Passed to both SimilarityEngine and Search
  # @param cache_size [Integer] Embedding cache size; stored on the pipeline
  #   but not forwarded to Search by this constructor
  #
  def initialize(vocabulary:, model:, preload: false, index: :exact, pre_normalize: false, cache_size: 1000)
    @vocabulary = vocabulary
    @model = model
    @similarity_engine = SimilarityEngine.new(pre_normalize: pre_normalize)
    @cache_size = cache_size

    # Create search engine
    @search = Search.new(
      vocabulary: vocabulary,
      model: model,
      similarity_engine: @similarity_engine,
      pre_normalize: pre_normalize
    )

    preload_embeddings! if preload
  end

  # Find k nearest neighbors for a word (delegates to Search).
  #
  # @param word [String] Query word
  # @param k [Integer] Number of neighbors
  # @param exclude_self [Boolean] Exclude query word
  # @param min_similarity [Float] Minimum similarity threshold
  # @return [Array<Hash>] Array of {word, similarity, index}
  #
  def find_nearest(word, k: 10, exclude_self: true, min_similarity: 0.0)
    @search.find_nearest(word, k: k, exclude_self: exclude_self, min_similarity: min_similarity)
  end

  # Find nearest neighbors for multiple words (delegates to Search).
  #
  # @param words [Array<String>] Query words
  # @param k [Integer] Neighbors per word
  # @return [Hash<String, Array<Hash>>]
  #
  def find_nearest_batch(words, k: 10)
    @search.find_nearest_batch(words, k: k)
  end

  # Compute similarity between two words (delegates to Search).
  #
  # @param word1 [String] First word
  # @param word2 [String] Second word
  # @return [Float, nil] Similarity or nil if either word not found
  #
  def similarity(word1, word2)
    @search.similarity(word1, word2)
  end

  # Get embedding for a word, resolved through the pipeline's vocabulary.
  #
  # @param word [String] Word
  # @return [Array<Float>, nil]
  #
  def get_embedding(word)
    @model.get_embedding_for_word(word, @vocabulary)
  end

  # Get embedding by index.
  #
  # @param index [Integer] Word index
  # @return [Array<Float>, nil]
  #
  def get_embedding_by_index(index)
    @model.get_embedding(index)
  end

  # Check if word exists in vocabulary.
  #
  # @param word [String] Word
  # @return [Boolean]
  #
  def include?(word)
    @vocabulary.include?(word)
  end

  # Load the model, then ask the search object to preload its embeddings.
  #
  # @return [self]
  #
  def preload_embeddings!
    @model.load!
    @search.preload_embeddings!
    self
  end

  # Unload the model and clear the search object's cache.
  #
  # @return [self]
  #
  def unload!
    @model.unload!
    @search.clear_cache
    self
  end

  # Get pipeline statistics.
  #
  # @return [Hash] language, vocabulary_size, embedding_dimension,
  #   model_loaded, embeddings_preloaded and cache_stats (nil when the search
  #   object holds no embedding cache)
  #
  def stats
    # Search does not expose its embedding cache through a reader here, so the
    # instance variable is read directly; a missing cache yields nil.
    embedding_cache = @search.instance_variable_get(:@embedding_cache)

    {
      language: @vocabulary.language_code,
      vocabulary_size: @vocabulary.size,
      embedding_dimension: @model.dimension,
      model_loaded: @model.loaded?,
      embeddings_preloaded: @search.embeddings_loaded,
      cache_stats: embedding_cache&.stats
    }
  end

  # Get model information (delegates to the model).
  #
  # @return [Hash]
  #
  def model_info
    @model.model_info
  end

  # String representation.
  #
  # @return [String]
  #
  def to_s
    "EmbeddingPipeline(language: #{@vocabulary.language_code}, " \
      "vocab_size: #{@vocabulary.size}, " \
      "dimension: #{@model.dimension}, " \
      "loaded: #{@model.loaded?})"
  end
  alias inspect to_s

  # Convenience class methods
  class << self
    # Create pipeline for a specific language (shortcut for {.from_cache}).
    #
    # Accepts the language positionally (EmbeddingPipeline['en']) as well as
    # the keyword form from_cache itself requires
    # (EmbeddingPipeline[language: 'en']). An explicit language: keyword takes
    # precedence over the positional argument.
    #
    # @param language [String, nil] ISO 639-1 language code
    # @param kwargs [Hash] Additional options accepted by {.from_cache}
    # @return [EmbeddingPipeline]
    #
    # @raise [ArgumentError] If no language is given or no cached model exists
    #
    def [](language = nil, **kwargs)
      kwargs = { language: language }.merge(kwargs) unless language.nil?
      from_cache(**kwargs)
    end
  end
end
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# LruCache - Least Recently Used Cache
|
|
4
|
+
#
|
|
5
|
+
# Provides efficient O(1) LRU caching with optional TTL support.
|
|
6
|
+
# Used for caching embeddings during similarity search.
|
|
7
|
+
#
|
|
8
|
+
# @example Basic usage
|
|
9
|
+
# cache = LruCache.new(max_size: 1000)
|
|
10
|
+
# cache[:key] = value
|
|
11
|
+
# cache[:key] # => value
|
|
12
|
+
#
|
|
13
|
+
# @example With TTL
|
|
14
|
+
# cache = LruCache.new(max_size: 1000, ttl: 300) # 5 minutes
|
|
15
|
+
#
|
|
16
|
+
class LruCache
  # @return [Integer] Maximum number of entries
  attr_reader :max_size

  # @return [Numeric, nil] TTL in seconds (nil = entries never expire)
  attr_reader :ttl

  # @return [Integer] Number of cache hits
  attr_reader :hits

  # @return [Integer] Number of cache misses (absent or expired keys)
  attr_reader :misses

  # Create a new LRU cache.
  #
  # Recency is tracked through Hash insertion order: the first key of the
  # internal hash is the least recently used, the last key the most recently
  # used. Re-inserting a key moves it to the end, which keeps reads and writes
  # free of linear scans.
  #
  # @param max_size [Integer] Maximum number of entries (default: 1000);
  #   a value of zero or less disables storage entirely
  # @param ttl [Numeric, nil] Time-to-live in seconds (default: nil = no expiry)
  #
  def initialize(max_size: 1000, ttl: nil)
    @max_size = max_size
    @ttl = ttl
    @cache = {} # key -> {value: v, created_at: monotonic seconds}, LRU first
    @hits = 0
    @misses = 0
  end

  # Get value for key and mark the key as most recently used.
  #
  # Every lookup is counted: a live entry is a hit, an absent or expired key
  # is a miss.
  #
  # @param key [Object] Cache key
  # @return [Object, nil] Cached value or nil if not found/expired
  #
  def [](key)
    entry = live_entry(key)
    unless entry
      @misses += 1
      return nil
    end

    touch(key, entry)
    @hits += 1
    entry[:value]
  end

  # Set value for key.
  #
  # Writing a key (new or existing) makes it the most recently used entry and
  # restarts its TTL, so a refreshed value is never treated as already
  # expired. When the cache is full the least recently used entry is evicted.
  #
  # @param key [Object] Cache key
  # @param value [Object] Value to cache
  # @return [Object] The value
  #
  def []=(key, value)
    # Drop any previous entry first so the write lands at the MRU end.
    @cache.delete(key)
    # A non-positive capacity means nothing can be stored.
    return value unless @max_size.positive?

    evict_lru while @cache.size >= @max_size

    @cache[key] = { value: value, created_at: now }
    value
  end

  # Check if key exists. Does not change recency or hit/miss counters;
  # an expired entry is removed as a side effect.
  #
  # @param key [Object] Cache key
  # @return [Boolean] True if key exists and not expired
  #
  def key?(key)
    !live_entry(key).nil?
  end

  # Delete key from cache.
  #
  # @param key [Object] Cache key
  # @return [Object, nil] Deleted value or nil
  #
  def delete(key)
    entry = @cache.delete(key)
    entry && entry[:value]
  end

  # Clear all entries (hit/miss counters are kept).
  #
  # @return [self]
  #
  def clear
    @cache.clear
    self
  end

  # Get current size. Expired entries that have not been looked up yet are
  # still counted.
  #
  # @return [Integer] Number of entries
  #
  def size
    @cache.size
  end

  # Check if empty.
  #
  # @return [Boolean]
  #
  def empty?
    @cache.empty?
  end

  # Get least recently used key-value pair (no TTL check, no reordering).
  #
  # @return [Array<Object, Object>, nil]
  #
  def lru
    pair = @cache.first
    pair && [pair[0], pair[1][:value]]
  end

  # Get most recently used key-value pair (no TTL check, no reordering).
  #
  # @return [Array<Object, Object>, nil]
  #
  def mru
    return nil if @cache.empty?

    key = @cache.keys.last
    [key, @cache[key][:value]]
  end

  # Get all keys, most recently used first.
  #
  # @return [Array<Object>] Array of keys
  #
  def keys
    @cache.keys.reverse
  end

  # Get all values, most recently used first.
  #
  # @return [Array<Object>] Array of values
  #
  def values
    @cache.each_value.map { |entry| entry[:value] }.reverse
  end

  # Get cache statistics.
  #
  # @return [Hash] size, max_size, hits, misses, hit_rate (0.0 when nothing
  #   has been looked up yet) and ttl
  #
  def stats
    total = @hits + @misses
    {
      size: size,
      max_size: @max_size,
      hits: @hits,
      misses: @misses,
      hit_rate: total.zero? ? 0.0 : @hits.to_f / total,
      ttl: @ttl
    }
  end

  # Fetch with block (cache-aside pattern).
  #
  # Returns the cached value when the key is live (including cached nil or
  # false); otherwise counts a miss, stores the block's result and returns it.
  #
  # @param key [Object] Cache key
  # @return [Object] Cached value or block result
  #
  # @raise [KeyError] If the key is missing and no block was given
  #
  def fetch(key, &block)
    entry = live_entry(key)
    if entry
      touch(key, entry)
      @hits += 1
      return entry[:value]
    end

    @misses += 1
    raise KeyError, "key not found: #{key.inspect}" unless block

    self[key] = block.call
  end

  private

  # Return the stored entry for key, or nil when it is absent or expired.
  # Expired entries are removed on the spot.
  #
  def live_entry(key)
    entry = @cache[key]
    return nil unless entry
    return entry unless expired?(entry)

    @cache.delete(key)
    nil
  end

  # True when a TTL is configured and the entry has outlived it.
  #
  def expired?(entry)
    @ttl && (now - entry[:created_at]) > @ttl
  end

  # Move key to the most-recently-used end of the insertion order.
  #
  def touch(key, entry)
    @cache.delete(key)
    @cache[key] = entry
  end

  # Evict least recently used entry (the first key in insertion order).
  #
  def evict_lru
    @cache.shift unless @cache.empty?
  end

  # Monotonic timestamp in seconds; unaffected by wall-clock adjustments.
  #
  def now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end
end
|