kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,596 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "net/http"
|
|
5
|
+
require "json"
|
|
6
|
+
require "digest"
|
|
7
|
+
require "uri"
|
|
8
|
+
require "time"
|
|
9
|
+
|
|
10
|
+
require_relative "../integrity"
|
|
11
|
+
|
|
12
|
+
module Kotoshu
|
|
13
|
+
module Cache
|
|
14
|
+
# Abstract base class for all cache implementations.
|
|
15
|
+
#
|
|
16
|
+
# Provides common functionality for:
|
|
17
|
+
# - HTTP downloads with metadata
|
|
18
|
+
# - Cache validation (exists, expired)
|
|
19
|
+
# - Statistics tracking (hits, misses, hit rate)
|
|
20
|
+
# - TTL management
|
|
21
|
+
#
|
|
22
|
+
# Subclasses implement specific download and loading logic.
|
|
23
|
+
#
|
|
24
|
+
# @abstract Subclass must implement {#download_resource}, {#load_cached}
|
|
25
|
+
class BaseCache
|
|
26
|
+
# @return [String] Path to the cache directory
|
|
27
|
+
attr_reader :cache_path
|
|
28
|
+
|
|
29
|
+
# @return [String] Base URL for downloading resources
|
|
30
|
+
attr_reader :url_base
|
|
31
|
+
|
|
32
|
+
# @return [Integer] Cache TTL in seconds
|
|
33
|
+
attr_reader :cache_ttl
|
|
34
|
+
|
|
35
|
+
# @return [String] GitHub repository URL
|
|
36
|
+
attr_reader :github_url
|
|
37
|
+
|
|
38
|
+
# @return [Kotoshu::SourceRegistry] Single source of truth for
|
|
39
|
+
# per-repo URLs and pins. Subclasses MUST build URLs through
|
|
40
|
+
# this registry rather than constructing URL strings inline.
|
|
41
|
+
attr_reader :source_registry
|
|
42
|
+
|
|
43
|
+
# Build a cache and make sure its on-disk directories exist.
#
# Every argument is optional. A nil argument falls back to a default:
# the source registry supplies the base URL and the pin when +url_base+
# or +resource_pin+ are not given.
#
# @param cache_path [String, nil] Cache directory (defaults to {#default_cache_path})
# @param url_base [String, nil] Base URL for downloads (deprecated; pass source_registry instead)
# @param cache_ttl [Integer, nil] Cache TTL in seconds (defaults to {#default_cache_ttl})
# @param github_url [String, nil] GitHub repository URL
# @param resource_pin [String, nil] Branch/tag/commit for URL templates (deprecated; use source_registry)
# @param manifest_url [String, nil] Override manifest.json URL
# @param audit_log [Integrity::AuditLog, nil] Override audit log
# @param source_registry [Kotoshu::SourceRegistry, nil] Single source of truth for URLs/pins
def initialize(cache_path: nil, url_base: nil, cache_ttl: nil, github_url: nil,
               resource_pin: nil, manifest_url: nil, audit_log: nil,
               source_registry: nil)
  @cache_path = cache_path || default_cache_path

  # The registry is resolved first because url_base and resource_pin fall back to it.
  @source_registry = source_registry || default_source_registry
  @url_base = url_base || @source_registry.base_url
  @cache_ttl = cache_ttl || default_cache_ttl
  @github_url = github_url || default_github_url
  @resource_pin = resource_pin || @source_registry.pin_for_source(:spelling)

  # Manifest state: the manifest itself is fetched lazily on first verification.
  @manifest_url = manifest_url
  @audit_log = audit_log || Kotoshu::Integrity::AuditLog.new
  @manifest = nil
  @manifest_loaded = false

  # Hit/miss counters reported by #stats.
  @hits = 0
  @misses = 0

  # Create the cache root and its "tmp" scratch directory.
  tmp_dir = File.join(@cache_path, "tmp")
  FileUtils.mkdir_p(@cache_path)
  FileUtils.mkdir_p(tmp_dir)
end
|
|
73
|
+
|
|
74
|
+
# Tell whether a resource can be served from the cache right now.
#
# A resource counts as available when it is supported, its metadata file
# exists and has not expired, and the subclass reports its files present.
#
# @param resource_id [String] The resource identifier (e.g., language code)
# @return [Boolean] True if resource is cached and valid
def available?(resource_id)
  return false unless supports_resource?(resource_id)

  meta_file = metadata_path_for(resource_id)
  return false if !File.exist?(meta_file) || expired?(meta_file)

  resource_files_exist?(resource_id)
end
|
|
87
|
+
|
|
88
|
+
# Get a resource from cache or download it.
#
# A cache hit requires the same conditions as {#available?}: metadata that
# exists and has not expired, plus the resource files themselves being
# present. When the metadata survives but the files are gone, the resource
# is downloaded again instead of being loaded from missing files.
#
# @param resource_id [String] The resource identifier
# @param force_download [Boolean] Force re-download even if cached
# @return [Object, nil] The cached resource or nil if not available
def get(resource_id, force_download: false)
  return nil unless supports_resource?(resource_id)

  metadata_path = metadata_path_for(resource_id)

  fresh = !force_download && cached?(metadata_path) && !expired?(metadata_path)
  if fresh && resource_files_exist?(resource_id)
    @hits += 1
    return load_cached(resource_id)
  end

  # Anything else (forced, absent, stale, or incomplete on disk) is a miss.
  @misses += 1
  download(resource_id)
end
|
|
106
|
+
|
|
107
|
+
# Remove one resource's directory from the cache.
#
# @param resource_id [String] The resource identifier
# @return [Boolean] True when a directory was removed; false when the
#   resource is unsupported or nothing was cached for it
def clear(resource_id)
  return false unless supports_resource?(resource_id)

  target = resource_dir_for(resource_id)
  return false unless File.exist?(target)

  FileUtils.rm_rf(target)
  true
end
|
|
122
|
+
|
|
123
|
+
# Wipe the whole cache directory and start over.
#
# Resets the hit/miss counters, deletes the cache root recursively, then
# recreates the root and its "tmp" subdirectory.
#
# @return [void]
def clear_all
  @hits = @misses = 0

  FileUtils.rm_rf(@cache_path)

  scratch_dir = File.join(@cache_path, "tmp")
  FileUtils.mkdir_p(@cache_path)
  FileUtils.mkdir_p(scratch_dir)
end
|
|
133
|
+
|
|
134
|
+
# Report usage counters together with a snapshot of what is on disk.
#
# @return [Hash] :hits, :misses, :total, :hit_rate (0.0 when no lookups
#   have happened yet), :cached_resources, :size_bytes, :oldest_entry
def stats
  lookups = @hits + @misses

  ratio =
    if lookups.positive?
      @hits.to_f / lookups
    else
      0.0
    end

  {
    hits: @hits,
    misses: @misses,
    total: lookups,
    hit_rate: ratio,
    cached_resources: cached_resources,
    size_bytes: cache_size,
    oldest_entry: oldest_entry
  }
end
|
|
151
|
+
|
|
152
|
+
# Zero the hit and miss counters without touching anything on disk.
#
# @return [self] Self for chaining
def reset_stats
  @hits = @misses = 0
  self
end
|
|
160
|
+
|
|
161
|
+
# Run cache housekeeping: drop expired entries first, then apply the
# size-based cleanup.
#
# @return [Hash] :expired_entries_removed and :bytes_reclaimed
def clean
  removed = clean_expired
  reclaimed = clean_by_size

  { expired_entries_removed: removed, bytes_reclaimed: reclaimed }
end
|
|
173
|
+
|
|
174
|
+
# Abstract: enumerate the resources currently held in the cache.
#
# The base class only raises. NotImplementedError is a ScriptError, so a
# plain +rescue StandardError+ will not swallow it.
#
# @return [Array<String>] List of cached resource identifiers
# @raise [NotImplementedError] Always, until a subclass overrides this
def cached_resources
  raise(NotImplementedError, "Subclass must implement")
end
|
|
180
|
+
|
|
181
|
+
# Abstract: decide whether this cache handles the given resource.
#
# The public entry points (#available?, #get, #clear, #download) call this
# first and bail out when it returns false.
#
# @param resource_id [String] The resource identifier
# @return [Boolean] True if supported
# @raise [NotImplementedError] Always, until a subclass overrides this
def supports_resource?(resource_id)
  raise(NotImplementedError, "Subclass must implement")
end
|
|
188
|
+
|
|
189
|
+
# Fetch a resource into its cache directory, best effort.
#
# Creates the resource directory, then delegates to {#download_resource}.
# Any StandardError raised by the subclass is swallowed: it is reported via
# +warn+ only when $VERBOSE is set, and nil is returned.
#
# @param resource_id [String] The resource identifier
# @return [Object, nil] Downloaded resource or nil on error
def download(resource_id)
  return nil unless supports_resource?(resource_id)

  target_dir = resource_dir_for(resource_id)
  FileUtils.mkdir_p(target_dir)

  begin
    download_resource(resource_id, target_dir)
  rescue StandardError => error
    warn "Error downloading #{resource_id}: #{error.message}" if $VERBOSE
    nil
  end
end
|
|
206
|
+
|
|
207
|
+
# Abstract: perform the actual download of one resource.
#
# Called by #download after the destination directory has been created.
#
# @param resource_id [String] The resource identifier
# @param dest_path [String] Destination directory
# @return [Object] Downloaded resource
# @raise [NotImplementedError] Always, until a subclass overrides this
# @abstract Subclass must implement
def download_resource(resource_id, dest_path)
  raise(NotImplementedError, "Subclass must implement")
end
|
|
216
|
+
|
|
217
|
+
# Abstract: read an already-cached resource back from disk.
#
# Called by #get on a cache hit.
#
# @param resource_id [String] The resource identifier
# @return [Object, nil] Loaded resource or nil
# @raise [NotImplementedError] Always, until a subclass overrides this
# @abstract Subclass must implement
def load_cached(resource_id)
  raise(NotImplementedError, "Subclass must implement")
end
|
|
225
|
+
|
|
226
|
+
# Abstract: locate the metadata file that describes a cached resource.
#
# The existence and timestamp of this file drive #available? and #get.
#
# @param resource_id [String] The resource identifier
# @return [String] Metadata file path
# @raise [NotImplementedError] Always, until a subclass overrides this
# @abstract Subclass must implement
def metadata_path_for(resource_id)
  raise(NotImplementedError, "Subclass must implement")
end
|
|
234
|
+
|
|
235
|
+
# Abstract: locate the directory that holds a resource's files.
#
# #download creates this directory and #clear deletes it recursively.
#
# @param resource_id [String] The resource identifier
# @return [String] Resource directory path
# @raise [NotImplementedError] Always, until a subclass overrides this
# @abstract Subclass must implement
def resource_dir_for(resource_id)
  raise(NotImplementedError, "Subclass must implement")
end
|
|
243
|
+
|
|
244
|
+
# Abstract: confirm that every file making up a resource is on disk.
#
# This is the final check performed by #available?.
#
# @param resource_id [String] The resource identifier
# @return [Boolean] True if all files exist
# @raise [NotImplementedError] Always, until a subclass overrides this
# @abstract Subclass must implement
def resource_files_exist?(resource_id)
  raise(NotImplementedError, "Subclass must implement")
end
|
|
252
|
+
|
|
253
|
+
protected
|
|
254
|
+
|
|
255
|
+
# Download content from a URL and return it as a string.
#
# Redirects are followed, as {#download_file} does, but only up to
# +redirect_limit+ hops so a redirect loop cannot recurse forever. A
# relative Location header is resolved against the URL that produced it.
#
# @param url [String] URL to download
# @param redirect_limit [Integer] Maximum number of redirects to follow
# @return [String] Downloaded content
# @raise [RuntimeError] On a non-success response, a redirect without a
#   Location header, or too many redirects
def download_url(url, redirect_limit: 5)
  uri = URI.parse(url)

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == "https")
  http.open_timeout = 10
  http.read_timeout = 30

  response = http.request(Net::HTTP::Get.new(uri.request_uri))

  case response
  when Net::HTTPSuccess
    response.body
  when Net::HTTPRedirection
    location = response["location"]
    # 304 Not Modified is also a Net::HTTPRedirection and carries no Location.
    raise "Failed to download #{url}: #{response.code} #{response.message}" if location.nil?
    raise "Failed to download #{url}: too many redirects" unless redirect_limit.positive?

    download_url(URI.join(url, location).to_s, redirect_limit: redirect_limit - 1)
  else
    raise "Failed to download #{url}: #{response.code} #{response.message}"
  end
end
|
|
275
|
+
|
|
276
|
+
# Download a file to disk, streaming in chunks.
#
# The body is streamed into "<dest_path>.part" and renamed onto +dest_path+
# only after the whole body has been written, so an interrupted transfer
# never leaves a truncated file at the destination. Redirects are followed
# up to +redirect_limit+ hops, and a relative Location header is resolved
# against the URL that produced it.
#
# @param url [String] Source URL
# @param dest_path [String] Destination file path
# @param reporter [#start,#update,#maybe_report_periodic,#finish,nil]
#   Optional progress reporter. Defaults to
#   Kotoshu.configuration.download_reporter (typically nil for
#   programmatic use, set by the CLI during setup).
# @param redirect_limit [Integer] Maximum number of redirects to follow
# @return [Net::HTTPResponse] The response that supplied the file
# @raise [RuntimeError] On a non-success response, a redirect without a
#   Location header, or too many redirects
def download_file(url, dest_path, reporter: nil, redirect_limit: 5)
  reporter ||= Kotoshu.configuration.download_reporter
  uri = URI.parse(url)

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == "https")
  http.open_timeout = 30
  http.read_timeout = 300

  redirect_target = nil

  result = http.request(Net::HTTP::Get.new(uri.request_uri)) do |response|
    case response
    when Net::HTTPSuccess
      content_length = content_length_from(response)
      FileUtils.mkdir_p(File.dirname(dest_path))
      partial_path = "#{dest_path}.part"
      received = 0
      reporter&.start(content_length)
      begin
        File.open(partial_path, "wb") do |file|
          response.read_body do |chunk|
            file.write(chunk)
            received += chunk.bytesize
            reporter&.update(received)
            reporter&.maybe_report_periodic
          end
        end
        File.rename(partial_path, dest_path)
      ensure
        # No-op after a successful rename; removes the fragment on failure.
        FileUtils.rm_f(partial_path)
      end
      reporter&.finish
    when Net::HTTPRedirection
      location = response["location"]
      raise "Failed to download #{url}: #{response.code} #{response.message}" if location.nil?
      raise "Failed to download #{url}: too many redirects" unless redirect_limit.positive?

      # Only remember the target here; the next hop is requested after this
      # connection has been released.
      redirect_target = URI.join(url, location).to_s
    else
      raise "Failed to download #{url}: #{response.code} #{response.message}"
    end
  end

  return result unless redirect_target

  download_file(redirect_target, dest_path, reporter: reporter, redirect_limit: redirect_limit - 1)
end
|
|
318
|
+
|
|
319
|
+
# Extract Content-Length safely. Some servers omit it (chunked
# transfer encoding); caller treats nil as "size unknown".
#
# Only a plain run of decimal digits is accepted. Kernel#Integer without an
# explicit base would read "012" as octal and accept "0x10", "1_000" or a
# negative number, none of which is a valid Content-Length value.
#
# @param response [Net::HTTPResponse, #[]] Response to read the header from
# @return [Integer, nil] Byte count, or nil when absent or malformed
def content_length_from(response)
  raw = response["Content-Length"]
  return nil if raw.nil?

  digits = raw.strip
  return nil unless digits.match?(/\A\d+\z/)

  digits.to_i
end
|
|
331
|
+
|
|
332
|
+
# Persist a metadata hash as pretty-printed JSON, creating the parent
# directory when needed.
#
# @param path [String] Metadata file path
# @param metadata [Hash] Metadata to write
def write_metadata(path, metadata)
  parent_dir = File.dirname(path)
  FileUtils.mkdir_p(parent_dir)

  payload = JSON.pretty_generate(metadata)
  File.write(path, payload)
end
|
|
340
|
+
|
|
341
|
+
# Read metadata from file.
#
# Returns nil, never a non-Hash value, when the file is missing, is not
# valid JSON, or holds JSON that is not an object (for example an array).
# Callers index the result with string keys, so a non-Hash would only fail
# later with a TypeError.
#
# @param path [String] Metadata file path
# @return [Hash, nil] Metadata or nil
def read_metadata(path)
  return nil unless File.exist?(path)

  parsed = JSON.parse(File.read(path))
  parsed.is_a?(Hash) ? parsed : nil
rescue JSON::ParserError, Errno::ENOENT
  # ENOENT: the file was removed between the existence check and the read.
  nil
end
|
|
352
|
+
|
|
353
|
+
# Tell whether a cache entry exists, judged solely by the presence of its
# metadata file.
#
# @param metadata_path [String] Path to metadata file
# @return [Boolean] True if cached
def cached?(metadata_path)
  present = File.exist?(metadata_path)
  present
end
|
|
360
|
+
|
|
361
|
+
# Check if cached file is expired.
#
# The timestamp is taken from the "cached_at" key, falling back to
# "version", and must be an ISO 8601 string. Anything that prevents a
# trustworthy age from being computed counts as expired: a missing file,
# unreadable or non-Hash metadata, or a missing or unparseable timestamp.
#
# @param metadata_path [String] Path to metadata file
# @return [Boolean] True if expired
def expired?(metadata_path)
  return true unless File.exist?(metadata_path)

  metadata = read_metadata(metadata_path)
  # Guard against JSON that parsed to something other than an object;
  # indexing an Array with a string key would raise TypeError.
  return true unless metadata.is_a?(Hash)

  cached_time_str = metadata["cached_at"] || metadata["version"]
  return true unless cached_time_str.is_a?(String)

  begin
    cached_time = Time.iso8601(cached_time_str)
    Time.now.utc - cached_time > @cache_ttl
  rescue StandardError
    true
  end
end
|
|
381
|
+
|
|
382
|
+
# Compute the SHA-256 digest of a string as lowercase hexadecimal.
#
# @param content [String] Content to checksum
# @return [String] SHA256 checksum
def checksum(content)
  digest = Digest::SHA256.new
  digest.update(content)
  digest.hexdigest
end
|
|
389
|
+
|
|
390
|
+
# Check downloaded bytes against the manifest and write one audit record.
#
# The SHA-256 of +content+ is compared with the manifest entry stored under
# +relative_path+. There are three outcomes, each recorded in the audit log:
#
# - "unverified": no manifest entry exists; the content is accepted as-is
#   (graceful degradation while no manifest is published).
# - "verified": the digests match.
# - "mismatch": the digests differ; {Kotoshu::IntegrityError} is raised
#   after the record is written. Callers MUST remove the corrupt bytes from
#   disk so the next call re-downloads.
#
# @param url [String] Source URL (for audit log)
# @param relative_path [String] Manifest lookup key (e.g., "en/spelling/index.dic")
# @param content [String] Downloaded bytes
# @param resource_id [String, nil] Caller-supplied resource identifier
# @return [void]
# @raise [Kotoshu::IntegrityError] When the digest differs from the manifest
def verify_and_audit(url:, relative_path:, content:, resource_id: nil)
  actual = Digest::SHA256.hexdigest(content)
  entry = manifest_entry_for(relative_path)
  expected = entry&.sha256

  outcome =
    if entry.nil?
      "unverified"
    elsif actual == expected
      "verified"
    else
      "mismatch"
    end

  # The record is written before any error is raised, so a mismatch is
  # always audited.
  logged = @audit_log.record(
    url: url, status: outcome, size: content.bytesize,
    sha256: actual, manifest_sha256: expected, resource_id: resource_id
  )

  case outcome
  when "unverified" then nil
  when "verified" then logged
  else
    raise Kotoshu::IntegrityError.new(
      relative_path, expected: expected, actual: actual, url: url
    )
  end
end
|
|
431
|
+
|
|
432
|
+
# Pin used in URL templates (default "main"; override via constructor
|
|
433
|
+
# or KOTOSHU_RESOURCE_PIN env var through Configuration).
|
|
434
|
+
#
|
|
435
|
+
# @return [String]
|
|
436
|
+
attr_reader :resource_pin
|
|
437
|
+
|
|
438
|
+
private
|
|
439
|
+
|
|
440
|
+
# Find the manifest entry stored under +relative_path+.
#
# The manifest is loaded on the first call. When no manifest is available
# (@manifest stays nil), nil is returned so verification is skipped.
#
# @param relative_path [String] Manifest lookup key
# @return [Object, nil] Whatever the manifest's #fetch returns, or nil when
#   there is no manifest
def manifest_entry_for(relative_path)
  load_manifest! unless @manifest_loaded
  return nil if @manifest.nil?

  @manifest.fetch(relative_path)
end
|
|
447
|
+
|
|
448
|
+
# Load the integrity manifest, at most once per cache instance.
#
# @manifest_loaded is set before anything else, so a failed fetch is not
# retried on every download. With no manifest URL, nothing is fetched and
# @manifest is left untouched. A fetch error leaves @manifest nil and is
# reported via +warn+ only when $VERBOSE is set.
#
# @return [Object, nil] The loaded manifest, or nil when none was loaded
def load_manifest!
  @manifest_loaded = true

  source = manifest_url
  return unless source

  @manifest =
    begin
      Kotoshu::Integrity::Manifest.load(source)
    rescue StandardError => error
      warn "Manifest fetch failed for #{source}: #{error.message}" if $VERBOSE
      nil
    end
end
|
|
462
|
+
|
|
463
|
+
# URL of the manifest.json used for integrity verification.
#
# The base implementation returns the value given to the constructor.
# Subclasses override this to point at their repo's manifest; returning nil
# opts out of manifest verification.
#
# @return [String, nil]
def manifest_url
  configured = @manifest_url
  configured
end
|
|
468
|
+
|
|
469
|
+
# Get cache size in bytes.
#
# Sums the size of every regular file under the cache directory. The
# directory is passed to Dir.glob as +base:+ rather than spliced into the
# pattern, so a cache path containing glob metacharacters ([ ] { } * ?) is
# still walked instead of matching nothing.
#
# @return [Integer] Total size in bytes (0 when the directory is empty or missing)
def cache_size
  Dir.glob("**/*", base: @cache_path).sum do |relative|
    path = File.join(@cache_path, relative)
    File.file?(path) ? File.size(path) : 0
  end
end
|
|
479
|
+
|
|
480
|
+
# Get oldest cached entry timestamp.
#
# Reads every metadata.json under the cache directory and returns the
# earliest "cached_at" (or, failing that, "version") value. Timestamps are
# compared as parsed times, not as strings, so values written with
# different UTC offsets are ordered correctly. Entries whose timestamp is
# missing, not a String, or not valid ISO 8601 are ignored, matching how
# #expired? treats them as unusable.
#
# @return [String, nil] ISO8601 timestamp exactly as stored, or nil
def oldest_entry
  oldest_stamp = nil
  oldest_time = nil

  # base: keeps glob metacharacters in the cache path from being
  # interpreted as part of the pattern.
  Dir.glob("**/metadata.json", base: @cache_path).each do |relative|
    metadata = read_metadata(File.join(@cache_path, relative))
    next unless metadata.is_a?(Hash)

    stamp = metadata["cached_at"] || metadata["version"]
    next unless stamp.is_a?(String)

    time =
      begin
        Time.iso8601(stamp)
      rescue ArgumentError
        nil
      end
    next unless time

    if oldest_time.nil? || time < oldest_time
      oldest_time = time
      oldest_stamp = stamp
    end
  end

  oldest_stamp
end
|
|
498
|
+
|
|
499
|
+
# Clean expired cache entries.
#
# For every metadata.json under the cache directory that #expired? reports
# as expired, the directory containing it is removed recursively.
#
# The cache directory is passed to Dir.glob as +base:+ so that glob
# metacharacters in the path are not treated as part of the pattern. A
# metadata file that has already disappeared (because an enclosing entry
# was removed earlier in the same pass) is skipped rather than counted,
# since #expired? returns true for a missing file.
#
# @return [Integer] Number of entries removed
def clean_expired
  removed = 0

  Dir.glob("**/metadata.json", base: @cache_path).each do |relative|
    metadata_path = File.join(@cache_path, relative)
    next unless File.exist?(metadata_path)
    next unless expired?(metadata_path)

    FileUtils.rm_rf(File.dirname(metadata_path))
    removed += 1
  end

  removed
end
|
|
515
|
+
|
|
516
|
+
# Size-based cleanup hook. The base class reclaims nothing; a subclass may
# override this to enforce a size limit.
#
# @return [Integer] Bytes reclaimed
def clean_by_size
  reclaimed = 0 # Override in subclass if needed
  reclaimed
end
|
|
522
|
+
|
|
523
|
+
private
|
|
524
|
+
|
|
525
|
+
# Parse resource identifier into components.
#
# A valid identifier is exactly two non-empty parts joined by a colon.
# Anything else, including nil, an empty string, a missing language or
# type (":spelling", "en:"), or extra separators ("a:b:c", "a:b:"), yields
# nil instead of raising or returning a half-empty pair.
#
# @param resource_id [String, Symbol, nil] The resource identifier (e.g., "en:spelling" or "en:fasttext")
# @return [Array<String>, nil] Array of parts or nil if invalid
def parse_resource_id(resource_id)
  # The -1 limit keeps trailing empty fields, so "a:b:" is seen as three parts.
  parts = resource_id.to_s.split(":", -1)
  return nil unless parts.size == 2 && parts.none?(&:empty?)

  parts
end
|
|
535
|
+
|
|
536
|
+
# Return the first component of a resource identifier, the language code.
#
# @param resource_id [String] The resource identifier
# @return [String, nil] Language code or nil if invalid
def extract_language(resource_id)
  language, _type = parse_resource_id(resource_id)
  language
end
|
|
546
|
+
|
|
547
|
+
# Return the second component of a resource identifier, the resource type.
#
# @param resource_id [String] The resource identifier
# @return [String, nil] Resource type or nil if invalid
def extract_type(resource_id)
  _language, type = parse_resource_id(resource_id)
  type
end
|
|
557
|
+
|
|
558
|
+
# Default cache path: $XDG_CACHE_HOME/kotoshu
#
# The value is not computed here; it is whatever
# Kotoshu::Paths.cache_path returns.
#
# @return [String] Default cache path
def default_cache_path
  Kotoshu::Paths.cache_path
end
|
|
564
|
+
|
|
565
|
+
# Default URL base.
#
# Reads the DEFAULT_BASE_URL constant of Kotoshu::SourceRegistry.
#
# @return [String] Default URL base
def default_url_base
  Kotoshu::SourceRegistry::DEFAULT_BASE_URL
end
|
|
571
|
+
|
|
572
|
+
# Default source registry, taken from the global Configuration so that
# ENV settings (KOTOSHU_REPOS_BASE_URL, KOTOSHU_DICTIONARIES_PIN, etc.)
# and programmatic config reach the cache layer automatically.
#
# @return [Kotoshu::SourceRegistry]
def default_source_registry
  Kotoshu::Configuration.instance.source_registry
end
|
|
580
|
+
|
|
581
|
+
# Default GitHub URL.
#
# A fixed value; nothing is read from configuration or the environment here.
#
# @return [String] Default GitHub URL
def default_github_url
  "https://github.com/kotoshu"
end
|
|
587
|
+
|
|
588
|
+
# Default cache TTL (7 days).
#
# @return [Integer] Default TTL in seconds
def default_cache_ttl
  # 7 days * 24 hours * 60 minutes * 60 seconds = 604_800 seconds
  7 * 24 * 60 * 60
end
|
|
594
|
+
end
|
|
595
|
+
end
|
|
596
|
+
end
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module Cache
    # Base cache interface.
    #
    # All cache implementations should follow this interface. Every method
    # defined here raises NotImplementedError, naming the receiver's class
    # and the missing method, until an implementation overrides it.
    #
    # @abstract Implementations must define {#fetch}, {#write}, {#read},
    #   {#delete}, {#clear}, {#key?}, {#size}, {#stats} and {#reset_stats}
    module Cache
      # Retrieve a value from cache, or compute it.
      #
      # @param key [Object] The cache key
      # @yield Block to compute value on cache miss
      # @return [Object] The cached or computed value
      # @raise [NotImplementedError] unless overridden
      # @abstract Implementation must override
      def fetch(key, &block)
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end

      # Write a value to cache.
      #
      # @param key [Object] The cache key
      # @param value [Object] The value to store
      # @return [Object] The stored value
      # @raise [NotImplementedError] unless overridden
      # @abstract Implementation must override
      def write(key, value)
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end

      # Read a value from cache.
      #
      # @param key [Object] The cache key
      # @return [Object, nil] The cached value or nil
      # @raise [NotImplementedError] unless overridden
      # @abstract Implementation must override
      def read(key)
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end

      # Delete a value from cache.
      #
      # @param key [Object] The cache key
      # @return [Object, nil] The deleted value or nil
      # @raise [NotImplementedError] unless overridden
      # @abstract Implementation must override
      def delete(key)
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end

      # Clear all entries from cache.
      #
      # @return [self] Self for chaining
      # @raise [NotImplementedError] unless overridden
      # @abstract Implementation must override
      def clear
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end

      # Check if key exists in cache.
      #
      # @param key [Object] The cache key
      # @return [Boolean] True if key exists
      # @raise [NotImplementedError] unless overridden
      # @abstract Implementation must override
      def key?(key)
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end

      # Get number of entries in cache.
      #
      # @return [Integer] Number of entries
      # @raise [NotImplementedError] unless overridden
      # @abstract Implementation must override
      def size
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end

      # Get cache statistics.
      #
      # @return [Hash] Statistics including :hits, :misses, :size, :hit_rate
      # @raise [NotImplementedError] unless overridden
      # @abstract Implementation must override
      def stats
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end

      # Reset statistics counters.
      #
      # @return [self] Self for chaining
      # @raise [NotImplementedError] unless overridden
      # @abstract Implementation must override
      def reset_stats
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end
    end
  end
end
|