kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,596 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+ require "net/http"
5
+ require "json"
6
+ require "digest"
7
+ require "uri"
8
+ require "time"
9
+
10
+ require_relative "../integrity"
11
+
12
+ module Kotoshu
13
+ module Cache
14
# Abstract base class for Kotoshu's on-disk resource caches.
#
# The base class supplies the shared plumbing:
# - HTTP helpers (in-memory fetch and streamed download to disk)
# - JSON metadata files and TTL-based expiry
# - hit/miss statistics
# - optional SHA-256 verification against a manifest, with audit logging
#
# Subclasses describe where a resource lives and how to fetch/load it.
#
# @abstract Subclasses must implement {#cached_resources},
#   {#supports_resource?}, {#download_resource}, {#load_cached},
#   {#metadata_path_for}, {#resource_dir_for} and {#resource_files_exist?}.
class BaseCache
  # Number of redirect hops {#download_file} follows by default.
  MAX_REDIRECTS = 5

  # @return [String] Path to the cache directory
  attr_reader :cache_path

  # @return [String] Base URL for downloading resources
  attr_reader :url_base

  # @return [Integer] Cache TTL in seconds
  attr_reader :cache_ttl

  # @return [String] GitHub repository URL
  attr_reader :github_url

  # @return [Kotoshu::SourceRegistry] Single source of truth for
  #   per-repo URLs and pins. Subclasses MUST build URLs through
  #   this registry rather than constructing URL strings inline.
  attr_reader :source_registry

  # Create a new cache and make sure its directories exist.
  #
  # Every argument is optional; a nil value falls back to the matching
  # default (see the private +default_*+ methods and the source registry).
  #
  # @param cache_path [String, nil] Path to cache directory
  # @param url_base [String, nil] Base URL for downloads (deprecated; pass source_registry instead)
  # @param cache_ttl [Integer, nil] Cache TTL in seconds
  # @param github_url [String, nil] GitHub repository URL
  # @param resource_pin [String, nil] Branch/tag/commit for URL templates (deprecated; use source_registry)
  # @param manifest_url [String, nil] Override manifest.json URL
  # @param audit_log [Integrity::AuditLog, nil] Override audit log
  # @param source_registry [Kotoshu::SourceRegistry, nil] Single source of truth for URLs/pins
  def initialize(cache_path: nil, url_base: nil, cache_ttl: nil, github_url: nil,
                 resource_pin: nil, manifest_url: nil, audit_log: nil,
                 source_registry: nil)
    @cache_path = cache_path || default_cache_path
    @source_registry = source_registry || default_source_registry
    @url_base = url_base || @source_registry.base_url
    @cache_ttl = cache_ttl || default_cache_ttl
    @github_url = github_url || default_github_url
    @resource_pin = resource_pin || @source_registry.pin_for_source(:spelling)
    @manifest_url = manifest_url
    @audit_log = audit_log || Kotoshu::Integrity::AuditLog.new
    @manifest = nil
    @manifest_loaded = false
    @hits = 0
    @misses = 0

    ensure_cache_directories
  end

  # Check if a resource is available in cache.
  #
  # A resource counts as available when it is supported, its metadata
  # file exists and has not expired, and the subclass reports that all
  # resource files are present.
  #
  # @param resource_id [String] The resource identifier (e.g., language code)
  # @return [Boolean] True if resource is cached and valid
  def available?(resource_id)
    return false unless supports_resource?(resource_id)

    metadata_path = metadata_path_for(resource_id)
    return false unless File.exist?(metadata_path)
    return false if expired?(metadata_path)

    resource_files_exist?(resource_id)
  end

  # Get a resource from cache or download it.
  #
  # Counts a hit when unexpired metadata exists (and no forced download
  # was requested); otherwise counts a miss and downloads.
  #
  # @param resource_id [String] The resource identifier
  # @param force_download [Boolean] Force re-download even if cached
  # @return [Object, nil] The cached/downloaded resource, or nil if the
  #   resource is unsupported or the download failed
  def get(resource_id, force_download: false)
    return nil unless supports_resource?(resource_id)

    metadata_path = metadata_path_for(resource_id)

    if !force_download && cached?(metadata_path) && !expired?(metadata_path)
      @hits += 1
      return load_cached(resource_id)
    end

    @misses += 1
    download(resource_id)
  end

  # Clear a specific resource from cache by deleting its directory.
  #
  # @param resource_id [String] The resource identifier
  # @return [Boolean] True if a directory was removed
  def clear(resource_id)
    return false unless supports_resource?(resource_id)

    resource_dir = resource_dir_for(resource_id)
    return false unless File.exist?(resource_dir)

    FileUtils.rm_rf(resource_dir)
    true
  end

  # Clear all cached resources and reset the hit/miss counters.
  # The cache directory layout is recreated afterwards.
  #
  # @return [void]
  def clear_all
    @hits = 0
    @misses = 0
    FileUtils.rm_rf(@cache_path)
    ensure_cache_directories
  end

  # Get cache statistics.
  #
  # @return [Hash] :hits, :misses, :total, :hit_rate (0.0 when nothing
  #   was requested yet), :cached_resources, :size_bytes, :oldest_entry
  def stats
    total = @hits + @misses
    hit_rate = total.positive? ? (@hits.to_f / total) : 0.0

    {
      hits: @hits,
      misses: @misses,
      total: total,
      hit_rate: hit_rate,
      cached_resources: cached_resources,
      size_bytes: cache_size,
      oldest_entry: oldest_entry
    }
  end

  # Reset statistics counters.
  #
  # @return [self] Self for chaining
  def reset_stats
    @hits = 0
    @misses = 0
    self
  end

  # Clean expired cache entries, then apply the size-based cleanup hook.
  #
  # @return [Hash] :expired_entries_removed and :bytes_reclaimed
  def clean
    expired_count = clean_expired
    size_reclaimed = clean_by_size

    {
      expired_entries_removed: expired_count,
      bytes_reclaimed: size_reclaimed
    }
  end

  # List all cached resources.
  #
  # @return [Array<String>] List of cached resource identifiers
  # @abstract Subclass must implement
  def cached_resources
    raise NotImplementedError, "Subclass must implement"
  end

  # Check if a resource type is supported.
  #
  # @param resource_id [String] The resource identifier
  # @return [Boolean] True if supported
  # @abstract Subclass must implement
  def supports_resource?(resource_id)
    raise NotImplementedError, "Subclass must implement"
  end

  # Download a resource into its resource directory.
  #
  # The directory is created first. Any StandardError raised by
  # {#download_resource} is swallowed (reported via +warn+ only when
  # $VERBOSE is set) and turned into a nil result.
  #
  # @param resource_id [String] The resource identifier
  # @return [Object, nil] Downloaded resource or nil on error
  def download(resource_id)
    return nil unless supports_resource?(resource_id)

    resource_dir = resource_dir_for(resource_id)
    FileUtils.mkdir_p(resource_dir)

    begin
      download_resource(resource_id, resource_dir)
    rescue StandardError => e
      warn "Error downloading #{resource_id}: #{e.message}" if $VERBOSE
      nil
    end
  end

  # Abstract: Download a specific resource.
  #
  # @param resource_id [String] The resource identifier
  # @param dest_path [String] Destination directory
  # @return [Object] Downloaded resource
  # @abstract Subclass must implement
  def download_resource(resource_id, dest_path)
    raise NotImplementedError, "Subclass must implement"
  end

  # Abstract: Load cached resource data.
  #
  # @param resource_id [String] The resource identifier
  # @return [Object, nil] Loaded resource or nil
  # @abstract Subclass must implement
  def load_cached(resource_id)
    raise NotImplementedError, "Subclass must implement"
  end

  # Abstract: Get metadata file path for a resource.
  #
  # @param resource_id [String] The resource identifier
  # @return [String] Metadata file path
  # @abstract Subclass must implement
  def metadata_path_for(resource_id)
    raise NotImplementedError, "Subclass must implement"
  end

  # Abstract: Get resource directory path.
  #
  # @param resource_id [String] The resource identifier
  # @return [String] Resource directory path
  # @abstract Subclass must implement
  def resource_dir_for(resource_id)
    raise NotImplementedError, "Subclass must implement"
  end

  # Abstract: Check if all resource files exist.
  #
  # @param resource_id [String] The resource identifier
  # @return [Boolean] True if all files exist
  # @abstract Subclass must implement
  def resource_files_exist?(resource_id)
    raise NotImplementedError, "Subclass must implement"
  end

  protected

  # Download content from a URL into memory. Redirects are not followed:
  # any non-2xx response raises.
  #
  # @param url [String] URL to download
  # @return [String] Downloaded content
  # @raise [RuntimeError] If the response is not a success
  def download_url(url)
    uri = URI.parse(url)
    http = build_http(uri, open_timeout: 10, read_timeout: 30)
    response = http.request(Net::HTTP::Get.new(uri.request_uri))

    raise "Failed to download #{url}: #{response.code} #{response.message}" unless response.is_a?(Net::HTTPSuccess)

    response.body
  end

  # Download a file to disk, streaming in chunks.
  #
  # The body is written to "<dest_path>.part" and renamed onto
  # +dest_path+ only after the whole body arrived, so a failed transfer
  # never leaves a truncated file at +dest_path+. Redirects are followed
  # (relative Location headers are resolved against +url+) up to
  # +redirect_limit+ hops.
  #
  # @param url [String] Source URL
  # @param dest_path [String] Destination file path
  # @param reporter [#start,#update,#maybe_report_periodic,#finish,nil]
  #   Optional progress reporter. Defaults to
  #   Kotoshu.configuration.download_reporter.
  # @param redirect_limit [Integer] Remaining redirect hops allowed
  # @return [Net::HTTPResponse] The final response
  # @raise [RuntimeError] On a non-success response, a redirect without
  #   a Location header, or too many redirects
  def download_file(url, dest_path, reporter: nil, redirect_limit: MAX_REDIRECTS)
    raise "Too many redirects while downloading #{url}" if redirect_limit.negative?

    reporter ||= Kotoshu.configuration.download_reporter
    uri = URI.parse(url)
    http = build_http(uri, open_timeout: 30, read_timeout: 300)
    redirect_target = nil

    response = http.request(Net::HTTP::Get.new(uri.request_uri)) do |res|
      case res
      when Net::HTTPSuccess
        stream_to_file(res, dest_path, reporter)
      when Net::HTTPRedirection
        location = res["location"]
        raise "Redirect without Location while downloading #{url}" if location.nil? || location.strip.empty?

        # Only remember the target here; the next request is issued after
        # this connection has been released.
        redirect_target = URI.join(url, location).to_s
      else
        raise "Failed to download #{url}: #{res.code} #{res.message}"
      end
    end

    return response unless redirect_target

    download_file(redirect_target, dest_path, reporter: reporter, redirect_limit: redirect_limit - 1)
  end

  # Extract Content-Length safely. Some servers omit it (chunked
  # transfer encoding); caller treats nil as "size unknown".
  #
  # The value is parsed strictly as base 10, so "010" is 10 rather than
  # octal 8; negative or non-numeric values yield nil.
  #
  # @param response [#[]] Response (or any header lookup) object
  # @return [Integer, nil]
  def content_length_from(response)
    raw = response["Content-Length"]
    return nil if raw.nil? || raw.strip.empty?

    length = Integer(raw, 10)
    length.negative? ? nil : length
  rescue ArgumentError
    nil
  end

  # Write metadata to file as pretty-printed JSON, creating the parent
  # directory when needed.
  #
  # @param path [String] Metadata file path
  # @param metadata [Hash] Metadata to write
  def write_metadata(path, metadata)
    FileUtils.mkdir_p(File.dirname(path))
    File.write(path, JSON.pretty_generate(metadata))
  end

  # Read metadata from file.
  #
  # @param path [String] Metadata file path
  # @return [Hash, nil] Metadata, or nil when the file is missing, is
  #   not valid JSON, or does not contain a JSON object
  def read_metadata(path)
    return nil unless File.exist?(path)

    parsed = JSON.parse(File.read(path))
    # Callers index the result with string keys; anything but a Hash
    # (e.g. a top-level JSON array) is treated as unusable metadata.
    parsed.is_a?(Hash) ? parsed : nil
  rescue JSON::ParserError
    nil
  end

  # Check if cached file exists.
  #
  # @param metadata_path [String] Path to metadata file
  # @return [Boolean] True if cached
  def cached?(metadata_path)
    File.exist?(metadata_path)
  end

  # Check if cached file is expired.
  #
  # The timestamp is read from the "cached_at" key, falling back to
  # "version". Missing/unreadable metadata or an unparseable timestamp
  # counts as expired.
  #
  # @param metadata_path [String] Path to metadata file
  # @return [Boolean] True if expired
  def expired?(metadata_path)
    return true unless File.exist?(metadata_path)

    metadata = read_metadata(metadata_path)
    return true unless metadata

    cached_time_str = metadata["cached_at"] || metadata["version"]
    return true unless cached_time_str

    begin
      Time.now.utc - Time.iso8601(cached_time_str) > @cache_ttl
    rescue StandardError
      true
    end
  end

  # Calculate checksum of content.
  #
  # @param content [String] Content to checksum
  # @return [String] SHA256 checksum (hex)
  def checksum(content)
    Digest::SHA256.hexdigest(content)
  end

  # Verify downloaded content against the manifest and log to audit.
  #
  # The content's SHA-256 is compared with the manifest entry for
  # +relative_path+ and the outcome is always recorded in the audit log
  # with status "unverified" (no manifest entry), "verified" or
  # "mismatch". A mismatch additionally raises {Kotoshu::IntegrityError};
  # callers MUST then remove the corrupt bytes from disk so the next call
  # re-downloads.
  #
  # @param url [String] Source URL (for audit log)
  # @param relative_path [String] Manifest lookup key (e.g., "en/spelling/index.dic")
  # @param content [String] Downloaded bytes
  # @param resource_id [String, nil] Caller-supplied resource identifier
  # @return [void]
  # @raise [Kotoshu::IntegrityError] When the checksum does not match
  def verify_and_audit(url:, relative_path:, content:, resource_id: nil)
    sha = Digest::SHA256.hexdigest(content)
    entry = manifest_entry_for(relative_path)
    expected = entry&.sha256

    status =
      if entry.nil?
        "unverified"
      elsif sha == expected
        "verified"
      else
        "mismatch"
      end

    @audit_log.record(
      url: url, status: status, size: content.bytesize,
      sha256: sha, manifest_sha256: expected, resource_id: resource_id
    )
    return unless status == "mismatch"

    raise Kotoshu::IntegrityError.new(
      relative_path, expected: expected, actual: sha, url: url
    )
  end

  # Pin used in URL templates. Set from the constructor argument, or
  # from the source registry's :spelling pin when none was given.
  #
  # @return [String]
  attr_reader :resource_pin

  private

  # Look up a manifest entry by relative path. Loads the manifest
  # lazily on first call; when no manifest could be loaded the result
  # is nil, which makes verification degrade to "unverified".
  def manifest_entry_for(relative_path)
    load_manifest! unless @manifest_loaded
    @manifest&.fetch(relative_path)
  end

  # Fetch the manifest once per cache instance. Sets @manifest_loaded
  # regardless of outcome so we don't retry on every download. Any
  # StandardError while loading leaves the manifest unset.
  def load_manifest!
    @manifest_loaded = true
    url = manifest_url
    return unless url

    begin
      @manifest = Kotoshu::Integrity::Manifest.load(url)
    rescue StandardError => e
      warn "Manifest fetch failed for #{url}: #{e.message}" if $VERBOSE
      @manifest = nil
    end
  end

  # Manifest URL given to the constructor. Subclasses may override to
  # point at their repo's manifest.json; nil opts out of verification.
  def manifest_url
    @manifest_url
  end

  # Get cache size in bytes (regular files under the cache directory).
  #
  # @return [Integer] Total size in bytes
  def cache_size
    Dir.glob(File.join(@cache_path, "**", "*")).sum do |path|
      File.file?(path) ? File.size(path) : 0
    end
  end

  # Get oldest cached entry timestamp.
  #
  # Timestamps are compared as parsed times, not as strings, so entries
  # written with different UTC offsets are ordered correctly. Entries
  # whose timestamp is missing or not ISO8601 are ignored.
  #
  # @return [String, nil] The stored timestamp of the oldest entry, or nil
  def oldest_entry
    oldest_raw = nil
    oldest_time = nil

    Dir.glob(File.join(@cache_path, "**", "metadata.json")).each do |metadata_path|
      metadata = read_metadata(metadata_path)
      next unless metadata

      raw = metadata["cached_at"] || metadata["version"]
      time = raw && parse_timestamp(raw)
      next unless time
      next unless oldest_time.nil? || time < oldest_time

      oldest_raw = raw
      oldest_time = time
    end

    oldest_raw
  end

  # Clean expired cache entries by removing the directory that holds
  # each expired metadata.json.
  #
  # @return [Integer] Number of entries removed
  def clean_expired
    count = 0

    Dir.glob(File.join(@cache_path, "**", "metadata.json")).each do |metadata_path|
      # A nested entry may already be gone because an enclosing directory
      # was removed earlier in this pass; do not count it again.
      next unless File.exist?(metadata_path)
      next unless expired?(metadata_path)

      FileUtils.rm_rf(File.dirname(metadata_path))
      count += 1
    end

    count
  end

  # Clean cache entries by size.
  #
  # @return [Integer] Bytes reclaimed
  def clean_by_size
    0 # Override in subclass if needed
  end

  # Parse resource identifier into components.
  #
  # @param resource_id [String] The resource identifier (e.g., "en:spelling" or "en:fasttext")
  # @return [Array<String>, nil] Two-element array, or nil when the
  #   identifier is nil, does not have exactly two parts, or has an
  #   empty part
  def parse_resource_id(resource_id)
    parts = resource_id.to_s.split(":")
    return nil unless parts.size == 2
    return nil if parts.any?(&:empty?)

    parts
  end

  # Extract language code from resource identifier.
  #
  # @param resource_id [String] The resource identifier
  # @return [String, nil] Language code or nil if invalid
  def extract_language(resource_id)
    parts = parse_resource_id(resource_id)
    return nil unless parts

    parts[0]
  end

  # Extract resource type from resource identifier.
  #
  # @param resource_id [String] The resource identifier
  # @return [String, nil] Resource type or nil if invalid
  def extract_type(resource_id)
    parts = parse_resource_id(resource_id)
    return nil unless parts

    parts[1]
  end

  # Default cache path, as reported by Kotoshu::Paths.
  #
  # @return [String] Default cache path
  def default_cache_path
    Kotoshu::Paths.cache_path
  end

  # Default URL base.
  #
  # @return [String] Default URL base
  def default_url_base
    Kotoshu::SourceRegistry::DEFAULT_BASE_URL
  end

  # Default source registry, taken from the global Configuration
  # instance so its settings reach the cache layer automatically.
  #
  # @return [Kotoshu::SourceRegistry]
  def default_source_registry
    Kotoshu::Configuration.instance.source_registry
  end

  # Default GitHub URL.
  #
  # @return [String] Default GitHub URL
  def default_github_url
    "https://github.com/kotoshu"
  end

  # Default cache TTL (7 days).
  #
  # @return [Integer] Default TTL in seconds
  def default_cache_ttl
    604_800
  end

  # Create the cache directory and its "tmp" subdirectory if missing.
  def ensure_cache_directories
    FileUtils.mkdir_p(@cache_path)
    FileUtils.mkdir_p(File.join(@cache_path, "tmp"))
  end

  # Build an unstarted Net::HTTP client for +uri+ with the given timeouts;
  # TLS is enabled for https URLs.
  def build_http(uri, open_timeout:, read_timeout:)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == "https")
    http.open_timeout = open_timeout
    http.read_timeout = read_timeout
    http
  end

  # Stream a successful response body to +dest_path+ via a ".part" file,
  # feeding the optional progress reporter along the way.
  def stream_to_file(response, dest_path, reporter)
    partial_path = "#{dest_path}.part"
    content_length = content_length_from(response)
    FileUtils.mkdir_p(File.dirname(dest_path))
    received = 0
    reporter&.start(content_length)

    File.open(partial_path, "wb") do |file|
      response.read_body do |chunk|
        file.write(chunk)
        received += chunk.bytesize
        reporter&.update(received)
        reporter&.maybe_report_periodic
      end
    end

    File.rename(partial_path, dest_path)
    reporter&.finish
  ensure
    # No-op after a successful rename; removes the leftover on failure.
    FileUtils.rm_f(partial_path)
  end

  # Parse an ISO8601 timestamp, returning nil for anything unparseable.
  def parse_timestamp(value)
    Time.iso8601(value)
  rescue StandardError
    nil
  end
end
595
+ end
596
+ end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Cache
5
# Interface shared by Kotoshu's key/value cache implementations.
#
# Each method below only raises NotImplementedError; a concrete cache is
# expected to provide its own definition of every one of them.
#
# @abstract Implement {#fetch}, {#write}, {#read}, {#delete}, {#clear},
#   {#key?}, {#size}, {#stats} and {#reset_stats}
module Cache
  # Return the value stored under +key+, computing it on a miss.
  #
  # @param key [Object] Cache key to look up
  # @yield Called to produce the value when the key is not cached
  # @return [Object] Stored or freshly computed value
  # @abstract
  def fetch(key, &block)
    raise(NotImplementedError)
  end

  # Store +value+ under +key+.
  #
  # @param key [Object] Cache key
  # @param value [Object] Value to keep
  # @return [Object] The value that was stored
  # @abstract
  def write(key, value)
    raise(NotImplementedError)
  end

  # Look up +key+ without computing anything.
  #
  # @param key [Object] Cache key
  # @return [Object, nil] Stored value, or nil when absent
  # @abstract
  def read(key)
    raise(NotImplementedError)
  end

  # Remove the entry stored under +key+.
  #
  # @param key [Object] Cache key
  # @return [Object, nil] Removed value, or nil when absent
  # @abstract
  def delete(key)
    raise(NotImplementedError)
  end

  # Drop every entry.
  #
  # @return [self] The cache, for chaining
  # @abstract
  def clear
    raise(NotImplementedError)
  end

  # Tell whether +key+ currently has an entry.
  #
  # @param key [Object] Cache key
  # @return [Boolean] True when the key is present
  # @abstract
  def key?(key)
    raise(NotImplementedError)
  end

  # Count the stored entries.
  #
  # @return [Integer] Number of entries
  # @abstract
  def size
    raise(NotImplementedError)
  end

  # Report usage statistics.
  #
  # @return [Hash] Includes :hits, :misses, :size and :hit_rate
  # @abstract
  def stats
    raise(NotImplementedError)
  end

  # Zero the statistics counters.
  #
  # @return [self] The cache, for chaining
  # @abstract
  def reset_stats
    raise(NotImplementedError)
  end
end
90
+ end
91
+ end