kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,224 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_cache"
4
+
5
+ module Kotoshu
6
+ module Cache
7
+ # Frequency cache for Kelly Project frequency lists.
8
+ #
9
+ # Manages Kelly frequency list downloads from the kotoshu/frequency-list-kelly
10
+ # repository. Resources are cached locally in `$XDG_CACHE_HOME/kotoshu/frequency-lists/{code}/`
11
+ # with metadata for versioning and expiration.
12
+ #
13
+ # Extends BaseCache for common download, metadata, and validation logic.
14
+ #
15
+ # @example Getting cached frequency data
16
+ # cache = FrequencyCache.new
17
+ # result = cache.get('en')
18
+ # # => { frequency_path: "~/.cache/kotoshu/frequency-lists/en/frequency.json",
19
+ # # tiers: { top_50: Set<...>, top_200: Set<...>, top_1000: Set<...> },
20
+ # # metadata: { ... } }
21
+ #
22
+ # @example Checking if frequency data is available
23
+ # cache = FrequencyCache.new
24
+ # available = cache.available?('en')
25
+ # # => true
26
+ class FrequencyCache < BaseCache
27
+ # Kelly Project languages available
28
+ KELLY_LANGUAGES = %w[ar zh en el it no ru sv].freeze
29
+
30
+ # GitHub repository for Kelly frequency lists
31
+ GITHUB_REPO = "kotoshu/frequency-list-kelly"
32
+ GITHUB_BRANCH = "main"
33
+
34
# Report which languages have a Kelly frequency list.
#
# @return [Array<String>] a fresh, unfrozen copy of KELLY_LANGUAGES
def available_languages
  Array.new(KELLY_LANGUAGES)
end
40
+
41
# Get frequency data for a language (convenience wrapper around #get).
#
# The flag is forwarded as a keyword argument, which is how the other
# cache classes invoke #get; passing it positionally would not reach the
# `force_download:` keyword.
#
# @param language_code [String] ISO 639-1 language code
# @param force_download [Boolean] Force re-download even if cached
# @return [Hash, nil] Frequency data with :frequency_path, :tiers, :metadata keys
def get_frequency(language_code, force_download: false)
  get(language_code, force_download: force_download)
end
49
+
50
# Tell whether this cache can serve the given resource.
#
# @param resource_id [String] The resource identifier (language code)
# @return [Boolean] True when the code is one of KELLY_LANGUAGES
def supports_resource?(resource_id)
  KELLY_LANGUAGES.any? { |code| code == resource_id }
end
57
+
58
# List the languages that have a directory in the cache.
#
# Every non-hidden directory directly under @cache_path counts as one
# cached language; plain files are ignored.
#
# @return [Array<String>] List of cached language codes
def cached_resources
  Dir.glob(File.join(@cache_path, "*")).each_with_object([]) do |entry, names|
    next unless File.directory?(entry)

    name = File.basename(entry)
    names << name unless name.start_with?(".")
  end
end
67
+
68
+ protected
69
+
70
# Download one language's frequency list into dest_path (implements the
# abstract method).
#
# The payload is fetched, passed to verify_and_audit and parsed as JSON
# before it is written, so a malformed download raises instead of being
# stored. A metadata.json describing the download is written next to it.
#
# @param language_code [String] The language code
# @param dest_path [String] Destination directory (created if missing)
# @return [Hash] Whatever #load_cached returns for the fresh download
def download_resource(language_code, dest_path)
  FileUtils.mkdir_p(dest_path)

  source_url = frequency_url(language_code)
  warn "Downloading Kelly frequency data for #{language_code} from #{source_url}..." if $VERBOSE

  payload = download_url(source_url)
  verify_and_audit(url: source_url,
                   relative_path: "data/#{language_code}.json",
                   content: payload,
                   resource_id: language_code)

  # Parse only for validation; the result is discarded.
  JSON.parse(payload)

  File.write(File.join(dest_path, "frequency.json"), payload)

  write_metadata(
    File.join(dest_path, "metadata.json"),
    {
      version: Time.now.utc.iso8601,
      url: source_url,
      language: language_code,
      type: "kelly_frequency",
      checksum: checksum(payload),
      cached_at: Time.now.utc.iso8601
    }
  )

  load_cached(language_code)
end
112
+
113
# Load a language's frequency data from disk (implements the abstract
# method).
#
# Returns nil when the frequency file or the metadata file is missing, or
# when the metadata cannot be read. The word loader is required lazily,
# only once there is something to load.
#
# @param language_code [String] The language code
# @return [Hash, nil] Hash with :frequency_path, :tiers and :metadata
def load_cached(language_code)
  data_file = frequency_file_path(language_code)
  meta_file = metadata_path(language_code)
  return nil unless File.exist?(data_file)
  return nil unless File.exist?(meta_file)

  meta = read_metadata(meta_file)
  return nil unless meta

  require_relative "../data/common_words_loader"
  parsed = Data::CommonWordsLoader.load_from_frequency_file(data_file)

  { frequency_path: data_file, tiers: parsed[:tiers], metadata: meta }
end
136
+
137
# Locate the metadata file that belongs to a language.
#
# @param language_code [String] The language code
# @return [String] Path of metadata.json inside the language directory
def metadata_path_for(language_code)
  directory = language_dir(language_code)
  File.join(directory, "metadata.json")
end
144
+
145
# Locate the directory that holds a language's cached files.
#
# Delegates to #language_dir, since each resource here is one language.
#
# @param language_code [String] The language code
# @return [String] Resource directory path
def resource_dir_for(language_code)
  language_dir(language_code)
end
152
+
153
# Tell whether the cached frequency file for a language is on disk.
#
# Only frequency.json is checked; the metadata file is not.
#
# @param language_code [String] The language code
# @return [Boolean] True if the frequency file exists
def resource_files_exist?(language_code)
  data_file = frequency_file_path(language_code)
  File.exist?(data_file)
end
160
+
161
+ private
162
+
163
# Build the cache directory path for one language.
#
# @param language_code [String] ISO 639-1 language code
# @return [String] @cache_path joined with the language code
def language_dir(language_code)
  segments = [@cache_path, language_code]
  File.join(*segments)
end
170
+
171
# Build the path of a language's cached frequency JSON file.
#
# @param language_code [String] ISO 639-1 language code
# @return [String] Path of frequency.json inside the language directory
def frequency_file_path(language_code)
  directory = language_dir(language_code)
  File.join(directory, "frequency.json")
end
178
+
179
# Private shorthand for #metadata_path_for.
#
# @param language_code [String] ISO 639-1 language code
# @return [String] File path
def metadata_path(language_code)
  metadata_path_for(language_code)
end
186
+
187
# Ask the source registry for a language's frequency-file URL.
#
# @param language_code [String] ISO 639-1 language code
# @return [String] Download URL as produced by the source registry
def frequency_url(language_code)
  registry = @source_registry
  registry.url_for(:frequency, lang: language_code)
end
194
+
195
# Kelly repo manifest URL (used for integrity verification).
#
# An explicitly set @manifest_url wins; otherwise the source registry is
# asked for :freq_manifest.
#
# @return [String]
def manifest_url
  return @manifest_url if @manifest_url

  @source_registry.url_for(:freq_manifest)
end
201
+
202
# Default cache location: the "frequency-lists" directory under
# Kotoshu::Paths.cache_path.
#
# @return [String] Default cache directory path
def default_cache_path
  base = Kotoshu::Paths.cache_path
  File.join(base, "frequency-lists")
end
208
+
209
# Base URL used when none is configured: GitHub's raw-content host.
#
# @return [String] Default URL base
def default_url_base
  "https://raw.githubusercontent.com"
end
215
+
216
# Default cache lifetime: seven days, expressed in seconds.
#
# @return [Integer] Default TTL in seconds
def default_cache_ttl
  7 * 24 * 60 * 60
end
222
+ end
223
+ end
224
+ end
@@ -0,0 +1,454 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_cache"
4
+
5
+ module Kotoshu
6
+ module Cache
7
+ # Language cache for dynamic dictionary and grammar rule downloads.
8
+ #
9
+ # Manages per-language dictionary and grammar rule downloads from a remote
10
+ # repository. Resources are cached locally in `$XDG_CACHE_HOME/kotoshu/languages/{code}/`
11
+ # with metadata for versioning and expiration.
12
+ #
13
+ # Extends BaseCache for common download, metadata, and validation logic.
14
+ #
15
+ # @example Getting a cached spelling dictionary
16
+ # cache = LanguageCache.new
17
+ # result = cache.get('en')
18
+ # # => { aff_path: "~/.cache/kotoshu/languages/en/spelling/index.aff",
19
+ # # dic_path: "~/.cache/kotoshu/languages/en/spelling/index.dic",
20
+ # # metadata: { ... } }
21
+ #
22
+ # @example Checking cache statistics
23
+ # stats = cache.stats
24
+ # # => { hits: 5, misses: 1, hit_rate: 0.83, ... }
25
+ class LanguageCache < BaseCache
26
+ # Supported resource types
27
+ RESOURCE_TYPES = %w[spelling grammar frequency].freeze
28
+
29
+ # Available languages
30
+ AVAILABLE_LANGUAGES = %w[de en es fr pt ru].freeze
31
+
32
# Fetch the spelling dictionary for a language, downloading it when the
# cache lookup yields nothing.
#
# @param language [String] Language code (e.g., 'en', 'de')
# @param force_download [Boolean] Force re-download even if cached
# @return [Hash] Dictionary paths and metadata
def get_spelling(language, force_download: false)
  cached = get("#{language}:spelling", force_download: force_download)
  return cached if cached

  download_spelling(language)
end
42
+
43
# Install a spelling dictionary from local files (no download).
# Used by ResourceManager.setup_from_local when the user already
# has .aff/.dic files on disk. Symlinks the source files into the
# cache directory so subsequent cache lookups find them.
#
# Both source paths are validated before anything is created, so a typo
# in a path can no longer leave dangling symlinks behind while reporting
# success. Existing targets (real files or symlinks) are replaced when
# force: true and raise ArgumentError otherwise.
#
# @param language [String] Language code
# @param aff [String] Path to .aff file
# @param dic [String] Path to .dic file
# @param force [Boolean] Overwrite existing install
# @return [Hash] Installed paths (:aff_path, :dic_path, source: :local)
# @raise [ArgumentError] if a source file is missing, or a target exists
#   and force is false
def install_local(language, aff:, dic:, force: false)
  require "fileutils"

  source_aff = File.expand_path(aff)
  source_dic = File.expand_path(dic)
  [source_aff, source_dic].each do |source|
    raise ArgumentError, "#{source} does not exist or is not a file" unless File.file?(source)
  end

  resource_id = "#{language}:spelling"
  lang_path = resource_dir_for(resource_id)
  FileUtils.mkdir_p(lang_path)

  target_aff = File.join(lang_path, "index.aff")
  target_dic = File.join(lang_path, "index.dic")
  links = { target_aff => source_aff, target_dic => source_dic }

  # File.symlink? is checked as well because File.exist? is false for a
  # dangling symlink, which would still block File.symlink below.
  links.each_key do |target|
    next unless File.exist?(target) || File.symlink?(target)
    raise ArgumentError, "#{target} already exists (use force: true to overwrite)" unless force

    File.unlink(target)
  end

  links.each { |target, source| File.symlink(source, target) }

  write_metadata(metadata_path_for(resource_id),
                 build_metadata(language, "spelling", "local-source"))

  { aff_path: target_aff, dic_path: target_dic, source: :local }
end
84
+
85
# Alias for get_spelling for backward compatibility.
#
# get_spelling accepts `force_download:` only as a keyword, so the flag
# is forwarded by keyword; passing it positionally raises ArgumentError.
#
# @param language [String] Language code
# @param force_download [Boolean] Force re-download
# @return [Hash] Dictionary paths and metadata
def get_dictionary(language, force_download: false)
  get_spelling(language, force_download: force_download)
end
93
+
94
# Fetch the grammar rules for a language, downloading them when the cache
# lookup yields nothing.
#
# @param language [String] Language code
# @param force_download [Boolean] Force re-download
# @return [Hash] Rules path and metadata
def get_grammar(language, force_download: false)
  cached = get("#{language}:grammar", force_download: force_download)
  return cached if cached

  download_grammar(language)
end
104
+
105
# Tell whether frequency data is available for a language.
#
# Builds the "<code>:frequency" resource id and defers to #available?.
#
# @param language_code [String] ISO 639-1 language code
# @return [Boolean] True if frequency data exists
def frequency_available?(language_code)
  available?("#{language_code}:frequency")
end
113
+
114
# Report which languages this cache knows how to serve.
#
# @return [Array<String>] a fresh, unfrozen copy of AVAILABLE_LANGUAGES
def available_languages
  Array.new(AVAILABLE_LANGUAGES)
end
120
+
121
# Look up descriptive metadata (name, word count, license, source) for a
# language.
#
# Unknown codes get a placeholder entry whose name is the upcased code.
#
# @param language_code [String] The language code
# @return [Hash] Language info with :name, :word_count, :license, :source
def language_info(language_code)
  catalog = {
    "de" => { name: "German", word_count: 75_873, license: "GPL", source: "igerman98" },
    "en" => { name: "English", word_count: 49_568, license: "LGPL/MPL/GPL", source: "SCOWL" },
    "es" => { name: "Spanish", word_count: 57_344, license: "GPL", source: "LibreOffice" },
    "fr" => { name: "French", word_count: 84_310, license: "MPL 2.0", source: "Grammalecte" },
    "pt" => { name: "Portuguese", word_count: 312_368, license: "LGPLv3 + MPL", source: "VERO" },
    "ru" => { name: "Russian", word_count: 146_269, license: "BSD-style", source: "Alexander Lebedev" }
  }

  catalog.fetch(language_code) do
    { name: language_code.upcase, word_count: 0, license: "Unknown", source: "Unknown" }
  end
end
135
+
136
# Get cache size in bytes (override for language-specific tracking).
#
# Only regular files matching *.dic under <cache>/languages are summed.
# NOTE(review): .aff, rules.yaml, frequency.json and metadata files are
# not counted — confirm that this is intended.
#
# @return [Integer] Total size in bytes
def cache_size
  pattern = File.join(@cache_path, "languages", "**", "*.dic")
  Dir.glob(pattern).sum { |path| File.file?(path) ? File.size(path) : 0 }
end
146
+
147
# List the resources that have a metadata.json in the cache.
#
# Identifiers are rebuilt from each metadata file's path relative to
# @cache_path: the second path segment is the language and the third is
# the resource type.
#
# @return [Array<String>] Unique "<language>:<type>" identifiers
def cached_resources
  root = Pathname.new(@cache_path)
  pattern = File.join(@cache_path, "languages", "**", "metadata.json")

  identifiers = Dir.glob(pattern).map do |meta_file|
    _prefix, language, type = Pathname.new(meta_file).relative_path_from(root).to_s.split("/")
    "#{language}:#{type}"
  end
  identifiers.uniq
end
157
+
158
# Tell whether this cache can serve the given resource.
#
# The id must split on ":" into exactly a language and a type, the
# language must be in AVAILABLE_LANGUAGES and the type in RESOURCE_TYPES.
#
# @param resource_id [String] The resource identifier (e.g., "en:spelling")
# @return [Boolean] True if supported
def supports_resource?(resource_id)
  language, type, *extra = resource_id.split(":")
  return false if type.nil? || !extra.empty?

  AVAILABLE_LANGUAGES.include?(language) && RESOURCE_TYPES.include?(type)
end
169
+
170
+ protected
171
+
172
# Download a spelling dictionary (index.aff and index.dic).
#
# Each file is fetched, passed to verify_and_audit and then written into
# the resource directory. The directory is created first: get_spelling
# calls this method directly, so nothing else is guaranteed to have
# created it. The stored checksum is that of the .dic content.
#
# @param language [String] Language code
# @return [Hash] Dictionary paths and metadata (:aff_path, :dic_path,
#   cached: false, :metadata)
def download_spelling(language)
  require "fileutils"

  resource_id = "#{language}:spelling"
  lang_path = resource_dir_for(resource_id)
  aff_path = File.join(lang_path, "index.aff")
  dic_path = File.join(lang_path, "index.dic")

  # Download index.aff
  aff_url = @source_registry.url_for(:spelling, lang: language, ext: "aff")
  aff_content = download_url(aff_url)
  verify_and_audit(url: aff_url,
                   relative_path: "#{language}/spelling/index.aff",
                   content: aff_content,
                   resource_id: resource_id)
  # Create the directory only once there is verified content to store.
  FileUtils.mkdir_p(lang_path)
  File.write(aff_path, aff_content)

  # Download index.dic
  dic_url = @source_registry.url_for(:spelling, lang: language, ext: "dic")
  dic_content = download_url(dic_url)
  verify_and_audit(url: dic_url,
                   relative_path: "#{language}/spelling/index.dic",
                   content: dic_content,
                   resource_id: resource_id)
  File.write(dic_path, dic_content)

  # Save metadata
  metadata = build_metadata(language, "spelling", checksum(dic_content))
  write_metadata(metadata_path_for(resource_id), metadata)

  {
    aff_path: aff_path,
    dic_path: dic_path,
    cached: false,
    metadata: metadata
  }
end
209
+
210
# Download grammar rules (rules.yaml).
#
# The file is fetched, passed to verify_and_audit and written into the
# resource directory. The directory is created first: get_grammar calls
# this method directly, so nothing else is guaranteed to have created it.
#
# @param language [String] Language code
# @return [Hash] Rules path and metadata (:rules_path is the resource
#   directory, cached: false, :metadata)
def download_grammar(language)
  require "fileutils"

  resource_id = "#{language}:grammar"
  lang_path = resource_dir_for(resource_id)

  # Download rules.yaml
  rules_url = @source_registry.url_for(:grammar, lang: language)
  rules_content = download_url(rules_url)
  verify_and_audit(url: rules_url,
                   relative_path: "#{language}/grammar/rules.yaml",
                   content: rules_content,
                   resource_id: resource_id)
  # Create the directory only once there is verified content to store.
  FileUtils.mkdir_p(lang_path)
  File.write(File.join(lang_path, "rules.yaml"), rules_content)

  # Save metadata
  metadata = build_metadata(language, "grammar", checksum(rules_content))
  write_metadata(metadata_path_for(resource_id), metadata)

  {
    rules_path: lang_path,
    cached: false,
    metadata: metadata
  }
end
237
+
238
# Download frequency data (frequency.json from the Kelly repository).
#
# The payload is fetched, passed to verify_and_audit and parsed as JSON
# before it is written, so a malformed download raises instead of being
# stored. The resource directory is created before writing, since this
# method does not otherwise guarantee that it exists.
#
# @param language [String] Language code
# @return [Hash] Frequency data path and metadata (:frequency_path,
#   cached: false, :metadata)
def download_frequency(language)
  require "fileutils"

  resource_id = "#{language}:frequency"
  lang_path = resource_dir_for(resource_id)

  # Download frequency.json from Kelly repository
  freq_url = @source_registry.url_for(:frequency, lang: language)
  freq_content = download_url(freq_url)
  verify_and_audit(url: freq_url,
                   relative_path: "data/#{language}.json",
                   content: freq_content,
                   resource_id: resource_id)

  # Validate JSON (result discarded; raises on malformed content)
  JSON.parse(freq_content)

  # Save frequency file
  FileUtils.mkdir_p(lang_path)
  freq_file = File.join(lang_path, "frequency.json")
  File.write(freq_file, freq_content)

  # Save metadata (with custom URL for Kelly)
  metadata = build_metadata(language, "kelly_frequency", checksum(freq_content))
  metadata[:url] = freq_url # Override with specific Kelly URL
  write_metadata(metadata_path_for(resource_id), metadata)

  {
    frequency_path: freq_file,
    cached: false,
    metadata: metadata
  }
end
272
+
273
+ private
274
+
275
# Manifest URL used for integrity verification.
#
# LanguageCache serves spelling and grammar from the kotoshu/dictionaries
# repo, while frequency lives in a separate repo (kelly). The manifest is
# pinned to the dictionaries repo because that is the primary surface
# users see; Kelly's manifest can be added when that repo ships one.
# An explicitly set @manifest_url takes precedence.
#
# @return [String]
def manifest_url
  return @manifest_url if @manifest_url

  @source_registry.url_for(:dict_manifest)
end
283
+
284
# Download a specific resource (implements abstract method).
#
# Splits the id into language and type and hands off to the matching
# download_* method. The destination argument is ignored; each
# downloader derives its own directory.
#
# @param resource_id [String] The resource identifier
# @param _dest_path [String] Destination directory (unused)
# @return [Object, nil] Downloaded resource, or nil when the id has no
#   language or type
# @raise [RuntimeError] if the type is not spelling, grammar or frequency
def download_resource(resource_id, _dest_path)
  language = extract_language(resource_id)
  type = extract_type(resource_id)
  return nil if !language || !type

  if type == "spelling"
    download_spelling(language)
  elsif type == "grammar"
    download_grammar(language)
  elsif type == "frequency"
    download_frequency(language)
  else
    raise "Unknown resource type: #{type}"
  end
end
301
+
302
# Load cached resource data (implements abstract method).
#
# Gives up with nil when the id cannot be parsed or when no metadata can
# be loaded; otherwise the type (second element of the parsed id)
# selects the loader.
#
# @param resource_id [String] The resource identifier
# @return [Object, nil] Loaded resource or nil
def load_cached(resource_id)
  parsed = parse_resource_id(resource_id)
  return nil unless parsed

  resource_type = parsed[1]
  meta = load_metadata_for(resource_id)
  return nil unless meta

  load_cached_resource_by_type(resource_id, resource_type, meta)
end
316
+
317
+ private
318
+
319
# Read the metadata stored for a resource, if there is any.
#
# @param resource_id [String] The resource identifier
# @return [Hash, nil] Metadata, or nil when the metadata file is absent
def load_metadata_for(resource_id)
  path = metadata_path_for(resource_id)
  File.exist?(path) ? read_metadata(path) : nil
end
329
+
330
# Route a cached-resource load to the loader for its type.
#
# @param resource_id [String] The resource identifier
# @param type [String] The resource type
# @param metadata [Hash] The resource metadata
# @return [Hash, nil] Loaded resource, or nil for an unrecognised type
def load_cached_resource_by_type(resource_id, type, metadata)
  if type == "spelling"
    load_cached_spelling(resource_id, metadata)
  elsif type == "grammar"
    load_cached_grammar(resource_id, metadata)
  elsif type == "frequency"
    load_cached_frequency(resource_id, metadata)
  end
end
343
+
344
# Describe an already-cached spelling dictionary.
#
# No file is opened here; the paths are derived from the resource
# directory and returned together with the given metadata.
#
# @param resource_id [String] The resource identifier
# @param metadata [Hash] The resource metadata
# @return [Hash] :aff_path, :dic_path, cached: true and :metadata
def load_cached_spelling(resource_id, metadata)
  directory = resource_dir_for(resource_id)
  aff_file, dic_file = %w[index.aff index.dic].map { |name| File.join(directory, name) }

  { aff_path: aff_file, dic_path: dic_file, cached: true, metadata: metadata }
end
358
+
359
# Describe already-cached grammar rules.
#
# :rules_path is the resource directory itself, not the rules.yaml file.
#
# @param resource_id [String] The resource identifier
# @param metadata [Hash] The resource metadata
# @return [Hash] :rules_path, cached: true and :metadata
def load_cached_grammar(resource_id, metadata)
  directory = resource_dir_for(resource_id)

  { rules_path: directory, cached: true, metadata: metadata }
end
372
+
373
# Load already-cached frequency data through the common-words loader.
#
# The loader is required lazily, before the file check. Unlike the
# spelling and grammar loaders, the returned hash has no :cached key.
#
# @param resource_id [String] The resource identifier
# @param metadata [Hash] The resource metadata
# @return [Hash, nil] :frequency_path, :tiers and :metadata, or nil when
#   frequency.json is missing
def load_cached_frequency(resource_id, metadata)
  require_relative "../data/common_words_loader"

  data_file = File.join(resource_dir_for(resource_id), "frequency.json")
  return nil unless File.exist?(data_file)

  parsed = Data::CommonWordsLoader.load_from_frequency_file(data_file)
  { frequency_path: data_file, tiers: parsed[:tiers], metadata: metadata }
end
390
+
391
# Assemble the metadata record stored next to a cached resource.
#
# :version and :cached_at are both the current UTC time in ISO 8601
# form; :url is the cache's configured @url_base.
#
# @param language [String] Language code
# @param type [String] Resource type
# @param content_checksum [String] Checksum of the content (stored as given)
# @return [Hash] Metadata hash
def build_metadata(language, type, content_checksum)
  record = {}
  record[:version] = Time.now.utc.iso8601
  record[:url] = @url_base
  record[:language] = language
  record[:type] = type
  record[:checksum] = content_checksum
  record[:cached_at] = Time.now.utc.iso8601
  record
end
407
+
408
+ public
409
+
410
# Locate the metadata file for a resource:
# <cache>/languages/<language>/<type>/metadata.json.
#
# @param resource_id [String] The resource identifier
# @return [String] Metadata file path
def metadata_path_for(resource_id)
  segments = [
    @cache_path,
    "languages",
    extract_language(resource_id),
    extract_type(resource_id),
    "metadata.json"
  ]
  File.join(*segments)
end
419
+
420
# Locate the directory for a resource: <cache>/languages/<language>/<type>.
#
# @param resource_id [String] The resource identifier
# @return [String] Resource directory path
def resource_dir_for(resource_id)
  segments = [@cache_path, "languages", extract_language(resource_id), extract_type(resource_id)]
  File.join(*segments)
end
429
+
430
# Tell whether every file a resource needs is present on disk.
#
# Spelling needs index.aff and index.dic, grammar needs rules.yaml and
# frequency needs frequency.json. An id without a type, or with an
# unrecognised type, is reported as missing.
#
# @param resource_id [String] The resource identifier
# @return [Boolean] True if all files exist
def resource_files_exist?(resource_id)
  type = extract_type(resource_id)
  return false unless type

  directory = resource_dir_for(resource_id)
  required = {
    "spelling" => %w[index.aff index.dic],
    "grammar" => %w[rules.yaml],
    "frequency" => %w[frequency.json]
  }[type]
  return false unless required

  required.all? { |name| File.exist?(File.join(directory, name)) }
end
452
+ end
453
+ end
454
+ end