kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,166 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "cache"
4
+
5
module Kotoshu
  module Cache
    # LRU (Least Recently Used) cache for fast lookups.
    #
    # Every entry carries the value of a monotonically increasing counter
    # taken at its last use. When a new key is written while the cache is
    # full, the entry with the smallest counter is evicted.
    #
    # An entry counts as "used" when it is written, read, or returned from
    # {#fetch}. Overwriting a key that is already cached never evicts
    # another entry.
    #
    # @example Basic usage
    #   cache = LookupCache.new(max_size: 1000)
    #   cache.write("key", "value")
    #   cache.read("key") # => "value"
    #
    # @example Using fetch for lazy computation
    #   cache.fetch("expensive_key") { compute_expensive_value() }
    class LookupCache
      include Cache

      # Default maximum cache size
      DEFAULT_MAX_SIZE = 1000

      # @return [Integer] Maximum number of entries
      attr_reader :max_size

      # Create a new LRU cache.
      #
      # @param max_size [Integer] Maximum number of entries (default: 1000)
      def initialize(max_size: DEFAULT_MAX_SIZE)
        @max_size = max_size
        @data = {} # key => [value, access_order]
        @access_order = 0
        @stats = { hits: 0, misses: 0 }
      end

      # Retrieve a value from cache, or compute and store it.
      #
      # A hit refreshes the entry's recency, exactly like {#read}. A cached
      # +nil+ or +false+ value is still a hit.
      #
      # @param key [Object] The cache key
      # @param default [Object] Optional default value (if no block given)
      # @yield Block to compute value on cache miss
      # @return [Object] The cached or computed value
      def fetch(key, default = nil)
        entry = @data[key]

        if entry
          record_hit
          mark_used(entry)
          entry[0]
        else
          record_miss
          write(key, block_given? ? yield : default)
        end
      end

      # Write a value to cache.
      #
      # Eviction only happens when +key+ is new: replacing the value of an
      # existing key does not grow the cache, so nothing has to make room.
      #
      # @param key [Object] The cache key
      # @param value [Object] The value to store
      # @return [Object] The stored value
      def write(key, value)
        evict_if_needed unless @data.key?(key)

        @access_order += 1
        @data[key] = [value, @access_order]

        value
      end

      # Read a value from cache.
      #
      # @param key [Object] The cache key
      # @return [Object, nil] The cached value or nil
      def read(key)
        entry = @data[key]

        if entry
          record_hit
          mark_used(entry)
          entry[0]
        else
          record_miss
          nil
        end
      end

      # Delete a value from cache.
      #
      # @param key [Object] The cache key
      # @return [Object, nil] The deleted value or nil
      def delete(key)
        entry = @data.delete(key)
        entry&.first
      end

      # Clear all entries from cache. Hit/miss statistics are kept; use
      # {#reset_stats} to clear those.
      #
      # @return [self] Self for chaining
      def clear
        @data.clear
        @access_order = 0
        self
      end

      # Check if key exists in cache. Does not affect recency or statistics.
      #
      # @param key [Object] The cache key
      # @return [Boolean] True if key exists
      def key?(key)
        @data.key?(key)
      end

      # Get number of entries in cache.
      #
      # @return [Integer] Number of entries
      def size
        @data.size
      end

      # Get cache statistics.
      #
      # @return [Hash] Statistics including :hits, :misses, :size, :hit_rate
      #   (:hit_rate is rounded to 4 decimals and is 0.0 before any lookup)
      def stats
        hits = @stats[:hits]
        misses = @stats[:misses]
        total = hits + misses
        hit_rate = total.positive? ? hits.to_f / total : 0.0

        {
          hits: hits,
          misses: misses,
          size: size,
          hit_rate: hit_rate.round(4)
        }
      end

      # Reset statistics counters.
      #
      # @return [self] Self for chaining
      def reset_stats
        @stats = { hits: 0, misses: 0 }
        self
      end

      private

      # Record a cache hit.
      def record_hit
        @stats[:hits] += 1
      end

      # Record a cache miss.
      def record_miss
        @stats[:misses] += 1
      end

      # Stamp an entry as the most recently used one.
      #
      # @param entry [Array] The [value, access_order] pair stored in @data
      def mark_used(entry)
        @access_order += 1
        entry[1] = @access_order
      end

      # Evict least recently used entry if cache is full.
      def evict_if_needed
        return if @data.size < @max_size

        # Linear scan for the entry with the lowest access order.
        lru_key = @data.min_by { |_, (_value, order)| order }&.first
        @data.delete(lru_key) if lru_key
      end
    end
  end
end
@@ -0,0 +1,513 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_cache"
4
+ require "zlib"
5
+ require "open-uri"
6
+ require "open3"
7
+
8
module Kotoshu
  module Cache
    # Manages embedding model downloads from FastText CDN and GitHub.
    #
    # Extends BaseCache to support FastText .vec files and ONNX models.
    # Downloads FastText models from Facebook's public CDN.
    #
    # Resources are addressed as "<language>:<type>" (e.g. "en:fasttext") and
    # stored under <cache_path>/<language>/models/<type>/ next to a
    # metadata.json file.
    #
    # NOTE(review): +get+, +download_file+, +write_metadata+,
    # +read_metadata+, +extract_language+, +extract_type+ and the
    # @cache_path / @url_base / @source_registry ivars are not defined in
    # this file; they are presumably provided by BaseCache.
    #
    # @example Downloading a FastText model
    #   cache = ModelCache.new
    #   vec_file = cache.get_fasttext_model('en')
    #   model = FastTextModel.from_file(vec_file)
    #
    # @example Downloading an ONNX model
    #   onnx_file = cache.get_onnx_model('en')
    class ModelCache < BaseCache
      # Available models in FastText CDN and models-fasttext-onnx repository.
      # Keyed by model type, then by language code.
      AVAILABLE_MODELS = {
        # FastText crawl vectors (300D) from Facebook Research
        # https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/
        # Selected high-resource languages
        fasttext: {
          de: { file: "cc.de.300.vec.gz", size: 1_000_000, source: "FastText Common Crawl" },
          en: { file: "cc.en.300.vec.gz", size: 2_000_000, source: "FastText Common Crawl" },
          es: { file: "cc.es.300.vec.gz", size: 1_000_000, source: "FastText Common Crawl" },
          fr: { file: "cc.fr.300.vec.gz", size: 1_000_000, source: "FastText Common Crawl" },
          pt: { file: "cc.pt.300.vec.gz", size: 1_000_000, source: "FastText Common Crawl" },
          ru: { file: "cc.ru.300.vec.gz", size: 1_000_000, source: "FastText Common Crawl" }
        },
        # ONNX models (active set) from models-fasttext-onnx repository.
        # Sizes synced with manifest.json in kotoshu/models-fasttext-onnx.
        # The repo holds .onnx for 158 languages but only the 9 below are
        # tracked and exposed — to promote a language, see
        # models-fasttext-onnx/.gitignore and re-sync this constant.
        # https://github.com/kotoshu/models-fasttext-onnx
        onnx: {
          de: { file: "fasttext.de.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          en: { file: "fasttext.en.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          es: { file: "fasttext.es.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          fr: { file: "fasttext.fr.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          pt: { file: "fasttext.pt.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          ru: { file: "fasttext.ru.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          zh: { file: "fasttext.zh.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          ja: { file: "fasttext.ja.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          ko: { file: "fasttext.ko.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
        }
      }.freeze

      # Get or download FastText model for a language.
      #
      # @param language_code [String] ISO 639-1 language code
      # @param force_download [Boolean] Force re-download
      # @return [String, nil] Path to downloaded .vec file
      def get_fasttext_model(language_code, force_download: false)
        result = get("#{language_code}:fasttext", force_download: force_download)

        result&.dig(:model_path)
      end

      # Get or download ONNX model for a language.
      #
      # @param language_code [String] ISO 639-1 language code
      # @param force_download [Boolean] Force re-download
      # @return [String, nil] Path to downloaded .onnx file
      def get_onnx_model(language_code, force_download: false)
        result = get("#{language_code}:onnx", force_download: force_download)

        result&.dig(:model_path)
      end

      # Get available model types for a language.
      #
      # @param language_code [String] ISO 639-1 language code
      # @return [Array<Symbol>] Available model types (:fasttext, :onnx)
      def available_models_for(language_code)
        lang = language_code.to_sym
        AVAILABLE_MODELS.keys.select { |type| AVAILABLE_MODELS[type].key?(lang) }
      end

      # Get model info for a language and type.
      #
      # @param language_code [String] ISO 639-1 language code
      # @param model_type [Symbol] Model type (:fasttext, :onnx)
      # @return [Hash, nil] Model info or nil if not available
      def model_info(language_code, model_type)
        AVAILABLE_MODELS.dig(model_type, language_code.to_sym)
      end

      # List all available models.
      #
      # @return [Hash] The AVAILABLE_MODELS table: model type => language =>
      #   model info
      def all_available_models
        AVAILABLE_MODELS
      end

      # Check if a resource type is supported.
      #
      # @param resource_id [String] The resource identifier (e.g., "en:fasttext")
      # @return [Boolean] True if supported
      def supports_resource?(resource_id)
        parts = resource_id.split(":")
        return false unless parts.size == 2

        language, type = parts
        # fetch with a {} fallback keeps the result a real Boolean for
        # unknown model types.
        AVAILABLE_MODELS.fetch(type.to_sym, {}).key?(language.to_sym)
      end

      # List all cached resources.
      #
      # Scans the cache for metadata.json files laid out as
      # <language>/models/<type>/metadata.json.
      #
      # @return [Array<String>] List of cached resource identifiers
      def cached_resources
        root = Pathname.new(@cache_path)

        Dir.glob(File.join(@cache_path, "**", "metadata.json")).map do |path|
          language, _models_dir, model_type =
            Pathname.new(path).relative_path_from(root).each_filename.to_a
          "#{language}:#{model_type}" # language:model_type
        end.uniq
      end

      protected

      # Download a specific resource (implements abstract method).
      #
      # @param resource_id [String] The resource identifier
      # @param dest_path [String] Destination directory
      # @return [Hash, nil] Downloaded model info ({ model_path:, metadata: }),
      #   or nil when the resource id is malformed or not listed in
      #   AVAILABLE_MODELS
      def download_resource(resource_id, dest_path)
        language = extract_language(resource_id)
        type = extract_type(resource_id)
        return nil unless language && type

        info = model_entry(language, type)
        return nil unless info

        FileUtils.mkdir_p(dest_path)

        filename = info[:file]

        # Handle ONNX with try-download-first approach
        return download_or_convert_onnx(language, dest_path, filename) if type == "onnx"

        url = model_url(language, type, filename)

        # Gzip archives are stored decompressed, without the .gz suffix
        final_filename = filename.delete_suffix('.gz')
        model_file = File.join(dest_path, final_filename)

        if url.end_with?('.gz')
          download_and_decompress(url, model_file)
        else
          download_file(url, model_file)
        end

        metadata = build_model_metadata(language, type, final_filename, url, model_file)
        write_metadata(File.join(dest_path, "metadata.json"), metadata)

        { model_path: model_file, metadata: metadata }
      end

      # Load cached resource data (implements abstract method).
      #
      # @param resource_id [String] The resource identifier
      # @return [Hash, nil] Loaded model info, or nil when the resource is
      #   unknown or its metadata/model file is missing
      def load_cached(resource_id)
        language = extract_language(resource_id)
        type = extract_type(resource_id)
        return nil unless language && type

        info = model_entry(language, type)
        return nil unless info

        metadata_path = metadata_path_for(resource_id)
        return nil unless File.exist?(metadata_path)

        metadata = read_metadata(metadata_path)
        return nil unless metadata

        model_file = File.join(resource_dir_for(resource_id), stored_filename(info))
        return nil unless File.exist?(model_file)

        { model_path: model_file, metadata: metadata }
      end

      # Get metadata file path for a resource.
      #
      # @param resource_id [String] The resource identifier
      # @return [String] Metadata file path
      def metadata_path_for(resource_id)
        File.join(resource_dir_for(resource_id), "metadata.json")
      end

      # Get resource directory path.
      #
      # @param resource_id [String] The resource identifier
      # @return [String] Resource directory path
      def resource_dir_for(resource_id)
        language = extract_language(resource_id)
        type = extract_type(resource_id)
        File.join(@cache_path, language, "models", type)
      end

      # Check if all resource files exist.
      #
      # @param resource_id [String] The resource identifier
      # @return [Boolean] True if the (decompressed) model file exists and is
      #   not empty
      def resource_files_exist?(resource_id)
        language = extract_language(resource_id)
        type = extract_type(resource_id)
        return false unless language && type

        info = model_entry(language, type)
        return false unless info

        model_file = File.join(resource_dir_for(resource_id), stored_filename(info))
        File.exist?(model_file) && File.size(model_file).positive?
      end

      private

      # Look up the AVAILABLE_MODELS entry for a language/type pair without
      # raising when the type is unknown.
      #
      # @param language [String] Language code
      # @param type [String] Model type
      # @return [Hash, nil] Model info or nil
      def model_entry(language, type)
        AVAILABLE_MODELS.dig(type.to_sym, language.to_sym)
      end

      # Name under which a model file is stored on disk (gzip archives are
      # stored decompressed, so a trailing .gz is dropped).
      #
      # @param info [Hash] Entry from AVAILABLE_MODELS
      # @return [String] On-disk filename
      def stored_filename(info)
        info[:file].delete_suffix('.gz')
      end

      # Build metadata hash for a model.
      #
      # @param language [String] Language code
      # @param type [String] Model type
      # @param filename [String] Model filename
      # @param url [String] Download URL
      # @param model_file [String] Path to downloaded model file
      # @return [Hash] Metadata hash
      def build_model_metadata(language, type, filename, url, model_file)
        # Single timestamp so :version and :cached_at always agree.
        timestamp = Time.now.utc.iso8601

        {
          version: timestamp,
          url: url,
          language: language,
          type: type,
          file: filename,
          checksum: Digest::SHA256.file(model_file).hexdigest,
          cached_at: timestamp
        }
      end

      # Get URL for a model file.
      #
      # @param language [String] Language code
      # @param type [String] Model type
      # @param filename [String] Model filename (unused for "onnx", where the
      #   source registry builds the full URL)
      # @return [String, nil] Download URL
      def model_url(language, type, filename)
        case type
        when "fasttext"
          # Download from FastText CDN (Facebook Research)
          # https://fasttext.cc/docs/en/english-vectors.html
          "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/#{filename}"
        when "onnx"
          # Download from models-fasttext-onnx GitHub repository.
          # SourceRegistry owns the per-repo pin so we never accidentally
          # fall back to the dictionaries pin.
          @source_registry.url_for(:model, lang: language)
        else
          "#{@url_base}/dictionaries/main/#{language}/models/#{type}/#{filename}"
        end
      end

      # URL for the vocab.json sibling file. The conversion script ships
      # vocabularies alongside the .onnx so OnnxModel.from_file can resolve
      # word→index without re-parsing the FastText .vec.
      #
      # @param language [String] Language code
      # @return [String]
      def vocab_url(language)
        @source_registry.url_for(:model_vocab, lang: language)
      end

      # Download and decompress gzip file.
      #
      # The archive is streamed to "<dest_path>.gz" and decompressed into
      # "<dest_path>.part"; only a complete, non-empty result is renamed to
      # +dest_path+. Both temporary files are removed on success and on
      # failure, so an interrupted run never leaves a truncated model that
      # later looks like a valid cache entry.
      #
      # @param url [String] URL to gzip file
      # @param dest_path [String] Destination path (without .gz)
      # @raise [RuntimeError] If the download or the decompressed output is
      #   empty or missing
      def download_and_decompress(url, dest_path)
        temp_gz = "#{dest_path}.gz"
        temp_out = "#{dest_path}.part"

        puts " Downloading from #{url.split('/').last}..." if $VERBOSE

        downloaded_bytes = 0
        URI.open(url, open_timeout: 30, read_timeout: 300) do |remote|
          File.open(temp_gz, 'wb') do |f|
            # Stream instead of remote.read: archives can be far larger
            # than is reasonable to hold in memory.
            downloaded_bytes = IO.copy_stream(remote, f)
          end
        end

        puts " Downloaded: #{(downloaded_bytes.to_f / 1024 / 1024).round(2)} MB" if $VERBOSE

        # Verify the download succeeded
        unless File.exist?(temp_gz) && File.size(temp_gz).positive?
          raise "Download failed: #{temp_gz} is empty or missing"
        end

        puts " Decompressing..." if $VERBOSE

        gunzip_file(temp_gz, temp_out)

        # Verify the decompression succeeded
        unless File.exist?(temp_out) && File.size(temp_out).positive?
          raise "Decompression failed: #{dest_path} is empty or missing"
        end

        # Replaces any previous (possibly partial) file at dest_path.
        File.rename(temp_out, dest_path)

        puts " ✓ Downloaded and decompressed" if $VERBOSE
      ensure
        [temp_gz, temp_out].each do |leftover|
          File.delete(leftover) if leftover && File.exist?(leftover)
        end
      end

      # Decompress a gzip file in 64KB chunks, reporting progress roughly
      # every 10MB when $VERBOSE is set.
      #
      # @param gz_path [String] Path to the gzip archive
      # @param out_path [String] Path to write the decompressed data to
      def gunzip_file(gz_path, out_path)
        chunk_size = 65_536 # 64KB chunks
        report_every = 10 * 1024 * 1024
        next_report = report_every
        written = 0

        File.open(gz_path, 'rb') do |gz_file|
          Zlib::GzipReader.wrap(gz_file) do |gzip|
            File.open(out_path, 'wb') do |out_file|
              while (chunk = gzip.read(chunk_size))
                written += out_file.write(chunk)
                next unless $VERBOSE && written >= next_report

                puts " Decompressed: #{(written.to_f / 1024 / 1024).round(1)} MB..."
                next_report += report_every
              end
            end
          end
        end
      end

      # Convert FastText .vec file to ONNX format.
      #
      # Fetches the FastText model for +language+ (via +get+), then runs the
      # bundled scripts/fasttext_to_onnx.py through python3.
      #
      # @param language [String] Language code
      # @param dest_path [String] Destination directory
      # @param onnx_filename [String] Output ONNX filename
      # @return [Hash] Converted model info
      # @raise [RuntimeError] If the FastText model, the conversion script or
      #   the conversion output is missing, or the script exits non-zero
      def convert_to_onnx(language, dest_path, onnx_filename)
        puts "Converting FastText to ONNX for #{language}..." if $VERBOSE

        # First, ensure we have the FastText .vec file
        fasttext_result = get("#{language}:fasttext", force_download: false)

        unless fasttext_result
          raise "Failed to get FastText model for #{language} needed for ONNX conversion"
        end

        vec_file = fasttext_result[:model_path]
        raise "FastText .vec file not found: #{vec_file}" unless File.exist?(vec_file)

        onnx_file = File.join(dest_path, onnx_filename)

        script_path = File.expand_path('../scripts/fasttext_to_onnx.py', __dir__)
        raise "ONNX conversion script not found: #{script_path}" unless File.exist?(script_path)

        # Use --vocab-size to limit vocabulary size and reduce conversion time.
        # Metadata may carry symbol keys (freshly built) or string keys
        # (presumably when re-read from metadata.json), so accept both.
        fasttext_metadata = fasttext_result[:metadata] || {}
        vocab_size =
          (fasttext_metadata["vocab_size"] || fasttext_metadata[:vocab_size])&.to_i || 100_000

        cmd = [
          'python3',
          script_path,
          vec_file,
          onnx_file,
          '--vocab-size', vocab_size.to_s
        ]

        puts " Running conversion: #{shell_join(cmd)}" if $VERBOSE

        # Argument-array form: no shell is involved, so paths need no quoting.
        stdout, stderr, status = Open3.capture3(*cmd)

        raise "ONNX conversion failed:\n#{stdout}\n#{stderr}" unless status.success?

        puts stdout if $VERBOSE

        unless File.exist?(onnx_file)
          raise "ONNX conversion failed: #{onnx_file} was not created"
        end

        metadata = build_model_metadata(
          language, "onnx", onnx_filename, "converted:#{vec_file}", onnx_file
        ).merge(
          source_model: File.basename(vec_file),
          conversion_method: "fasttext_to_onnx.py"
        )

        write_metadata(File.join(dest_path, "metadata.json"), metadata)

        puts " ✓ ONNX conversion complete" if $VERBOSE

        { model_path: onnx_file, metadata: metadata }
      end

      # Try to download ONNX from GitHub, fall back to conversion if download fails.
      #
      # Any StandardError raised while downloading, verifying or recording
      # the model triggers the fallback. A failure to fetch the vocab.json
      # sibling is tolerated and does not trigger it.
      #
      # @param language [String] Language code
      # @param dest_path [String] Destination directory
      # @param onnx_filename [String] ONNX filename
      # @return [Hash] Downloaded or converted model info
      def download_or_convert_onnx(language, dest_path, onnx_filename)
        url = model_url(language, "onnx", onnx_filename)
        onnx_file = File.join(dest_path, onnx_filename)

        puts " Attempting download from GitHub..." if $VERBOSE

        begin
          download_file(url, onnx_file)

          unless File.exist?(onnx_file) && File.size(onnx_file).positive?
            raise "Download failed: empty file"
          end

          # Pull the matching vocab.json so OnnxModel.from_file can resolve
          # word→index without re-parsing the source FastText .vec.
          begin
            download_file(vocab_url(language),
                          File.join(dest_path, "fasttext.#{language}.vocab.json"))
          rescue StandardError => e
            warn " vocab.json unavailable for #{language}: #{e.message}" if $VERBOSE
          end

          puts " ✓ Downloaded from GitHub" if $VERBOSE

          metadata = build_model_metadata(language, "onnx", onnx_filename, url, onnx_file)
                     .merge(source: "github")

          write_metadata(File.join(dest_path, "metadata.json"), metadata)

          { model_path: onnx_file, metadata: metadata }
        rescue StandardError => e
          puts " GitHub download failed: #{e.message}" if $VERBOSE
          puts " Falling back to local conversion..." if $VERBOSE

          # Remove partial download if any
          File.delete(onnx_file) if File.exist?(onnx_file)

          convert_to_onnx(language, dest_path, onnx_filename)
        end
      end

      # Join command arguments into a single string for display purposes
      # only. Arguments containing whitespace are wrapped in single quotes;
      # the result is not shell-safe and must not be executed.
      #
      # @param args [Array<String>] Command arguments
      # @return [String] Joined command string
      def shell_join(args)
        args.map { |arg| arg.match?(/\s/) ? "'#{arg}'" : arg }.join(' ')
      end

      # Default cache path: $XDG_CACHE_HOME/kotoshu/models
      #
      # @return [String] Default cache path
      def default_cache_path
        File.join(Kotoshu::Paths.cache_path, "models")
      end

      # Default cache TTL (30 days for models).
      #
      # @return [Integer] Default TTL in seconds
      def default_cache_ttl
        2_592_000 # 30 days
      end
    end
  end
end