RubyGems - kotoshu - Versions diffs - 0.3.0 - Mend

kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (210) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.rubocop.yml +18 -0
data/CHANGELOG.md +182 -0
data/CLAUDE.md +172 -0
data/CODE_OF_CONDUCT.md +132 -0
data/LICENSE +31 -0
data/README.adoc +955 -0
data/Rakefile +12 -0
data/SECURITY.md +93 -0
data/examples/01_basic_word_checking.rb +38 -0
data/examples/02_text_document_checking.rb +77 -0
data/examples/03_dictionary_backends.rb +137 -0
data/examples/04_trie_data_structure.rb +146 -0
data/examples/05_suggestion_algorithms.rb +239 -0
data/examples/06_configuration_advanced.rb +287 -0
data/examples/07_multi_language_dictionaries.rb +278 -0
data/exe/kotoshu +6 -0
data/lib/kotoshu/algorithms/capitalization.rb +276 -0
data/lib/kotoshu/algorithms/lookup.rb +876 -0
data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
data/lib/kotoshu/algorithms/permutations.rb +283 -0
data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
data/lib/kotoshu/algorithms/suggest.rb +575 -0
data/lib/kotoshu/algorithms.rb +14 -0
data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
data/lib/kotoshu/cache/base_cache.rb +596 -0
data/lib/kotoshu/cache/cache.rb +91 -0
data/lib/kotoshu/cache/frequency_cache.rb +224 -0
data/lib/kotoshu/cache/language_cache.rb +454 -0
data/lib/kotoshu/cache/lookup_cache.rb +166 -0
data/lib/kotoshu/cache/model_cache.rb +513 -0
data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
data/lib/kotoshu/cache.rb +40 -0
data/lib/kotoshu/cli/auto_setup.rb +71 -0
data/lib/kotoshu/cli/batch_reporter.rb +315 -0
data/lib/kotoshu/cli/cache_command.rb +356 -0
data/lib/kotoshu/cli/display_formatter.rb +431 -0
data/lib/kotoshu/cli/errors.rb +36 -0
data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
data/lib/kotoshu/cli/language_resolver.rb +91 -0
data/lib/kotoshu/cli/navigation_manager.rb +272 -0
data/lib/kotoshu/cli/progress_reporter.rb +114 -0
data/lib/kotoshu/cli/status_report.rb +130 -0
data/lib/kotoshu/cli.rb +627 -0
data/lib/kotoshu/commands/cache_command.rb +424 -0
data/lib/kotoshu/commands/check_command.rb +312 -0
data/lib/kotoshu/commands/model_command.rb +295 -0
data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
data/lib/kotoshu/components/pos_tagger.rb +98 -0
data/lib/kotoshu/components/spell_checker.rb +73 -0
data/lib/kotoshu/components/synthesizer.rb +60 -0
data/lib/kotoshu/components/tokenizer.rb +58 -0
data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
data/lib/kotoshu/configuration/builder.rb +209 -0
data/lib/kotoshu/configuration/resolver.rb +124 -0
data/lib/kotoshu/configuration.rb +702 -0
data/lib/kotoshu/core/exceptions.rb +165 -0
data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
data/lib/kotoshu/core/models/affix_rule.rb +260 -0
data/lib/kotoshu/core/models/result/document_result.rb +263 -0
data/lib/kotoshu/core/models/result/word_result.rb +203 -0
data/lib/kotoshu/core/models/word.rb +142 -0
data/lib/kotoshu/core/trie/builder.rb +119 -0
data/lib/kotoshu/core/trie/node.rb +94 -0
data/lib/kotoshu/core/trie/trie.rb +249 -0
data/lib/kotoshu/core.rb +28 -0
data/lib/kotoshu/data/common_words/de.yml +1800 -0
data/lib/kotoshu/data/common_words/en.yml +1215 -0
data/lib/kotoshu/data/common_words/es.yml +750 -0
data/lib/kotoshu/data/common_words/fr.yml +1015 -0
data/lib/kotoshu/data/common_words/pt.yml +870 -0
data/lib/kotoshu/data/common_words/ru.yml +484 -0
data/lib/kotoshu/data/common_words_loader.rb +152 -0
data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
data/lib/kotoshu/debug_logger.rb +146 -0
data/lib/kotoshu/debug_mode.rb +134 -0
data/lib/kotoshu/defaults.rb +86 -0
data/lib/kotoshu/dictionaries/catalog.rb +817 -0
data/lib/kotoshu/dictionary/base.rb +237 -0
data/lib/kotoshu/dictionary/cspell.rb +254 -0
data/lib/kotoshu/dictionary/custom.rb +224 -0
data/lib/kotoshu/dictionary/hunspell.rb +526 -0
data/lib/kotoshu/dictionary/plain_text.rb +282 -0
data/lib/kotoshu/dictionary/repository.rb +248 -0
data/lib/kotoshu/dictionary/unified.rb +260 -0
data/lib/kotoshu/dictionary/unix_words.rb +218 -0
data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
data/lib/kotoshu/documents/document.rb +229 -0
data/lib/kotoshu/documents/location.rb +139 -0
data/lib/kotoshu/documents/markdown_document.rb +389 -0
data/lib/kotoshu/documents/plain_text_document.rb +147 -0
data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
data/lib/kotoshu/embeddings/protocol.rb +83 -0
data/lib/kotoshu/embeddings/protocols.rb +17 -0
data/lib/kotoshu/embeddings/registry.rb +182 -0
data/lib/kotoshu/embeddings/search.rb +192 -0
data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
data/lib/kotoshu/embeddings.rb +97 -0
data/lib/kotoshu/fluent_checker.rb +91 -0
data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
data/lib/kotoshu/grammar/rule.rb +95 -0
data/lib/kotoshu/grammar/rule_engine.rb +111 -0
data/lib/kotoshu/grammar/rule_loader.rb +31 -0
data/lib/kotoshu/grammar.rb +18 -0
data/lib/kotoshu/integrity/audit_log.rb +88 -0
data/lib/kotoshu/integrity/manifest.rb +117 -0
data/lib/kotoshu/integrity/net_http.rb +46 -0
data/lib/kotoshu/integrity.rb +25 -0
data/lib/kotoshu/keyboard/layout.rb +115 -0
data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
data/lib/kotoshu/keyboard/registry.rb +146 -0
data/lib/kotoshu/keyboard.rb +60 -0
data/lib/kotoshu/language/detector.rb +242 -0
data/lib/kotoshu/language/identifier.rb +378 -0
data/lib/kotoshu/language/languages/base.rb +256 -0
data/lib/kotoshu/language/normalizer/base.rb +137 -0
data/lib/kotoshu/language/registry.rb +147 -0
data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
data/lib/kotoshu/language/tokenizer/base.rb +170 -0
data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
data/lib/kotoshu/language.rb +99 -0
data/lib/kotoshu/languages/de/language.rb +546 -0
data/lib/kotoshu/languages/en/language.rb +448 -0
data/lib/kotoshu/languages/es/language.rb +459 -0
data/lib/kotoshu/languages/fr/language.rb +493 -0
data/lib/kotoshu/languages/ja/language.rb +477 -0
data/lib/kotoshu/languages/pt/language.rb +423 -0
data/lib/kotoshu/languages/ru/language.rb +404 -0
data/lib/kotoshu/languages.rb +43 -0
data/lib/kotoshu/metrics_collector.rb +222 -0
data/lib/kotoshu/metrics_module.rb +110 -0
data/lib/kotoshu/models/context.rb +119 -0
data/lib/kotoshu/models/embedding_model.rb +182 -0
data/lib/kotoshu/models/fasttext_model.rb +220 -0
data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
data/lib/kotoshu/models/onnx_model.rb +333 -0
data/lib/kotoshu/models/semantic_error.rb +165 -0
data/lib/kotoshu/models/suggestion.rb +106 -0
data/lib/kotoshu/models/word_embedding.rb +107 -0
data/lib/kotoshu/paths.rb +53 -0
data/lib/kotoshu/personal_dictionary.rb +94 -0
data/lib/kotoshu/plugins/plugin.rb +61 -0
data/lib/kotoshu/plugins/registry.rb +120 -0
data/lib/kotoshu/project_config.rb +76 -0
data/lib/kotoshu/readers/aff_data.rb +356 -0
data/lib/kotoshu/readers/aff_reader.rb +375 -0
data/lib/kotoshu/readers/condition_checker.rb +142 -0
data/lib/kotoshu/readers/dic_reader.rb +118 -0
data/lib/kotoshu/readers/file_reader.rb +347 -0
data/lib/kotoshu/readers/lookup_builder.rb +299 -0
data/lib/kotoshu/readers/readers.rb +6 -0
data/lib/kotoshu/readers.rb +9 -0
data/lib/kotoshu/resource_bundle.rb +30 -0
data/lib/kotoshu/resource_manager.rb +295 -0
data/lib/kotoshu/results/result.rb +165 -0
data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
data/lib/kotoshu/source_registry.rb +74 -0
data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
data/lib/kotoshu/spellchecker.rb +298 -0
data/lib/kotoshu/string_metrics.rb +153 -0
data/lib/kotoshu/suggestions/context.rb +55 -0
data/lib/kotoshu/suggestions/generator.rb +175 -0
data/lib/kotoshu/suggestions/pipeline.rb +135 -0
data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
data/lib/kotoshu/suggestions/suggestion.rb +174 -0
data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
data/lib/kotoshu/version.rb +5 -0
data/lib/kotoshu.rb +493 -0
data/script/validate_all_dictionaries.rb +444 -0
data/sig/kotoshu.rbs +4 -0
data/test_oop.rb +79 -0
metadata +298 -0

data/lib/kotoshu/cache/lookup_cache.rb ADDED Viewed

@@ -0,0 +1,166 @@
+# frozen_string_literal: true
+require_relative "cache"
module Kotoshu
  module Cache
    # LRU (Least Recently Used) cache for fast lookups.
    #
    # Entries are stored as +[value, access_order]+ pairs. Every write and
    # every successful {#read} or {#fetch} stamps the entry with a fresh,
    # monotonically increasing counter. When inserting a *new* key would
    # exceed +max_size+, the entry with the lowest stamp is evicted.
    #
    # @example Basic usage
    #   cache = LookupCache.new(max_size: 1000)
    #   cache.write("key", "value")
    #   cache.read("key")  # => "value"
    #
    # @example Using fetch for lazy computation
    #   cache.fetch("expensive_key") { compute_expensive_value() }
    class LookupCache
      include Cache

      # Default maximum cache size
      DEFAULT_MAX_SIZE = 1000

      # @return [Integer] Maximum number of entries
      attr_reader :max_size

      # Create a new LRU cache.
      #
      # @param max_size [Integer] Maximum number of entries (default: 1000)
      def initialize(max_size: DEFAULT_MAX_SIZE)
        @max_size = max_size
        @data = {} # key => [value, access_order]
        @access_order = 0
        @stats = { hits: 0, misses: 0 }
      end

      # Retrieve a value from cache, or compute and store it.
      #
      # A hit counts as a use of the entry, so it refreshes the entry's
      # recency exactly like {#read}. On a miss the block result (or
      # +default+ when no block is given, including +nil+) is written to the
      # cache before being returned.
      #
      # @param key [Object] The cache key
      # @param default [Object] Optional default value (if no block given)
      # @yield Block to compute value on cache miss
      # @return [Object] The cached or computed value
      def fetch(key, default = nil)
        entry = @data[key]
        if entry
          record_hit
          mark_used(entry)
          return entry[0]
        end

        record_miss
        write(key, block_given? ? yield : default)
      end

      # Write a value to cache.
      #
      # Overwriting an existing key never evicts anything: the cache does not
      # grow, so only a brand-new key can trigger eviction.
      #
      # @param key [Object] The cache key
      # @param value [Object] The value to store
      # @return [Object] The stored value
      def write(key, value)
        evict_if_needed unless @data.key?(key)
        @access_order += 1
        @data[key] = [value, @access_order]
        value
      end

      # Read a value from cache and mark it as most recently used.
      #
      # @param key [Object] The cache key
      # @return [Object, nil] The cached value or nil
      def read(key)
        entry = @data[key]
        unless entry
          record_miss
          return nil
        end

        record_hit
        mark_used(entry)
        entry[0]
      end

      # Delete a value from cache.
      #
      # @param key [Object] The cache key
      # @return [Object, nil] The deleted value or nil
      def delete(key)
        @data.delete(key)&.first
      end

      # Clear all entries from cache. Hit/miss statistics are kept; use
      # {#reset_stats} to zero them.
      #
      # @return [self] Self for chaining
      def clear
        @data.clear
        @access_order = 0
        self
      end

      # Check if key exists in cache (does not affect recency or statistics).
      #
      # @param key [Object] The cache key
      # @return [Boolean] True if key exists
      def key?(key)
        @data.key?(key)
      end

      # Get number of entries in cache.
      #
      # @return [Integer] Number of entries
      def size
        @data.size
      end

      # Get cache statistics.
      #
      # @return [Hash] Statistics including :hits, :misses, :size, :hit_rate
      def stats
        hits = @stats[:hits]
        misses = @stats[:misses]
        total = hits + misses
        hit_rate = total.positive? ? hits.to_f / total : 0.0

        { hits: hits, misses: misses, size: size, hit_rate: hit_rate.round(4) }
      end

      # Reset statistics counters.
      #
      # @return [self] Self for chaining
      def reset_stats
        @stats = { hits: 0, misses: 0 }
        self
      end

      private

      # Record a cache hit.
      def record_hit
        @stats[:hits] += 1
      end

      # Record a cache miss.
      def record_miss
        @stats[:misses] += 1
      end

      # Stamp an entry as the most recently used one.
      #
      # @param entry [Array] The +[value, access_order]+ pair to update in place
      def mark_used(entry)
        @access_order += 1
        entry[1] = @access_order
      end

      # Evict least recently used entry if cache is full.
      def evict_if_needed
        return if @data.size < @max_size

        # Entry with the lowest access stamp is the least recently used.
        lru_key, = @data.min_by { |_, (_value, order)| order }
        @data.delete(lru_key) unless lru_key.nil? && !@data.key?(nil)
      end
    end
  end
end

data/lib/kotoshu/cache/model_cache.rb ADDED Viewed

@@ -0,0 +1,513 @@
+# frozen_string_literal: true
+require_relative "base_cache"
+require "zlib"
+require "open-uri"
+require "open3"
module Kotoshu
  module Cache
    # Manages embedding model downloads from FastText CDN and GitHub.
    #
    # Extends BaseCache to support FastText .vec files and ONNX models.
    # Downloads FastText models from Facebook's public CDN.
    #
    # Resources are addressed as "<language>:<type>" (e.g. "en:fasttext") and
    # stored under +<cache_path>/<language>/models/<type>/+ next to a
    # +metadata.json+ describing the download.
    #
    # @example Downloading a FastText model
    #   cache = ModelCache.new
    #   vec_file = cache.get_fasttext_model('en')
    #   model = FastTextModel.from_file(vec_file)
    #
    # @example Downloading an ONNX model
    #   onnx_file = cache.get_onnx_model('en')
    class ModelCache < BaseCache
      # Available models in FastText CDN and models-fasttext-onnx repository
      AVAILABLE_MODELS = {
        # FastText crawl vectors (300D) from Facebook Research
        # https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/
        # Selected high-resource languages
        fasttext: {
          de: { file: "cc.de.300.vec.gz", size: 1_000_000, source: "FastText Common Crawl" },
          en: { file: "cc.en.300.vec.gz", size: 2_000_000, source: "FastText Common Crawl" },
          es: { file: "cc.es.300.vec.gz", size: 1_000_000, source: "FastText Common Crawl" },
          fr: { file: "cc.fr.300.vec.gz", size: 1_000_000, source: "FastText Common Crawl" },
          pt: { file: "cc.pt.300.vec.gz", size: 1_000_000, source: "FastText Common Crawl" },
          ru: { file: "cc.ru.300.vec.gz", size: 1_000_000, source: "FastText Common Crawl" }
        },
        # ONNX models (active set) from models-fasttext-onnx repository.
        # Sizes synced with manifest.json in kotoshu/models-fasttext-onnx.
        # The repo holds .onnx for 158 languages but only the 9 below are
        # tracked and exposed — to promote a language, see
        # models-fasttext-onnx/.gitignore and re-sync this constant.
        # https://github.com/kotoshu/models-fasttext-onnx
        onnx: {
          de: { file: "fasttext.de.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          en: { file: "fasttext.en.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          es: { file: "fasttext.es.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          fr: { file: "fasttext.fr.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          pt: { file: "fasttext.pt.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          ru: { file: "fasttext.ru.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          zh: { file: "fasttext.zh.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          ja: { file: "fasttext.ja.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
          ko: { file: "fasttext.ko.onnx", size: 120_000_415, source: "models-fasttext-onnx" },
        }
      }.freeze

      # Suffix of gzip-compressed downloads; stripped from the stored filename.
      GZIP_SUFFIX = ".gz"
      # Vocabulary size handed to the conversion script when metadata has none.
      DEFAULT_VOCAB_SIZE = 100_000
      # Read size used while streaming gzip decompression (64KB).
      DECOMPRESS_CHUNK_SIZE = 65_536
      # Decompression progress is reported every this many bytes (10MB).
      DECOMPRESS_PROGRESS_INTERVAL = 10 * 1024 * 1024

      # Get or download FastText model for a language.
      #
      # @param language_code [String] ISO 639-1 language code
      # @param force_download [Boolean] Force re-download
      # @return [String, nil] Path to downloaded .vec file
      def get_fasttext_model(language_code, force_download: false)
        result = get("#{language_code}:fasttext", force_download: force_download)
        result&.dig(:model_path)
      end

      # Get or download ONNX model for a language.
      #
      # @param language_code [String] ISO 639-1 language code
      # @param force_download [Boolean] Force re-download
      # @return [String, nil] Path to downloaded .onnx file
      def get_onnx_model(language_code, force_download: false)
        result = get("#{language_code}:onnx", force_download: force_download)
        result&.dig(:model_path)
      end

      # Get available model types for a language.
      #
      # @param language_code [String] ISO 639-1 language code
      # @return [Array<Symbol>] Available model types (:fasttext, :onnx)
      def available_models_for(language_code)
        lang = language_code.to_sym
        AVAILABLE_MODELS.keys.select { |type| AVAILABLE_MODELS[type].key?(lang) }
      end

      # Get model info for a language and type.
      #
      # @param language_code [String, Symbol] ISO 639-1 language code
      # @param model_type [Symbol, String] Model type (:fasttext, :onnx)
      # @return [Hash, nil] Model info or nil if not available
      def model_info(language_code, model_type)
        AVAILABLE_MODELS.dig(model_type.to_sym, language_code.to_sym)
      end

      # List all available models.
      #
      # @return [Hash] Mapping of model type (:fasttext, :onnx) to a hash of
      #   language code => model info (+:file+, +:size+, +:source+)
      def all_available_models
        AVAILABLE_MODELS
      end

      # Check if a resource type is supported.
      #
      # @param resource_id [String] The resource identifier (e.g., "en:fasttext")
      # @return [Boolean] True if supported
      def supports_resource?(resource_id)
        parts = resource_id.split(":")
        return false unless parts.size == 2

        language, type = parts
        # Always a strict boolean, also for unknown model types.
        !model_info(language, type).nil?
      end

      # List all cached resources.
      #
      # Scans the cache for +<language>/models/<type>/metadata.json+ files,
      # i.e. the layout produced by {#metadata_path_for}.
      #
      # @return [Array<String>] List of cached resource identifiers
      def cached_resources
        pattern = File.join("*", "models", "*", "metadata.json")
        # With base: the returned paths are already relative to the cache root.
        Dir.glob(pattern, base: @cache_path).map do |relative|
          language, _models_dir, type = relative.split("/")
          "#{language}:#{type}" # language:model_type
        end.uniq
      end

      protected

      # Download a specific resource (implements abstract method).
      #
      # ONNX resources are downloaded from GitHub with a local-conversion
      # fallback; everything else is downloaded (and gunzipped when the URL
      # ends in .gz) and described by a freshly written metadata.json.
      #
      # @param resource_id [String] The resource identifier
      # @param dest_path [String] Destination directory
      # @return [Hash, nil] Downloaded model info (+:model_path+, +:metadata+),
      #   or nil when the resource is unknown
      def download_resource(resource_id, dest_path)
        language = extract_language(resource_id)
        type = extract_type(resource_id)
        return nil unless language && type

        info = model_info(language, type)
        return nil unless info

        FileUtils.mkdir_p(dest_path)
        filename = info[:file]
        return download_or_convert_onnx(language, dest_path, filename) if type == "onnx"

        url = model_url(language, type, filename)
        # Gzip downloads are stored decompressed, without the .gz extension.
        final_filename = model_storage_filename(filename)
        model_file = File.join(dest_path, final_filename)
        if url.end_with?(GZIP_SUFFIX)
          download_and_decompress(url, model_file)
        else
          download_file(url, model_file)
        end

        metadata = build_model_metadata(language, type, final_filename, url, model_file)
        write_metadata(File.join(dest_path, "metadata.json"), metadata)
        { model_path: model_file, metadata: metadata }
      end

      # Load cached resource data (implements abstract method).
      #
      # @param resource_id [String] The resource identifier
      # @return [Hash, nil] Loaded model info, or nil when the resource is
      #   unknown or its metadata/model file is missing
      def load_cached(resource_id)
        language = extract_language(resource_id)
        type = extract_type(resource_id)
        return nil unless language && type

        info = model_info(language, type)
        return nil unless info

        metadata_path = metadata_path_for(resource_id)
        return nil unless File.exist?(metadata_path)

        metadata = read_metadata(metadata_path)
        return nil unless metadata

        model_file = File.join(resource_dir_for(resource_id), model_storage_filename(info[:file]))
        return nil unless File.exist?(model_file)

        { model_path: model_file, metadata: metadata }
      end

      # Get metadata file path for a resource.
      #
      # @param resource_id [String] The resource identifier
      # @return [String] Metadata file path
      def metadata_path_for(resource_id)
        File.join(resource_dir_for(resource_id), "metadata.json")
      end

      # Get resource directory path.
      #
      # @param resource_id [String] The resource identifier
      # @return [String] Resource directory path
      def resource_dir_for(resource_id)
        language = extract_language(resource_id)
        type = extract_type(resource_id)
        File.join(@cache_path, language, "models", type)
      end

      # Check if all resource files exist.
      #
      # @param resource_id [String] The resource identifier
      # @return [Boolean] True if the (decompressed) model file exists and is
      #   non-empty
      def resource_files_exist?(resource_id)
        language = extract_language(resource_id)
        type = extract_type(resource_id)
        return false unless language && type

        info = model_info(language, type)
        return false unless info

        model_file = File.join(resource_dir_for(resource_id), model_storage_filename(info[:file]))
        # File.size? is nil for a missing or zero-length file.
        !File.size?(model_file).nil?
      end

      private

      # Name a catalogue file is stored under: gzip archives are kept
      # decompressed, so a trailing .gz is dropped.
      #
      # @param filename [String] Filename from AVAILABLE_MODELS
      # @return [String] Filename used inside the cache directory
      def model_storage_filename(filename)
        filename.delete_suffix(GZIP_SUFFIX)
      end

      # Build metadata hash for a model.
      #
      # @param language [String] Language code
      # @param type [String] Model type
      # @param filename [String] Model filename
      # @param url [String] Download URL
      # @param model_file [String] Path to downloaded model file
      # @return [Hash] Metadata hash
      def build_model_metadata(language, type, filename, url, model_file)
        # One timestamp so :version and :cached_at can never disagree.
        timestamp = Time.now.utc.iso8601
        {
          version: timestamp,
          url: url,
          language: language,
          type: type,
          file: filename,
          checksum: Digest::SHA256.file(model_file).hexdigest,
          cached_at: timestamp
        }
      end

      # Get URL for a model file.
      #
      # @param language [String] Language code
      # @param type [String] Model type
      # @param filename [String] Model filename
      # @return [String, nil] Download URL
      def model_url(language, type, filename)
        case type
        when "fasttext"
          # Download from FastText CDN (Facebook Research)
          # https://fasttext.cc/docs/en/english-vectors.html
          "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/#{filename}"
        when "onnx"
          # Download from models-fasttext-onnx GitHub repository.
          # SourceRegistry owns the per-repo pin so we never accidentally
          # fall back to the dictionaries pin.
          @source_registry.url_for(:model, lang: language)
        else
          "#{@url_base}/dictionaries/main/#{language}/models/#{type}/#{filename}"
        end
      end

      # URL for the vocab.json sibling file. The conversion script ships
      # vocabularies alongside the .onnx so OnnxModel.from_file can resolve
      # word→index without re-parsing the FastText .vec.
      #
      # @param language [String] Language code
      # @return [String]
      def vocab_url(language)
        @source_registry.url_for(:model_vocab, lang: language)
      end

      # Download and decompress gzip file.
      #
      # The archive is streamed to "<dest_path>.gz" and decompressed into
      # "<dest_path>.part"; only a complete, non-empty result is renamed to
      # +dest_path+, so an interrupted run never leaves a truncated model
      # that looks valid. Both temporary files are removed on every exit path.
      #
      # @param url [String] URL to gzip file
      # @param dest_path [String] Destination path (without .gz)
      # @raise [RuntimeError] If the download or the decompressed output is empty
      def download_and_decompress(url, dest_path)
        temp_gz = "#{dest_path}.gz"
        temp_out = "#{dest_path}.part"

        puts "  Downloading from #{url.split('/').last}..." if $VERBOSE
        downloaded_bytes = 0
        URI.open(url, open_timeout: 30, read_timeout: 300) do |remote|
          # Stream to disk instead of loading the whole archive into memory.
          File.open(temp_gz, "wb") { |gz_out| downloaded_bytes = IO.copy_stream(remote, gz_out) }
        end
        puts "  Downloaded: #{(downloaded_bytes.to_f / 1024 / 1024).round(2)} MB" if $VERBOSE

        raise "Download failed: #{temp_gz} is empty or missing" unless File.size?(temp_gz)

        puts "  Decompressing..." if $VERBOSE
        gunzip_to(temp_gz, temp_out)
        raise "Decompression failed: #{dest_path} is empty or missing" unless File.size?(temp_out)

        # Replaces any previous (possibly partial) file in one step.
        File.rename(temp_out, dest_path)
        puts "  ✓ Downloaded and decompressed" if $VERBOSE
      ensure
        [temp_gz, temp_out].each do |leftover|
          File.delete(leftover) if leftover && File.exist?(leftover)
        end
      end

      # Stream-decompress a gzip file in fixed-size chunks.
      #
      # @param gz_path [String] Path to the gzip archive
      # @param out_path [String] Path the decompressed data is written to
      def gunzip_to(gz_path, out_path)
        next_report = DECOMPRESS_PROGRESS_INTERVAL
        Zlib::GzipReader.open(gz_path) do |gzip|
          File.open(out_path, "wb") do |out_file|
            while (chunk = gzip.read(DECOMPRESS_CHUNK_SIZE))
              out_file.write(chunk)
              next unless $VERBOSE && out_file.pos >= next_report

              puts "    Decompressed: #{(out_file.pos.to_f / 1024 / 1024).round(1)} MB..."
              next_report += DECOMPRESS_PROGRESS_INTERVAL
            end
          end
        end
      end

      # Convert FastText .vec file to ONNX format.
      #
      # Fetches the FastText model through the cache, then runs the bundled
      # fasttext_to_onnx.py script with python3.
      #
      # @param language [String] Language code
      # @param dest_path [String] Destination directory
      # @param onnx_filename [String] Output ONNX filename
      # @return [Hash] Converted model info
      # @raise [RuntimeError] If the FastText model, the script, or the
      #   conversion output is missing, or the script exits unsuccessfully
      def convert_to_onnx(language, dest_path, onnx_filename)
        puts "Converting FastText to ONNX for #{language}..." if $VERBOSE

        # First, ensure we have the FastText .vec file
        fasttext_result = get("#{language}:fasttext", force_download: false)
        unless fasttext_result
          raise "Failed to get FastText model for #{language} needed for ONNX conversion"
        end

        vec_file = fasttext_result[:model_path]
        raise "FastText .vec file not found: #{vec_file}" unless vec_file && File.exist?(vec_file)

        onnx_file = File.join(dest_path, onnx_filename)
        script_path = File.expand_path("../scripts/fasttext_to_onnx.py", __dir__)
        raise "ONNX conversion script not found: #{script_path}" unless File.exist?(script_path)

        # Use --vocab-size to limit vocabulary size and reduce conversion time.
        # Metadata may carry Symbol keys (fresh download) or String keys
        # (presumably when re-read from metadata.json), so accept both.
        fasttext_metadata = fasttext_result[:metadata] || {}
        raw_vocab_size = fasttext_metadata[:vocab_size] || fasttext_metadata["vocab_size"]
        vocab_size = raw_vocab_size.to_i.positive? ? raw_vocab_size.to_i : DEFAULT_VOCAB_SIZE

        cmd = ["python3", script_path, vec_file, onnx_file, "--vocab-size", vocab_size.to_s]
        puts "  Running conversion: #{shell_join(cmd)}" if $VERBOSE

        # Array form: no shell is involved, so paths need no escaping.
        stdout, stderr, status = Open3.capture3(*cmd)
        raise "ONNX conversion failed:\n#{stdout}\n#{stderr}" unless status.success?

        puts stdout if $VERBOSE
        raise "ONNX conversion failed: #{onnx_file} is empty or missing" unless File.size?(onnx_file)

        metadata = build_model_metadata(language, "onnx", onnx_filename, "converted:#{vec_file}", onnx_file)
                   .merge(source_model: File.basename(vec_file), conversion_method: "fasttext_to_onnx.py")
        write_metadata(File.join(dest_path, "metadata.json"), metadata)
        puts "  ✓ ONNX conversion complete" if $VERBOSE
        { model_path: onnx_file, metadata: metadata }
      end

      # Try to download ONNX from GitHub, fall back to conversion if download fails.
      #
      # A missing vocab.json is tolerated (best effort); any other failure
      # removes the partial .onnx and triggers local conversion.
      #
      # @param language [String] Language code
      # @param dest_path [String] Destination directory
      # @param onnx_filename [String] ONNX filename
      # @return [Hash] Downloaded or converted model info
      def download_or_convert_onnx(language, dest_path, onnx_filename)
        url = model_url(language, "onnx", onnx_filename)
        onnx_file = File.join(dest_path, onnx_filename)
        puts "  Attempting download from GitHub..." if $VERBOSE

        begin
          download_file(url, onnx_file)
          raise "Download failed: empty file" unless File.size?(onnx_file)

          # Pull the matching vocab.json so OnnxModel.from_file can resolve
          # word→index without re-parsing the source FastText .vec.
          begin
            download_file(vocab_url(language),
                          File.join(dest_path, "fasttext.#{language}.vocab.json"))
          rescue StandardError => e
            warn "  vocab.json unavailable for #{language}: #{e.message}" if $VERBOSE
          end
          puts "  ✓ Downloaded from GitHub" if $VERBOSE

          metadata = build_model_metadata(language, "onnx", onnx_filename, url, onnx_file)
                     .merge(source: "github")
          write_metadata(File.join(dest_path, "metadata.json"), metadata)
          { model_path: onnx_file, metadata: metadata }
        rescue StandardError => e
          puts "  GitHub download failed: #{e.message}" if $VERBOSE
          puts "  Falling back to local conversion..." if $VERBOSE
          # Remove partial download if any
          File.delete(onnx_file) if File.exist?(onnx_file)
          convert_to_onnx(language, dest_path, onnx_filename)
        end
      end

      # Join command arguments for display purposes only. Arguments containing
      # whitespace are wrapped in single quotes without further escaping, so
      # the result is not meant to be executed by a shell.
      #
      # @param args [Array<String>] Command arguments
      # @return [String] Joined command string
      def shell_join(args)
        args.map { |arg| arg.match?(/\s/) ? "'#{arg}'" : arg }.join(" ")
      end

      # Default cache path: $XDG_CACHE_HOME/kotoshu/models
      #
      # @return [String] Default cache path
      def default_cache_path
        File.join(Kotoshu::Paths.cache_path, "models")
      end

      # Default cache TTL (30 days for models).
      #
      # @return [Integer] Default TTL in seconds
      def default_cache_ttl
        2_592_000 # 30 days
      end
    end
  end
end