RubyGems - kotoshu - Versions diffs - 0.3.0 - Mend

kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (210) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.rubocop.yml +18 -0
data/CHANGELOG.md +182 -0
data/CLAUDE.md +172 -0
data/CODE_OF_CONDUCT.md +132 -0
data/LICENSE +31 -0
data/README.adoc +955 -0
data/Rakefile +12 -0
data/SECURITY.md +93 -0
data/examples/01_basic_word_checking.rb +38 -0
data/examples/02_text_document_checking.rb +77 -0
data/examples/03_dictionary_backends.rb +137 -0
data/examples/04_trie_data_structure.rb +146 -0
data/examples/05_suggestion_algorithms.rb +239 -0
data/examples/06_configuration_advanced.rb +287 -0
data/examples/07_multi_language_dictionaries.rb +278 -0
data/exe/kotoshu +6 -0
data/lib/kotoshu/algorithms/capitalization.rb +276 -0
data/lib/kotoshu/algorithms/lookup.rb +876 -0
data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
data/lib/kotoshu/algorithms/permutations.rb +283 -0
data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
data/lib/kotoshu/algorithms/suggest.rb +575 -0
data/lib/kotoshu/algorithms.rb +14 -0
data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
data/lib/kotoshu/cache/base_cache.rb +596 -0
data/lib/kotoshu/cache/cache.rb +91 -0
data/lib/kotoshu/cache/frequency_cache.rb +224 -0
data/lib/kotoshu/cache/language_cache.rb +454 -0
data/lib/kotoshu/cache/lookup_cache.rb +166 -0
data/lib/kotoshu/cache/model_cache.rb +513 -0
data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
data/lib/kotoshu/cache.rb +40 -0
data/lib/kotoshu/cli/auto_setup.rb +71 -0
data/lib/kotoshu/cli/batch_reporter.rb +315 -0
data/lib/kotoshu/cli/cache_command.rb +356 -0
data/lib/kotoshu/cli/display_formatter.rb +431 -0
data/lib/kotoshu/cli/errors.rb +36 -0
data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
data/lib/kotoshu/cli/language_resolver.rb +91 -0
data/lib/kotoshu/cli/navigation_manager.rb +272 -0
data/lib/kotoshu/cli/progress_reporter.rb +114 -0
data/lib/kotoshu/cli/status_report.rb +130 -0
data/lib/kotoshu/cli.rb +627 -0
data/lib/kotoshu/commands/cache_command.rb +424 -0
data/lib/kotoshu/commands/check_command.rb +312 -0
data/lib/kotoshu/commands/model_command.rb +295 -0
data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
data/lib/kotoshu/components/pos_tagger.rb +98 -0
data/lib/kotoshu/components/spell_checker.rb +73 -0
data/lib/kotoshu/components/synthesizer.rb +60 -0
data/lib/kotoshu/components/tokenizer.rb +58 -0
data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
data/lib/kotoshu/configuration/builder.rb +209 -0
data/lib/kotoshu/configuration/resolver.rb +124 -0
data/lib/kotoshu/configuration.rb +702 -0
data/lib/kotoshu/core/exceptions.rb +165 -0
data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
data/lib/kotoshu/core/models/affix_rule.rb +260 -0
data/lib/kotoshu/core/models/result/document_result.rb +263 -0
data/lib/kotoshu/core/models/result/word_result.rb +203 -0
data/lib/kotoshu/core/models/word.rb +142 -0
data/lib/kotoshu/core/trie/builder.rb +119 -0
data/lib/kotoshu/core/trie/node.rb +94 -0
data/lib/kotoshu/core/trie/trie.rb +249 -0
data/lib/kotoshu/core.rb +28 -0
data/lib/kotoshu/data/common_words/de.yml +1800 -0
data/lib/kotoshu/data/common_words/en.yml +1215 -0
data/lib/kotoshu/data/common_words/es.yml +750 -0
data/lib/kotoshu/data/common_words/fr.yml +1015 -0
data/lib/kotoshu/data/common_words/pt.yml +870 -0
data/lib/kotoshu/data/common_words/ru.yml +484 -0
data/lib/kotoshu/data/common_words_loader.rb +152 -0
data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
data/lib/kotoshu/debug_logger.rb +146 -0
data/lib/kotoshu/debug_mode.rb +134 -0
data/lib/kotoshu/defaults.rb +86 -0
data/lib/kotoshu/dictionaries/catalog.rb +817 -0
data/lib/kotoshu/dictionary/base.rb +237 -0
data/lib/kotoshu/dictionary/cspell.rb +254 -0
data/lib/kotoshu/dictionary/custom.rb +224 -0
data/lib/kotoshu/dictionary/hunspell.rb +526 -0
data/lib/kotoshu/dictionary/plain_text.rb +282 -0
data/lib/kotoshu/dictionary/repository.rb +248 -0
data/lib/kotoshu/dictionary/unified.rb +260 -0
data/lib/kotoshu/dictionary/unix_words.rb +218 -0
data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
data/lib/kotoshu/documents/document.rb +229 -0
data/lib/kotoshu/documents/location.rb +139 -0
data/lib/kotoshu/documents/markdown_document.rb +389 -0
data/lib/kotoshu/documents/plain_text_document.rb +147 -0
data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
data/lib/kotoshu/embeddings/protocol.rb +83 -0
data/lib/kotoshu/embeddings/protocols.rb +17 -0
data/lib/kotoshu/embeddings/registry.rb +182 -0
data/lib/kotoshu/embeddings/search.rb +192 -0
data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
data/lib/kotoshu/embeddings.rb +97 -0
data/lib/kotoshu/fluent_checker.rb +91 -0
data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
data/lib/kotoshu/grammar/rule.rb +95 -0
data/lib/kotoshu/grammar/rule_engine.rb +111 -0
data/lib/kotoshu/grammar/rule_loader.rb +31 -0
data/lib/kotoshu/grammar.rb +18 -0
data/lib/kotoshu/integrity/audit_log.rb +88 -0
data/lib/kotoshu/integrity/manifest.rb +117 -0
data/lib/kotoshu/integrity/net_http.rb +46 -0
data/lib/kotoshu/integrity.rb +25 -0
data/lib/kotoshu/keyboard/layout.rb +115 -0
data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
data/lib/kotoshu/keyboard/registry.rb +146 -0
data/lib/kotoshu/keyboard.rb +60 -0
data/lib/kotoshu/language/detector.rb +242 -0
data/lib/kotoshu/language/identifier.rb +378 -0
data/lib/kotoshu/language/languages/base.rb +256 -0
data/lib/kotoshu/language/normalizer/base.rb +137 -0
data/lib/kotoshu/language/registry.rb +147 -0
data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
data/lib/kotoshu/language/tokenizer/base.rb +170 -0
data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
data/lib/kotoshu/language.rb +99 -0
data/lib/kotoshu/languages/de/language.rb +546 -0
data/lib/kotoshu/languages/en/language.rb +448 -0
data/lib/kotoshu/languages/es/language.rb +459 -0
data/lib/kotoshu/languages/fr/language.rb +493 -0
data/lib/kotoshu/languages/ja/language.rb +477 -0
data/lib/kotoshu/languages/pt/language.rb +423 -0
data/lib/kotoshu/languages/ru/language.rb +404 -0
data/lib/kotoshu/languages.rb +43 -0
data/lib/kotoshu/metrics_collector.rb +222 -0
data/lib/kotoshu/metrics_module.rb +110 -0
data/lib/kotoshu/models/context.rb +119 -0
data/lib/kotoshu/models/embedding_model.rb +182 -0
data/lib/kotoshu/models/fasttext_model.rb +220 -0
data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
data/lib/kotoshu/models/onnx_model.rb +333 -0
data/lib/kotoshu/models/semantic_error.rb +165 -0
data/lib/kotoshu/models/suggestion.rb +106 -0
data/lib/kotoshu/models/word_embedding.rb +107 -0
data/lib/kotoshu/paths.rb +53 -0
data/lib/kotoshu/personal_dictionary.rb +94 -0
data/lib/kotoshu/plugins/plugin.rb +61 -0
data/lib/kotoshu/plugins/registry.rb +120 -0
data/lib/kotoshu/project_config.rb +76 -0
data/lib/kotoshu/readers/aff_data.rb +356 -0
data/lib/kotoshu/readers/aff_reader.rb +375 -0
data/lib/kotoshu/readers/condition_checker.rb +142 -0
data/lib/kotoshu/readers/dic_reader.rb +118 -0
data/lib/kotoshu/readers/file_reader.rb +347 -0
data/lib/kotoshu/readers/lookup_builder.rb +299 -0
data/lib/kotoshu/readers/readers.rb +6 -0
data/lib/kotoshu/readers.rb +9 -0
data/lib/kotoshu/resource_bundle.rb +30 -0
data/lib/kotoshu/resource_manager.rb +295 -0
data/lib/kotoshu/results/result.rb +165 -0
data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
data/lib/kotoshu/source_registry.rb +74 -0
data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
data/lib/kotoshu/spellchecker.rb +298 -0
data/lib/kotoshu/string_metrics.rb +153 -0
data/lib/kotoshu/suggestions/context.rb +55 -0
data/lib/kotoshu/suggestions/generator.rb +175 -0
data/lib/kotoshu/suggestions/pipeline.rb +135 -0
data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
data/lib/kotoshu/suggestions/suggestion.rb +174 -0
data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
data/lib/kotoshu/version.rb +5 -0
data/lib/kotoshu.rb +493 -0
data/script/validate_all_dictionaries.rb +444 -0
data/sig/kotoshu.rbs +4 -0
data/test_oop.rb +79 -0
metadata +298 -0

data/lib/kotoshu/commands/check_command.rb ADDED Viewed

@@ -0,0 +1,312 @@
+# frozen_string_literal: true
+require_relative '../documents/document'
+require_relative '../analyzers/semantic_analyzer'
+require_relative '../models/fasttext_model'
+require_relative '../cache/model_cache'
+require_relative '../cli/interactive_reviewer'
+require_relative '../cli/batch_reporter'
+require_relative '../language/identifier'
+module Kotoshu
+  class CheckCommand < Thor
+    namespace :check
+    class_option :language, aliases: '-l', type: :string, default: 'auto',
+             desc: 'Language code (auto, de, en, es, fr, pt, ru)'
+    class_option :interactive, aliases: '-i', type: :boolean, default: false,
+             desc: 'Interactive mode for error review'
+    class_option :output, aliases: '-o', type: :string,
+             desc: 'Output file path (for batch mode)'
+    class_option :format, type: :string, enum: %w[text json yaml csv sarif], default: 'text',
+             desc: 'Output format (text, json, yaml, csv, sarif)'
+    class_option :model, type: :string, enum: %w[fasttext hunspell], default: 'hunspell',
+             desc: 'Analysis model (fasttext, hunspell)'
+    class_option :download, type: :boolean, default: true,
+             desc: 'Automatically download models if missing'
+    class_option :verbose, aliases: '-v', type: :boolean, default: false,
+             desc: 'Verbose output'
+    desc 'check FILE', 'Check spelling/grammar in a file'
+    def check(file)
+      # Validate file exists
+      unless File.exist?(file)
+        puts "Error: File not found: #{file}"
+        exit 1
+      end
+      # Detect language if auto
+      language = detect_language(file, options[:language])
+      # Load document
+      document = load_document(file, language)
+      # Load analyzer based on model type
+      analyzer = load_analyzer(language, options[:model])
+      puts "Analyzing #{file} (language: #{language})..." if options[:verbose]
+      # Run interactive or batch mode
+      if options[:interactive]
+        run_interactive_mode(document, analyzer)
+      else
+        run_batch_mode(document, analyzer)
+      end
+    end
+    desc 'string TEXT', 'Check spelling/grammar in a text string'
+    option :format, type: :string, enum: %w[text markdown], default: 'text',
+             desc: 'Text format (text, markdown)'
+    def string(text)
+      language_code = options[:language]
+      # Create document from string
+      format_sym = options[:format].to_sym
+      document = Documents::Document.from_string(text, language_code: language_code)
+      # Load analyzer
+      analyzer = load_analyzer(language_code, options[:model])
+      puts "Analyzing..." if options[:verbose]
+      # Always use batch mode for string input
+      reporter = run_batch_mode(document, analyzer)
+      # Print report
+      reporter.print(format: options[:format].to_sym)
+      # Exit with appropriate code
+      exit reporter.exit_code
+    end
+    desc 'stdin', 'Check spelling/grammar from stdin'
+    option :format, type: :string, enum: %w[text markdown], default: 'text',
+             desc: 'Text format (text, markdown)'
+    def stdin
+      text = $stdin.read
+      if text.nil? || text.empty?
+        puts "Error: No input provided"
+        exit 1
+      end
+      # Delegate to string command
+      invoke :string, [text], options
+    end
+    private
+    # Detect language from file or use specified language.
+    #
+    # @param filepath [String] Path to file
+    # @param language_code [String] Specified language code or 'auto'
+    # @return [String] Detected or specified language code
+    def detect_language(filepath, language_code)
+      return language_code unless language_code == 'auto'
+      puts "Detecting language..." if options[:verbose]
+      begin
+        lid = Language::LanguageIdentifier.new
+        result = lid.detect_from_file(filepath, top_k: 1).first
+        if result && result.confidence > 0.8
+          detected = result.language
+          puts "  Detected: #{detected} (#{(result.confidence * 100).round(0)}% confidence)" if options[:verbose]
+          detected
+        else
+          puts "  Language detection uncertain, using 'en'" if options[:verbose]
+          'en'
+        end
+      rescue StandardError => e
+        puts "  Language detection failed: #{e.message}" if options[:verbose]
+        puts "  Using 'en' as default" if options[:verbose]
+        'en'
+      end
+    end
+    # Load document from file.
+    #
+    # @param filepath [String] Path to file
+    # @param language_code [String] Language code
+    # @return [Documents::Document] Loaded document
+    def load_document(filepath, language_code = 'en')
+      Documents::Document.from_file(filepath, language_code: language_code)
+    rescue StandardError => e
+      puts "Error loading document: #{e.message}"
+      exit 1
+    end
+    # Load analyzer based on model type.
+    #
+    # @param language_code [String] Language code
+    # @param model_type [String] Model type
+    # @return [Object] Analyzer instance
+    def load_analyzer(language_code, model_type)
+      case model_type
+      when 'fasttext'
+        load_fasttext_analyzer(language_code)
+      when 'hunspell'
+        load_hunspell_analyzer(language_code)
+      else
+        raise ArgumentError, "Unknown model type: #{model_type}"
+      end
+    end
+    # Load FastText analyzer using ONNX model.
+    #
+    # ONNX is the ONLY supported format. No fallbacks.
+    #
+    # @param language_code [String] Language code
+    # @return [Analyzers::SemanticAnalyzer] FastText analyzer with ONNX model
+    def load_fasttext_analyzer(language_code)
+      cache = Cache::ModelCache.new
+      onnx_file = cache.get_onnx_model(language_code, force_download: options[:download])
+      unless onnx_file && File.exist?(onnx_file)
+        puts "Error: ONNX model not found for #{language_code}"
+        puts ""
+        puts "Download the model first:"
+        puts "  kotoshu model download #{language_code} --type onnx"
+        puts ""
+        puts "Or convert from FastText .vec file:"
+        puts "  kotoshu model convert cc.#{language_code}.300.vec fasttext.#{language_code}.onnx -l #{language_code}"
+        exit 1
+      end
+      puts "Loading ONNX model for #{language_code}..." if options[:verbose]
+      model = Models::OnnxModel.from_file(onnx_file)
+      model.preload_embedding_matrix if options[:verbose]
+      Analyzers::SemanticAnalyzer.new(model)
+    rescue StandardError => e
+      puts "Error loading FastText analyzer: #{e.message}"
+      puts ""
+      puts "Ensure ONNX Runtime is installed:"
+      puts "  gem install onnxruntime"
+      exit 1
+    end
+    # Load Hunspell analyzer.
+    #
+    # @param language_code [String] Language code
+    # @return [Object] Hunspell analyzer
+    def load_hunspell_analyzer(language_code)
+      require_relative '../dictionary/hunspell'
+      # Load Hunspell dictionary
+      if options[:download]
+        puts "Loading Hunspell dictionary for #{language_code}..." if options[:verbose]
+        dict = Dictionary::Hunspell.from_github(language_code)
+      else
+        # Try local paths
+        dict = Dictionary::Hunspell.for_language(language_code)
+      end
+      # Create Hunspell-based analyzer
+      # Note: This would use HunspellDictionary for checking + EditDistanceStrategy for suggestions
+      # For now, we'll use a placeholder
+      require_relative '../spell_checker'
+      SpellChecker.new(dictionary: dict, language: language_code)
+    rescue StandardError => e
+      puts "Error loading Hunspell analyzer: #{e.message}"
+      exit 1
+    end
+    # Run interactive mode.
+    #
+    # @param document [Documents::Document] Document to check
+    # @param analyzer [Object] Analyzer instance
+    def run_interactive_mode(document, analyzer)
+      # Create interactive reviewer
+      reviewer = Cli::InteractiveReviewer.new(document, analyzer)
+      unless reviewer.has_errors?
+        puts "No errors found!"
+        return
+      end
+      # Run interactive loop
+      reviewer.run
+      # Apply corrections if user accepted any
+      if reviewer.navigation.modified.any?
+        apply_corrections(document, reviewer.navigation)
+      end
+    end
+    # Run batch mode.
+    #
+    # @param document [Documents::Document] Document to check
+    # @param analyzer [Object] Analyzer instance
+    # @return [Cli::BatchReporter] Batch reporter
+    def run_batch_mode(document, analyzer)
+      # For batch mode with Hunspell, we need different approach
+      if analyzer.is_a?(SpellChecker)
+        # Use traditional spell checker
+        result = analyzer.check_string(document.content)
+        # Convert result to navigation...
+        # This is a placeholder - full implementation would convert
+      end
+      # For SemanticAnalyzer, create reviewer and get batch reporter
+      if analyzer.is_a?(Analyzers::SemanticAnalyzer)
+        reviewer = Cli::InteractiveReviewer.new(document, analyzer)
+        reporter = reviewer.run_batch
+        # Write to file if specified
+        if options[:output]
+          case options[:format]
+          when 'json'
+            reporter.to_json(filepath: options[:output])
+          when 'yaml'
+            reporter.to_yaml(filepath: options[:output])
+          when 'csv'
+            reporter.to_csv(filepath: options[:output])
+          when 'sarif'
+            reporter.to_sarif(filepath: options[:output])
+          else
+            File.write(options[:output], reporter.to_text)
+          end
+          puts "Report written to: #{options[:output]}" if options[:verbose]
+        end
+        return reporter
+      end
+      # Fallback
+      nil
+    end
+    # Apply corrections to document.
+    #
+    # @param document [Documents::Document] Original document
+    # @param navigation [Cli::NavigationManager] Navigation state with corrections
+    def apply_corrections(document, navigation)
+      corrections = navigation.export_corrections
+      if corrections.empty?
+        return
+      end
+      # Apply corrections
+      corrected_doc = document.apply(corrections.map { |c|
+        # Convert correction hash to SemanticError
+        # This is a placeholder - full implementation would reconstruct errors
+      }.compact)
+      # Write corrected document
+      backup_path = document.name + ".bak"
+      output_path = document.name
+      # Create backup
+      File.write(backup_path, document.content)
+      # Write corrected version
+      File.write(output_path, corrected_doc.content)
+      puts "Created backup: #{backup_path}" if options[:verbose]
+      puts "Wrote corrections to: #{output_path}"
+    end
+  end
+end

data/lib/kotoshu/commands/model_command.rb ADDED Viewed

@@ -0,0 +1,295 @@
+# frozen_string_literal: true
+require 'thor'
+require_relative '../models/fasttext_model'
+require_relative '../models/onnx_model'
+require_relative '../cache/model_cache'
+module Kotoshu
+  class ModelCommand < Thor
+    namespace :model
+    desc 'convert INPUT OUTPUT', 'Convert FastText .vec file to ONNX format'
+    option :language, aliases: '-l', type: :string, required: true,
+             desc: 'Language code (de, en, es, fr, pt, ru)'
+    option :max_vectors, type: :numeric, default: 500_000,
+             desc: 'Maximum vectors to convert (default: 500k)'
+    option :validate, type: :boolean, default: true,
+             desc: 'Validate model after conversion'
+    def convert(input, output)
+      puts "Converting #{input} to #{output}..."
+      # Check if input file exists
+      unless File.exist?(input)
+        puts "Error: Input file not found: #{input}"
+        exit 1
+      end
+      # Build Python command
+      script_path = File.join(File.dirname(__FILE__), '../../scripts/convert_fasttext_to_onnx.py')
+      unless File.exist?(script_path)
+        puts "Error: Conversion script not found: #{script_path}"
+        exit 1
+      end
+      # Build command
+      cmd = [
+        'python3',
+        script_path,
+        '--input', input,
+        '--output', output,
+        '--language', options[:language],
+        '--max-vectors', options[:max_vectors].to_s
+      ]
+      cmd << '--validate' if options[:validate]
+      puts "Running: #{cmd.join(' ')}"
+      # Execute conversion
+      system(*cmd)
+      if $?.success?
+        puts "\n✓ Conversion successful!"
+        puts "  Model: #{output}"
+        puts "  Vocab: #{output.sub('.onnx', '.vocab.json')}"
+        puts "  Metadata: #{output.sub('.onnx', '.metadata.json')}"
+        puts "  Optimized: #{output.sub('.onnx', '.ort.onnx')}"
+      else
+        puts "\n✗ Conversion failed!"
+        exit 1
+      end
+    end
+    desc 'download LANGUAGE', 'Download FastText model for a language'
+    option :type, type: :string, enum: %w[fasttext onnx], default: 'fasttext',
+             desc: 'Model type to download'
+    option :output, type: :string,
+             desc: 'Output path (default: $XDG_CACHE_HOME/kotoshu/languages/{code}/models/)'
+    option :force, type: :boolean, default: false,
+             desc: 'Force re-download even if cached'
+    def download(language)
+      puts "Downloading #{options[:type]} model for #{language}..."
+      cache = Cache::ModelCache.new
+      case options[:type]
+      when 'fasttext'
+        vec_file = cache.get_fasttext_model(language, force_download: options[:force])
+        puts "✓ Downloaded to: #{vec_file}"
+      when 'onnx'
+        onnx_file = cache.get_onnx_model(language, force_download: options[:force])
+        puts "✓ Downloaded to: #{onnx_file}"
+      end
+      # Show file info
+      show_model_info(language)
+    end
+    desc 'info LANGUAGE', 'Show information about available models'
+    option :type, type: :string, enum: %w[fasttext onnx],
+             desc: 'Model type to show (default: all)'
+    def info(language)
+      cache = Cache::ModelCache.new
+      puts "Model information for #{language}:"
+      puts ""
+      if options[:type].nil? || options[:type] == 'fasttext'
+        model_info = cache.model_info(language, :fasttext)
+        if model_info
+          puts "FastText:"
+          puts "  File: #{model_info[:file]}"
+          puts "  Size: #{model_info[:size].to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse} vectors"
+          puts "  Source: #{model_info[:source]}"
+          puts ""
+        end
+      end
+      if options[:type].nil? || options[:type] == 'onnx'
+        model_info = cache.model_info(language, :onnx)
+        if model_info
+          puts "ONNX:"
+          puts "  File: #{model_info[:file]}"
+          puts "  Source: #{model_info[:source]}"
+          puts ""
+        end
+      end
+    end
+    desc 'list', 'List all available models'
+    def list
+      cache = Cache::ModelCache.new
+      all_models = cache.all_available_models
+      puts "Available models:"
+      puts ""
+      all_models.each do |model_type, languages|
+        puts "#{model_type.to_s.capitalize}:"
+        languages.each do |code, info|
+          puts "  #{code}:"
+          puts "    File: #{info[:file]}"
+          puts "    Source: #{info[:source]}"
+        end
+        puts ""
+      end
+    end
+    desc 'validate MODEL_PATH', 'Validate an ONNX model'
+    def validate(model_path)
+      puts "Validating #{model_path}..."
+      unless File.exist?(model_path)
+        puts "Error: Model file not found: #{model_path}"
+        exit 1
+      end
+      # Try to load the model
+      begin
+        model = Models::OnnxModel.from_file(model_path)
+        puts "✓ Model loaded successfully"
+        puts "  Language: #{model.language_code}"
+        puts "  Dimension: #{model.dimension}"
+        puts "  Vocabulary: #{model.vocabulary_size} words"
+        # Test lookup
+        test_word = model.vocabulary.first
+        if test_word
+          embedding = model.embedding_for(test_word)
+          puts "  Test lookup: '#{test_word}' -> vector of size #{embedding.vector.size}"
+        end
+        puts "\n✓ Model is valid!"
+      rescue StandardError => e
+        puts "✗ Validation failed: #{e.message}"
+        exit 1
+      end
+    end
+    desc 'upload LANGUAGE MODEL_FILE', 'Upload model to dictionaries repository'
+    option :repo, type: :string, default: 'kotoshu/dictionaries',
+             desc: 'GitHub repository'
+    option :branch, type: :string, default: 'main',
+             desc: 'Target branch'
+    option :create_pr, type: :boolean, default: false,
+             desc: 'Create pull request instead of direct push'
+    def upload(language, model_file)
+      puts "Uploading #{model_file} to #{options[:repo]}..."
+      # Check if file exists
+      unless File.exist?(model_file)
+        puts "Error: File not found: #{model_file}"
+        exit 1
+      end
+      # Determine model type and destination path
+      if model_file.end_with?('.vec')
+        model_type = 'fasttext'
+        filename = File.basename(model_file)
+        dest_path = "#{language}/models/fasttext/#{filename}"
+      elsif model_file.end_with?('.onnx')
+        model_type = 'onnx'
+        filename = File.basename(model_file)
+        dest_path = "#{language}/models/onnx/#{filename}"
+        # Also upload vocab and metadata files
+        vocab_file = model_file.sub('.onnx', '.vocab.json')
+        metadata_file = model_file.sub('.onnx', '.metadata.json')
+        ort_file = model_file.sub('.onnx', '.ort.onnx')
+      else
+        puts "Error: Unknown file type. Expected .vec or .onnx"
+        exit 1
+      end
+      # Build gh command
+      cmd = [
+        'gh', 'repo', 'clone', options[:repo], '/tmp/kotoshu-dictionaries'
+      ]
+      puts "Cloning repository..."
+      system(*cmd)
+      unless $?.success?
+        puts "Error: Failed to clone repository"
+        exit 1
+      end
+      # Copy files to destination
+      target_dir = File.join('/tmp/kotoshu-dictionaries', File.dirname(dest_path))
+      FileUtils.mkdir_p(target_dir)
+      FileUtils.cp(model_file, File.join('/tmp/kotoshu-dictionaries', dest_path))
+      if model_type == 'onnx'
+        if File.exist?(vocab_file)
+          FileUtils.cp(vocab_file, File.join('/tmp/kotoshu-dictionaries', dest_path.sub('.onnx', '.vocab.json')))
+        end
+        if File.exist?(metadata_file)
+          FileUtils.cp(metadata_file, File.join('/tmp/kotoshu-dictionaries', dest_path.sub('.onnx', '.metadata.json')))
+        end
+        if File.exist?(ort_file)
+          FileUtils.cp(ort_file, File.join('/tmp/kotoshu-dictionaries', dest_path.sub('.onnx', '.ort.onnx')))
+        end
+      end
+      # Commit and push
+      Dir.chdir('/tmp/kotoshu-dictionaries') do
+        system('git', 'add', '.')
+        message = "Add #{model_type} model for #{language}\n\n"
+        message += "Model: #{filename}\n"
+        message += "Language: #{language}\n"
+        system('git', 'commit', '-m', message)
+        if options[:create_pr]
+          # Create branch and PR
+          branch_name = "add-#{model_type}-#{language}"
+          system('git', 'checkout', '-b', branch_name)
+          system('git', 'push', 'origin', branch_name)
+          system('gh', 'pr', 'create', '--title', "Add #{model_type} model for #{language}", '--body', message)
+        else
+          # Direct push
+          system('git', 'push')
+        end
+      end
+      if $?.success?
+        puts "✓ Upload successful!"
+        puts "  Path: #{dest_path}"
+        puts "  Repository: #{options[:repo]}"
+      else
+        puts "✗ Upload failed!"
+        exit 1
+      end
+    end
+    private
+    # Show model file information.
+    #
+    # @param language [String] Language code
+    def show_model_info(language)
+      cache = Cache::ModelCache.new
+      model_path = File.join(cache.instance_variable_get(:@cache_path), language, 'models')
+      if Dir.exist?(model_path)
+        puts "\nModel files:"
+        Dir.glob(File.join(model_path, '**/*')).each do |file|
+          next if File.directory?(file)
+          size = File.size(file)
+          size_mb = (size.to_f / 1024 / 1024).round(2)
+          puts "  #{File.basename(file)}: #{size_mb} MB"
+        end
+      end
+    end
+  end
+end

data/lib/kotoshu/components/passthrough_spell_checker.rb ADDED Viewed

@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+require_relative 'spell_checker'
+module Kotoshu
+  module Components
+    # Passthrough spell checker for languages that don't use spell checking.
+    #
+    # This checker always returns that words are "found" (correct). It's used
+    # for languages that don't have traditional spell checking, such as:
+    # - CJK languages (Japanese, Chinese) - use confusion rules instead
+    # - Languages with purely rule-based checking
+    #
+    # @example
+    #   checker = PassthroughSpellChecker.new
+    #   result = checker.check('任意のテキスト')
+    #   # => { found: true, stem: nil, flags: [] }
+    #
+    # @example Getting suggestions (always empty)
+    #   suggestions = checker.suggest('テキスト')
+    #   # => []
+    class PassthroughSpellChecker < SpellChecker
+      # Create a new passthrough spell checker.
+      #
+      # @param reason [String] Optional reason why spell checking is not used
+      def initialize(reason: nil)
+        @reason = reason || "Language does not use spell checking"
+      end
+      # Always returns that the word is "found" (correct).
+      #
+      # @param _word [String] The word to check (ignored)
+      # @return [Hash] Always returns { found: true, stem: nil, flags: [] }
+      def check(_word)
+        { found: true, stem: nil, flags: [] }
+      end
+      # Returns no suggestions.
+      #
+      # Passthrough spell checkers don't provide suggestions.
+      #
+      # @param _word [String] The word (ignored)
+      # @param _max_suggestions [Integer] Max suggestions (ignored)
+      # @return [Array<Hash>] Always returns empty array
+      def suggest(_word, _max_suggestions: 10)
+        []
+      end
+      # Always returns true (all words are "correct").
+      #
+      # @param _word [String] The word to check (ignored)
+      # @return [Boolean] Always true
+      def correct?(_word)
+        true
+      end
+      # Get the reason why spell checking is not used.
+      #
+      # @return [String] Reason text
+      def reason
+        @reason
+      end
+      # Check if this is a passthrough checker.
+      #
+      # @return [Boolean] Always true for this class
+      def passthrough?
+        true
+      end
+    end
+  end
+end