RubyGems - kotoshu - Versions diffs - 0.3.0 - Mend

kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (210) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.rubocop.yml +18 -0
data/CHANGELOG.md +182 -0
data/CLAUDE.md +172 -0
data/CODE_OF_CONDUCT.md +132 -0
data/LICENSE +31 -0
data/README.adoc +955 -0
data/Rakefile +12 -0
data/SECURITY.md +93 -0
data/examples/01_basic_word_checking.rb +38 -0
data/examples/02_text_document_checking.rb +77 -0
data/examples/03_dictionary_backends.rb +137 -0
data/examples/04_trie_data_structure.rb +146 -0
data/examples/05_suggestion_algorithms.rb +239 -0
data/examples/06_configuration_advanced.rb +287 -0
data/examples/07_multi_language_dictionaries.rb +278 -0
data/exe/kotoshu +6 -0
data/lib/kotoshu/algorithms/capitalization.rb +276 -0
data/lib/kotoshu/algorithms/lookup.rb +876 -0
data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
data/lib/kotoshu/algorithms/permutations.rb +283 -0
data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
data/lib/kotoshu/algorithms/suggest.rb +575 -0
data/lib/kotoshu/algorithms.rb +14 -0
data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
data/lib/kotoshu/cache/base_cache.rb +596 -0
data/lib/kotoshu/cache/cache.rb +91 -0
data/lib/kotoshu/cache/frequency_cache.rb +224 -0
data/lib/kotoshu/cache/language_cache.rb +454 -0
data/lib/kotoshu/cache/lookup_cache.rb +166 -0
data/lib/kotoshu/cache/model_cache.rb +513 -0
data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
data/lib/kotoshu/cache.rb +40 -0
data/lib/kotoshu/cli/auto_setup.rb +71 -0
data/lib/kotoshu/cli/batch_reporter.rb +315 -0
data/lib/kotoshu/cli/cache_command.rb +356 -0
data/lib/kotoshu/cli/display_formatter.rb +431 -0
data/lib/kotoshu/cli/errors.rb +36 -0
data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
data/lib/kotoshu/cli/language_resolver.rb +91 -0
data/lib/kotoshu/cli/navigation_manager.rb +272 -0
data/lib/kotoshu/cli/progress_reporter.rb +114 -0
data/lib/kotoshu/cli/status_report.rb +130 -0
data/lib/kotoshu/cli.rb +627 -0
data/lib/kotoshu/commands/cache_command.rb +424 -0
data/lib/kotoshu/commands/check_command.rb +312 -0
data/lib/kotoshu/commands/model_command.rb +295 -0
data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
data/lib/kotoshu/components/pos_tagger.rb +98 -0
data/lib/kotoshu/components/spell_checker.rb +73 -0
data/lib/kotoshu/components/synthesizer.rb +60 -0
data/lib/kotoshu/components/tokenizer.rb +58 -0
data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
data/lib/kotoshu/configuration/builder.rb +209 -0
data/lib/kotoshu/configuration/resolver.rb +124 -0
data/lib/kotoshu/configuration.rb +702 -0
data/lib/kotoshu/core/exceptions.rb +165 -0
data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
data/lib/kotoshu/core/models/affix_rule.rb +260 -0
data/lib/kotoshu/core/models/result/document_result.rb +263 -0
data/lib/kotoshu/core/models/result/word_result.rb +203 -0
data/lib/kotoshu/core/models/word.rb +142 -0
data/lib/kotoshu/core/trie/builder.rb +119 -0
data/lib/kotoshu/core/trie/node.rb +94 -0
data/lib/kotoshu/core/trie/trie.rb +249 -0
data/lib/kotoshu/core.rb +28 -0
data/lib/kotoshu/data/common_words/de.yml +1800 -0
data/lib/kotoshu/data/common_words/en.yml +1215 -0
data/lib/kotoshu/data/common_words/es.yml +750 -0
data/lib/kotoshu/data/common_words/fr.yml +1015 -0
data/lib/kotoshu/data/common_words/pt.yml +870 -0
data/lib/kotoshu/data/common_words/ru.yml +484 -0
data/lib/kotoshu/data/common_words_loader.rb +152 -0
data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
data/lib/kotoshu/debug_logger.rb +146 -0
data/lib/kotoshu/debug_mode.rb +134 -0
data/lib/kotoshu/defaults.rb +86 -0
data/lib/kotoshu/dictionaries/catalog.rb +817 -0
data/lib/kotoshu/dictionary/base.rb +237 -0
data/lib/kotoshu/dictionary/cspell.rb +254 -0
data/lib/kotoshu/dictionary/custom.rb +224 -0
data/lib/kotoshu/dictionary/hunspell.rb +526 -0
data/lib/kotoshu/dictionary/plain_text.rb +282 -0
data/lib/kotoshu/dictionary/repository.rb +248 -0
data/lib/kotoshu/dictionary/unified.rb +260 -0
data/lib/kotoshu/dictionary/unix_words.rb +218 -0
data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
data/lib/kotoshu/documents/document.rb +229 -0
data/lib/kotoshu/documents/location.rb +139 -0
data/lib/kotoshu/documents/markdown_document.rb +389 -0
data/lib/kotoshu/documents/plain_text_document.rb +147 -0
data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
data/lib/kotoshu/embeddings/protocol.rb +83 -0
data/lib/kotoshu/embeddings/protocols.rb +17 -0
data/lib/kotoshu/embeddings/registry.rb +182 -0
data/lib/kotoshu/embeddings/search.rb +192 -0
data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
data/lib/kotoshu/embeddings.rb +97 -0
data/lib/kotoshu/fluent_checker.rb +91 -0
data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
data/lib/kotoshu/grammar/rule.rb +95 -0
data/lib/kotoshu/grammar/rule_engine.rb +111 -0
data/lib/kotoshu/grammar/rule_loader.rb +31 -0
data/lib/kotoshu/grammar.rb +18 -0
data/lib/kotoshu/integrity/audit_log.rb +88 -0
data/lib/kotoshu/integrity/manifest.rb +117 -0
data/lib/kotoshu/integrity/net_http.rb +46 -0
data/lib/kotoshu/integrity.rb +25 -0
data/lib/kotoshu/keyboard/layout.rb +115 -0
data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
data/lib/kotoshu/keyboard/registry.rb +146 -0
data/lib/kotoshu/keyboard.rb +60 -0
data/lib/kotoshu/language/detector.rb +242 -0
data/lib/kotoshu/language/identifier.rb +378 -0
data/lib/kotoshu/language/languages/base.rb +256 -0
data/lib/kotoshu/language/normalizer/base.rb +137 -0
data/lib/kotoshu/language/registry.rb +147 -0
data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
data/lib/kotoshu/language/tokenizer/base.rb +170 -0
data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
data/lib/kotoshu/language.rb +99 -0
data/lib/kotoshu/languages/de/language.rb +546 -0
data/lib/kotoshu/languages/en/language.rb +448 -0
data/lib/kotoshu/languages/es/language.rb +459 -0
data/lib/kotoshu/languages/fr/language.rb +493 -0
data/lib/kotoshu/languages/ja/language.rb +477 -0
data/lib/kotoshu/languages/pt/language.rb +423 -0
data/lib/kotoshu/languages/ru/language.rb +404 -0
data/lib/kotoshu/languages.rb +43 -0
data/lib/kotoshu/metrics_collector.rb +222 -0
data/lib/kotoshu/metrics_module.rb +110 -0
data/lib/kotoshu/models/context.rb +119 -0
data/lib/kotoshu/models/embedding_model.rb +182 -0
data/lib/kotoshu/models/fasttext_model.rb +220 -0
data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
data/lib/kotoshu/models/onnx_model.rb +333 -0
data/lib/kotoshu/models/semantic_error.rb +165 -0
data/lib/kotoshu/models/suggestion.rb +106 -0
data/lib/kotoshu/models/word_embedding.rb +107 -0
data/lib/kotoshu/paths.rb +53 -0
data/lib/kotoshu/personal_dictionary.rb +94 -0
data/lib/kotoshu/plugins/plugin.rb +61 -0
data/lib/kotoshu/plugins/registry.rb +120 -0
data/lib/kotoshu/project_config.rb +76 -0
data/lib/kotoshu/readers/aff_data.rb +356 -0
data/lib/kotoshu/readers/aff_reader.rb +375 -0
data/lib/kotoshu/readers/condition_checker.rb +142 -0
data/lib/kotoshu/readers/dic_reader.rb +118 -0
data/lib/kotoshu/readers/file_reader.rb +347 -0
data/lib/kotoshu/readers/lookup_builder.rb +299 -0
data/lib/kotoshu/readers/readers.rb +6 -0
data/lib/kotoshu/readers.rb +9 -0
data/lib/kotoshu/resource_bundle.rb +30 -0
data/lib/kotoshu/resource_manager.rb +295 -0
data/lib/kotoshu/results/result.rb +165 -0
data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
data/lib/kotoshu/source_registry.rb +74 -0
data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
data/lib/kotoshu/spellchecker.rb +298 -0
data/lib/kotoshu/string_metrics.rb +153 -0
data/lib/kotoshu/suggestions/context.rb +55 -0
data/lib/kotoshu/suggestions/generator.rb +175 -0
data/lib/kotoshu/suggestions/pipeline.rb +135 -0
data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
data/lib/kotoshu/suggestions/suggestion.rb +174 -0
data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
data/lib/kotoshu/version.rb +5 -0
data/lib/kotoshu.rb +493 -0
data/script/validate_all_dictionaries.rb +444 -0
data/sig/kotoshu.rbs +4 -0
data/test_oop.rb +79 -0
metadata +298 -0

data/lib/kotoshu/models/nearest_neighbor.rb ADDED Viewed

@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+require_relative 'word_embedding'
+module Kotoshu
+  module Models
+    # Value object for embedding search results (nearest neighbors).
+    #
+    # Represents a single suggestion from semantic similarity search,
+    # with similarity score and optional embedding reference.
+    #
+    # @example Creating a neighbor
+    #   neighbor = NearestNeighbor.new("hello", 0.85, embedding: emb)
+    #   neighbor.to_s  # => "hello [85%]"
+    class NearestNeighbor
+      attr_reader :word, :similarity, :distance, :embedding
+      # Create a new nearest neighbor result.
+      #
+      # @param word [String] The suggested word
+      # @param similarity [Float] Cosine similarity (0.0 to 1.0)
+      # @param embedding [WordEmbedding, nil] Optional embedding reference
+      def initialize(word, similarity, embedding: nil)
+        raise ArgumentError, "Similarity must be 0-1" unless similarity.between?(0.0, 1.0)
+        @word = word
+        @similarity = similarity
+        @distance = 1.0 - similarity
+        @embedding = embedding
+        freeze
+      end
+      # Comparison for sorting (higher similarity = better).
+      #
+      # @param other [NearestNeighbor] Another neighbor
+      # @return [Integer] Comparison result (-1, 0, 1)
+      def <=>(other)
+        return 0 unless other.is_a?(NearestNeighbor)
+        # Higher similarity = better rank (sort descending)
+        other.similarity <=> @similarity
+      end
+      # Check if this equals another neighbor.
+      #
+      # @param other [Object] Another object
+      # @return [Boolean] True if words match
+      def ==(other)
+        return false unless other.is_a?(NearestNeighbor)
+        @word == other.word
+      end
+      alias_method :eql?, :==
+      # Hash code for hash table usage.
+      #
+      # @return [Integer] Hash code
+      def hash
+        @word.hash
+      end
+      # String representation with percentage.
+      #
+      # @return [String] Human-readable representation
+      def to_s
+        "#{@word} [#{(@similarity * 100).to_i}%]"
+      end
+      alias_method :inspect, :to_s
+      # Check if this is a high-confidence suggestion.
+      #
+      # @return [Boolean] True if similarity > 0.8
+      def high_confidence?
+        @similarity > 0.8
+      end
+      # Get confidence level category.
+      #
+      # @return [Symbol] :high, :medium, or :low
+      def confidence_level
+        return :high if @similarity > 0.8
+        return :medium if @similarity > 0.5
+        :low
+      end
+    end
+  end
+end

data/lib/kotoshu/models/onnx_model.rb ADDED Viewed

@@ -0,0 +1,333 @@
+# frozen_string_literal: true
+module Kotoshu
+  module Models
+    # ONNX embedding model implementation.
+    #
+    # Loads FastText models converted to ONNX format for faster inference.
+    # Uses ONNX Runtime for efficient embedding lookup.
+    #
+    # @example Loading from file
+    #   model = OnnxModel.from_file('fasttext.en.onnx')
+    #   embedding = model.embedding_for('hello')
+    #
+    # @example Loading from GitHub (via ModelCache)
+    #   model = OnnxModel.from_github('en')
+    #   neighbors = model.nearest_neighbors('hello', k: 10)
+    class OnnxModel < EmbeddingModel
+      # Soft-load onnxruntime. The gem is intentionally NOT a hard runtime
+      # dependency — it fails to build on some platforms and would block
+      # install for users who only want traditional spell-checking. Semantic
+      # features light up automatically when the gem is present.
+      #
+      # KOTOSHU_NO_ONNX=1 forces semantic analysis off even when the gem is
+      # installed (useful for benchmarks / CI determinism).
+      ONNX_LOADED = begin
+        if ENV["KOTOSHU_NO_ONNX"] == "1"
+          false
+        else
+          require "onnxruntime"
+          true
+        end
+      rescue LoadError
+        false
+      end
+      # Error raised when semantic features are requested but onnxruntime
+      # is unavailable. Caller-friendly message points at the fix.
+      class OnnxUnavailable < Kotoshu::Error
+        def initialize(detail = nil)
+          message = "onnxruntime gem not loaded"
+          message += " (#{detail})" if detail
+          message += ". Install with: gem install onnxruntime"
+          message += ". Or set KOTOSHU_NO_ONNX=1 to silence this in code paths that opt out."
+          super(message)
+        end
+      end
+      # Default dimension for FastText models
+      DEFAULT_DIMENSION = 300
+      attr_reader :onnx_path, :vocabulary, :embedding_matrix
+      # Create a new ONNX model.
+      #
+      # @param language_code [String] ISO 639-1 language code
+      # @param dimension [Integer] Vector dimension
+      # @param onnx_path [String] Path to .onnx file
+      # @param vocabulary [Hash<String, Integer>] Word-to-index mapping
+      # @param embedding_matrix [Numo::SFloat] Pre-loaded embeddings (optional)
+      def initialize(language_code:, dimension: DEFAULT_DIMENSION, onnx_path:, vocabulary:, embedding_matrix: nil)
+        super(language_code: language_code, dimension: dimension)
+        @onnx_path = onnx_path
+        @vocabulary = vocabulary.freeze
+        @vocabulary_size = @vocabulary.size
+        # Pre-load embedding matrix if provided (for faster nearest neighbor search)
+        @embedding_matrix = embedding_matrix
+        # Lazy load session
+        @session = nil
+        @loaded = false
+      end
+      # Load ONNX model from a file.
+      #
+      # @param onnx_path [String] Path to .onnx file
+      # @param language_code [String] Language code (auto-detected from filename)
+      # @return [OnnxModel] Loaded model
+      # @raise [ArgumentError] if file doesn't exist
+      def self.from_file(onnx_path, language_code: nil)
+        raise ArgumentError, "File not found: #{onnx_path}" unless File.exist?(onnx_path)
+        # Detect language from filename if not provided
+        language_code ||= detect_language_from_path(onnx_path)
+        # Load vocabulary from .vocab.json file
+        vocab_path = onnx_path.sub('.onnx', '.vocab.json')
+        unless File.exist?(vocab_path)
+          raise ArgumentError, "Vocabulary file not found: #{vocab_path}"
+        end
+        require 'json'
+        vocabulary = JSON.parse(File.read(vocab_path))
+        # Load metadata
+        metadata_path = onnx_path.sub('.onnx', '.metadata.json')
+        dimension = DEFAULT_DIMENSION
+        if File.exist?(metadata_path)
+          metadata = JSON.parse(File.read(metadata_path))
+          dimension = metadata['dimension']
+        end
+        new(
+          language_code: language_code,
+          dimension: dimension,
+          onnx_path: onnx_path,
+          vocabulary: vocabulary
+        )
+      end
+      # Load ONNX model from GitHub (via ModelCache).
+      #
+      # Downloads the .onnx file from kotoshu/dictionaries repository.
+      #
+      # @param language_code [String] ISO 639-1 language code (de, en, es, fr, pt, ru)
+      # @param cache [ModelCache, nil] Optional cache instance
+      # @return [OnnxModel] Loaded model
+      # @raise [ArgumentError] if language not supported
+      def self.from_github(language_code, cache: nil)
+        require_relative '../cache/model_cache'
+        cache ||= Cache::ModelCache.new
+        # Get the .onnx file path from cache
+        onnx_file = cache.get_onnx_model(language_code)
+        from_file(onnx_file, language_code: language_code)
+      end
+      # Get embedding vector for a word.
+      #
+      # @param word [String] The word to lookup
+      # @return [WordEmbedding, nil] Embedding vector or nil if not found
+      def embedding_for(word)
+        return nil if word.nil? || word.empty?
+        index = @vocabulary[word]
+        return nil unless index
+        # Get embedding from ONNX model
+        vector = get_embedding_vector(index)
+        WordEmbedding.new(word, vector, @language_code, dimension: @dimension)
+      end
+      # Get the vocabulary (all words in the model).
+      #
+      # @return [Array<String>] Vocabulary words
+      def vocabulary
+        @vocabulary.keys
+      end
+      # Check if model is loaded.
+      #
+      # @return [Boolean] True if ONNX session is loaded
+      def loaded?
+        @loaded
+      end
+      # Find k nearest neighbors for a word.
+      #
+      # @param word [String] The query word
+      # @param k [Integer] Number of neighbors to return
+      # @return [Array<NearestNeighbor>] Nearest neighbors sorted by similarity
+      def nearest_neighbors(word, k: 10)
+        ensure_session_loaded
+        # Get query embedding
+        query = embedding_for(word)
+        return [] unless query
+        # If embedding matrix is pre-loaded, use it for faster search
+        if @embedding_matrix
+          nearest_neighbors_from_matrix(query, k)
+        else
+          super
+        end
+      end
+      # Batch lookup of embeddings for multiple words.
+      #
+      # More efficient than individual lookups when using ONNX.
+      #
+      # @param words [Array<String>] Words to lookup
+      # @return [Hash<String, WordEmbedding>] Word to embedding mapping
+      def batch_embeddings(words)
+        ensure_session_loaded
+        indices = words.map { |w| @vocabulary[w] }
+        vectors = batch_get_embeddings(indices)
+        words.zip(indices, vectors).each_with_object({}) do |(word, idx, vec)|
+          next unless idx && vec
+          [word, WordEmbedding.new(word, vec, @language_code, dimension: @dimension)]
+        end
+      end
+      # Preload the embedding matrix into memory for faster nearest neighbor search.
+      #
+      # Useful when doing many nearest neighbor queries.
+      #
+      # @return [Boolean] True if loaded successfully
+      def preload_embedding_matrix
+        ensure_session_loaded
+        # Get all embeddings at once
+        all_indices = (0...@vocabulary_size).to_a
+        vectors = batch_get_embeddings(all_indices)
+        # Convert to matrix (using Numo::SFloat for efficiency)
+        require 'numo/narray'
+        @embedding_matrix = Numo::Sfloat.cast(vectors).reshape(@vocabulary_size, @dimension)
+        true
+      rescue StandardError => e
+        warn "Failed to preload embedding matrix: #{e.message}"
+        false
+      end
+      private
+      # Get embedding vector from ONNX model.
+      #
+      # @param index [Integer] Word index
+      # @return [Array<Float>] Embedding vector
+      def get_embedding_vector(index)
+        ensure_session_loaded
+        result = @session.run(
+          ['embeddings'],
+          { word_indices: [index].pack('q<') }  # Pack int64 as little-endian
+        )
+        # Unpack float32 array
+        result.first.unpack('e*')
+      end
+      # Get embeddings for multiple indices.
+      #
+      # @param indices [Array<Integer>] Word indices
+      # @return [Array<Array<Float>>] Embedding vectors
+      def batch_get_embeddings(indices)
+        ensure_session_loaded
+        valid_indices = indices.compact
+        return [] if valid_indices.empty?
+        # Pack indices as int64 array
+        input_data = valid_indices.pack('q<*')
+        result = @session.run(
+          ['embeddings'],
+          { word_indices: input_data }
+        )
+        # Unpack float32 matrix
+        vectors = result.first.unpack('e*')
+        chunk_size = @dimension
+        vectors.each_slice(chunk_size).to_a
+      end
+      # Find nearest neighbors using pre-loaded embedding matrix.
+      #
+      # @param query [WordEmbedding] Query embedding
+      # @param k [Integer] Number of neighbors
+      # @return [Array<NearestNeighbor>] Nearest neighbors
+      def nearest_neighbors_from_matrix(query, k)
+        return [] unless @embedding_matrix
+        # Compute cosine similarity with all words
+        query_vec = Numo::Sfloat.cast(query.vector)
+        similarities = []
+        @vocabulary.each_with_index do |(word, idx)|
+          vec = @embedding_matrix[idx, true]
+          sim = cosine_similarity(query_vec, vec)
+          similarities << [word, sim]
+        end
+        # Sort by similarity and take top k
+        similarities.sort_by { |_, s| -s }.first(k).map do |word, sim|
+          NearestNeighbor.new(
+            word: word,
+            similarity: sim,
+            embedding: embedding_for(word)
+          )
+        end
+      end
+      # Calculate cosine similarity between two vectors.
+      #
+      # @param vec1 [Numo::SFloat] First vector
+      # @param vec2 [Numo::SFloat] Second vector
+      # @return [Float] Cosine similarity
+      def cosine_similarity(vec1, vec2)
+        dot = (vec1 * vec2).sum
+        norm1 = Math.sqrt((vec1 ** 2).sum)
+        norm2 = Math.sqrt((vec2 ** 2).sum)
+        return 0.0 if norm1.zero? || norm2.zero?
+        dot / (norm1 * norm2)
+      end
+      # Ensure ONNX session is loaded.
+      def ensure_session_loaded
+        return if @loaded
+        raise OnnxUnavailable unless ONNX_LOADED
+        @session = OnnxRuntime::Session.new(@onnx_path)
+        @loaded = true
+      end
+      # Detect language code from file path.
+      #
+      # @param path [String] File path
+      # @return [String] Detected language code
+      def self.detect_language_from_path(path)
+        # Extract from path like "fasttext.en.onnx"
+        if path =~ /\.([a-z]{2})\./i
+          Regexp.last_match(1).downcase
+        else
+          'en'  # Default to English
+        end
+      end
+    end
+  end
+end

data/lib/kotoshu/models/semantic_error.rb ADDED Viewed

@@ -0,0 +1,165 @@
+# frozen_string_literal: true
+require_relative 'context'
+require_relative 'suggestion'
+module Kotoshu
+  module Models
+    # Unified semantic error (NO artificial spelling/grammar split!).
+    #
+    # Represents ANY kind of language error detected through semantic analysis.
+    # Uses semantic categories instead of traditional "spelling" vs "grammar" labels.
+    #
+    # Error types (semantic categories):
+    # - :word_choice - Wrong word for context (e.g., "desert" vs "dessert")
+    # - :verb_agreement - Subject-verb mismatch (e.g., "they is" → "they are")
+    # - :tense - Temporal inconsistency (e.g., "Yesterday I will go")
+    # - :orthographic - Actual typo/misspelling (e.g., "wrold" → "world")
+    # - :preposition - Wrong preposition (e.g., "bored of" → "bored with")
+    # - :article - Wrong article (e.g., "a apple" → "an apple")
+    # - :morphology - Wrong word form (e.g., "goed" → "went")
+    # - :capitalization - Capitalization error (e.g., "i am" → "I am")
+    # - :punctuation - Punctuation error (e.g., "its" vs "it's")
+    # - :style - Style/usage suggestion
+    #
+    # @example Creating a semantic error
+    #   error = SemanticError.new(
+    #     id: "error_1",
+    #     location: Location.new(line: 5, column: 12),
+    #     original: "desert",
+    #     suggestions: [Suggestion.new("dessert", confidence: 0.92)],
+    #     error_type: :word_choice,
+    #     confidence: 0.92,
+    #     context: context
+    #   )
+    class SemanticError
+      # Error type definitions with display names
+      ERROR_TYPES = {
+        word_choice: 'Word Choice',
+        verb_agreement: 'Verb Agreement',
+        tense: 'Tense',
+        orthographic: 'Spelling',
+        preposition: 'Preposition',
+        article: 'Article',
+        morphology: 'Word Form',
+        capitalization: 'Capitalization',
+        punctuation: 'Punctuation',
+        style: 'Style'
+      }.freeze
+      attr_reader :id, :location, :original, :suggestions, :error_type, :confidence, :context
+      # Create a new semantic error.
+      #
+      # Suggestions are stored best-first; suggestions with equal confidence
+      # keep the order in which they were passed. The instance is frozen once
+      # all fields are set.
+      #
+      # @param id [String, Symbol] Unique identifier for this error (stored as String)
+      # @param location [Documents::Location] Location of error in document
+      # @param original [String] The original (incorrect) word/text
+      # @param suggestions [Array<Suggestion>] Suggested corrections
+      # @param error_type [Symbol] Error type (must be in ERROR_TYPES)
+      # @param confidence [Numeric] Confidence score (0.0 to 1.0)
+      # @param context [Context] Context around the error
+      # @raise [ArgumentError] if error_type is invalid, confidence is not a
+      #   number within 0.0..1.0, or suggestions is nil/empty
+      def initialize(id:, location:, original:, suggestions:, error_type:, confidence:, context:)
+        raise ArgumentError, "Invalid error type: #{error_type}" unless ERROR_TYPES.key?(error_type)
+        # Range#cover? answers false for nil, NaN and other non-comparable
+        # values, so bad input raises ArgumentError instead of NoMethodError.
+        raise ArgumentError, "Confidence must be 0-1" unless (0.0..1.0).cover?(confidence)
+        raise ArgumentError, "Suggestions cannot be empty" if suggestions.nil? || suggestions.empty?
+        @id = id.to_s
+        @location = location
+        @original = original
+        # The position is part of the sort key so that ties stay in caller order.
+        @suggestions = suggestions.each_with_index
+                                  .sort_by { |suggestion, position| [-suggestion.confidence, position] }
+                                  .map(&:first)
+                                  .freeze
+        @error_type = error_type
+        @confidence = confidence
+        @context = context
+        freeze
+      end
+      # Get user-friendly display type name.
+      #
+      # @return [String] Display type name
+      def display_type
+        ERROR_TYPES[@error_type] || @error_type.to_s.capitalize
+      end
+      # Check if this is a high-confidence error.
+      #
+      # @return [Boolean] True if confidence > 0.8
+      def high_confidence?
+        @confidence > 0.8
+      end
+      # Get confidence level category.
+      #
+      # @return [Symbol] :high (> 0.8), :medium (> 0.5), or :low
+      def confidence_level
+        return :high if high_confidence?
+        return :medium if @confidence > 0.5
+        :low
+      end
+      # Get the recommended (top) suggestion.
+      #
+      # @return [Suggestion] The highest-confidence suggestion
+      def recommended_suggestion
+        @suggestions.first
+      end
+      # Check if this error equals another.
+      #
+      # @param other [Object] Another object
+      # @return [Boolean] True if IDs match
+      def ==(other)
+        return false unless other.is_a?(SemanticError)
+        @id == other.id
+      end
+      alias_method :eql?, :==
+      # Hash code for hash table usage (consistent with #==: id only).
+      #
+      # @return [Integer] Hash code
+      def hash
+        @id.hash
+      end
+      # Comparison for sorting (by location, then confidence).
+      #
+      # Errors are sorted by:
+      # 1. Document location (as ordered by the location's own <=>)
+      # 2. Confidence (highest first)
+      #
+      # Objects that are not a SemanticError compare as equal (0).
+      #
+      # @param other [SemanticError] Another error
+      # @return [Integer, nil] Comparison result (-1, 0, 1), or nil when the
+      #   two locations cannot be compared
+      def <=>(other)
+        return 0 unless other.is_a?(SemanticError)
+        # First by location; a nil result (incomparable locations) is passed
+        # through instead of crashing on nil.zero?.
+        loc_cmp = @location <=> other.location
+        return loc_cmp if loc_cmp.nil? || loc_cmp.nonzero?
+        # Then by confidence (highest first)
+        other.confidence <=> @confidence
+      end
+      # String representation.
+      #
+      # @return [String] Human-readable representation
+      def to_s
+        "#{@location}: '#{@original}' → #{recommended_suggestion.word} [#{confidence_percent}%]"
+      end
+      alias_method :inspect, :to_s
+      # Create an abbreviated display for lists.
+      #
+      # Lines longer than max_length are cut and end with an ellipsis, so for
+      # max_length >= 1 the result never exceeds max_length characters.
+      #
+      # @param max_length [Integer] Maximum line length
+      # @return [String] Abbreviated representation
+      def abbreviated(max_length: 80)
+        orig_display = "'#{@original}'"
+        sugg_display = "'#{recommended_suggestion.word}'"
+        line = "#{@location}: #{orig_display} → #{sugg_display} [#{confidence_percent}%]"
+        return line if line.length <= max_length
+        # Reserve one character for the ellipsis marking the cut.
+        kept = [max_length - 1, 0].max
+        "#{line[0, kept]}…"
+      end
+      private
+      # Confidence as a whole percentage. Rounded rather than truncated:
+      # 0.29 * 100 is 28.999999999999996 in floating point and must be shown
+      # as 29, not 28.
+      #
+      # @return [Integer] Percentage between 0 and 100
+      def confidence_percent
+        (@confidence * 100).round
+      end
+    end
+  end
+end

data/lib/kotoshu/models/suggestion.rb ADDED Viewed

@@ -0,0 +1,106 @@
+# frozen_string_literal: true
+module Kotoshu
+  module Models
+    # Value object for correction suggestions.
+    #
+    # Represents a suggested correction for a detected error,
+    # with confidence score and metadata.
+    #
+    # @example Creating a suggestion
+    #   suggestion = Suggestion.new("dessert", confidence: 0.92, source: :semantic)
+    #   suggestion.to_s  # => "dessert [92%]"
+    class Suggestion
+      attr_reader :word, :confidence, :source, :metadata
+      # Create a new suggestion.
+      #
+      # @param word [String] The suggested word
+      # @param confidence [Float] Confidence score (0.0 to 1.0)
+      # @param source [Symbol, nil] Source of the suggestion (e.g., :semantic, :edit_distance)
+      # @param metadata [Hash] Additional metadata (optional)
+      # @option metadata [WordEmbedding, nil] :embedding The word embedding
+      # @option metadata [Float] :edit_distance Edit distance score
+      # @option metadata [Float] :frequency_bonus Frequency score bonus
+      # @option metadata [String] :explanation Explanation for the suggestion
+      def initialize(word, confidence:, source: nil, metadata: {})
+        raise ArgumentError, "Confidence must be 0-1" unless confidence.between?(0.0, 1.0)
+        @word = word
+        @confidence = confidence
+        @source = source || :unknown
+        @metadata = metadata.freeze
+        freeze
+      end
+      # Comparison for sorting (higher confidence = better).
+      #
+      # @param other [Suggestion] Another suggestion
+      # @return [Integer] Comparison result (-1, 0, 1)
+      def <=>(other)
+        return 0 unless other.is_a?(Suggestion)
+        # Higher confidence = better rank (sort descending)
+        other.confidence <=> @confidence
+      end
+      # Check if this equals another suggestion.
+      #
+      # @param other [Object] Another object
+      # @return [Boolean] True if words match
+      def ==(other)
+        return false unless other.is_a?(Suggestion)
+        @word == other.word
+      end
+      alias_method :eql?, :==
+      # Hash code for hash table usage.
+      #
+      # @return [Integer] Hash code
+      def hash
+        @word.hash
+      end
+      # String representation with percentage.
+      #
+      # @return [String] Human-readable representation
+      def to_s
+        if @source && @source != :unknown
+          "#{@word} [#{(@confidence * 100).to_i}%] (#{@source})"
+        else
+          "#{@word} [#{(@confidence * 100).to_i}%]"
+        end
+      end
+      alias_method :inspect, :to_s
+      # Get the embedding if available.
+      #
+      # @return [WordEmbedding, nil] The embedding or nil
+      def embedding
+        @metadata[:embedding]
+      end
+      # Get the edit distance if available.
+      #
+      # @return [Float, nil] Edit distance or nil
+      def edit_distance
+        @metadata[:edit_distance]
+      end
+      # Check if this is a high-confidence suggestion.
+      #
+      # @return [Boolean] True if confidence > 0.8
+      def high_confidence?
+        @confidence > 0.8
+      end
+      # Get explanation text if available.
+      #
+      # @return [String, nil] Explanation or nil
+      def explanation
+        @metadata[:explanation]
+      end
+    end
+  end
+end