RubyGems - kotoshu - Versions diffs - 0.3.0 - Mend

kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (210) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.rubocop.yml +18 -0
data/CHANGELOG.md +182 -0
data/CLAUDE.md +172 -0
data/CODE_OF_CONDUCT.md +132 -0
data/LICENSE +31 -0
data/README.adoc +955 -0
data/Rakefile +12 -0
data/SECURITY.md +93 -0
data/examples/01_basic_word_checking.rb +38 -0
data/examples/02_text_document_checking.rb +77 -0
data/examples/03_dictionary_backends.rb +137 -0
data/examples/04_trie_data_structure.rb +146 -0
data/examples/05_suggestion_algorithms.rb +239 -0
data/examples/06_configuration_advanced.rb +287 -0
data/examples/07_multi_language_dictionaries.rb +278 -0
data/exe/kotoshu +6 -0
data/lib/kotoshu/algorithms/capitalization.rb +276 -0
data/lib/kotoshu/algorithms/lookup.rb +876 -0
data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
data/lib/kotoshu/algorithms/permutations.rb +283 -0
data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
data/lib/kotoshu/algorithms/suggest.rb +575 -0
data/lib/kotoshu/algorithms.rb +14 -0
data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
data/lib/kotoshu/cache/base_cache.rb +596 -0
data/lib/kotoshu/cache/cache.rb +91 -0
data/lib/kotoshu/cache/frequency_cache.rb +224 -0
data/lib/kotoshu/cache/language_cache.rb +454 -0
data/lib/kotoshu/cache/lookup_cache.rb +166 -0
data/lib/kotoshu/cache/model_cache.rb +513 -0
data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
data/lib/kotoshu/cache.rb +40 -0
data/lib/kotoshu/cli/auto_setup.rb +71 -0
data/lib/kotoshu/cli/batch_reporter.rb +315 -0
data/lib/kotoshu/cli/cache_command.rb +356 -0
data/lib/kotoshu/cli/display_formatter.rb +431 -0
data/lib/kotoshu/cli/errors.rb +36 -0
data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
data/lib/kotoshu/cli/language_resolver.rb +91 -0
data/lib/kotoshu/cli/navigation_manager.rb +272 -0
data/lib/kotoshu/cli/progress_reporter.rb +114 -0
data/lib/kotoshu/cli/status_report.rb +130 -0
data/lib/kotoshu/cli.rb +627 -0
data/lib/kotoshu/commands/cache_command.rb +424 -0
data/lib/kotoshu/commands/check_command.rb +312 -0
data/lib/kotoshu/commands/model_command.rb +295 -0
data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
data/lib/kotoshu/components/pos_tagger.rb +98 -0
data/lib/kotoshu/components/spell_checker.rb +73 -0
data/lib/kotoshu/components/synthesizer.rb +60 -0
data/lib/kotoshu/components/tokenizer.rb +58 -0
data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
data/lib/kotoshu/configuration/builder.rb +209 -0
data/lib/kotoshu/configuration/resolver.rb +124 -0
data/lib/kotoshu/configuration.rb +702 -0
data/lib/kotoshu/core/exceptions.rb +165 -0
data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
data/lib/kotoshu/core/models/affix_rule.rb +260 -0
data/lib/kotoshu/core/models/result/document_result.rb +263 -0
data/lib/kotoshu/core/models/result/word_result.rb +203 -0
data/lib/kotoshu/core/models/word.rb +142 -0
data/lib/kotoshu/core/trie/builder.rb +119 -0
data/lib/kotoshu/core/trie/node.rb +94 -0
data/lib/kotoshu/core/trie/trie.rb +249 -0
data/lib/kotoshu/core.rb +28 -0
data/lib/kotoshu/data/common_words/de.yml +1800 -0
data/lib/kotoshu/data/common_words/en.yml +1215 -0
data/lib/kotoshu/data/common_words/es.yml +750 -0
data/lib/kotoshu/data/common_words/fr.yml +1015 -0
data/lib/kotoshu/data/common_words/pt.yml +870 -0
data/lib/kotoshu/data/common_words/ru.yml +484 -0
data/lib/kotoshu/data/common_words_loader.rb +152 -0
data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
data/lib/kotoshu/debug_logger.rb +146 -0
data/lib/kotoshu/debug_mode.rb +134 -0
data/lib/kotoshu/defaults.rb +86 -0
data/lib/kotoshu/dictionaries/catalog.rb +817 -0
data/lib/kotoshu/dictionary/base.rb +237 -0
data/lib/kotoshu/dictionary/cspell.rb +254 -0
data/lib/kotoshu/dictionary/custom.rb +224 -0
data/lib/kotoshu/dictionary/hunspell.rb +526 -0
data/lib/kotoshu/dictionary/plain_text.rb +282 -0
data/lib/kotoshu/dictionary/repository.rb +248 -0
data/lib/kotoshu/dictionary/unified.rb +260 -0
data/lib/kotoshu/dictionary/unix_words.rb +218 -0
data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
data/lib/kotoshu/documents/document.rb +229 -0
data/lib/kotoshu/documents/location.rb +139 -0
data/lib/kotoshu/documents/markdown_document.rb +389 -0
data/lib/kotoshu/documents/plain_text_document.rb +147 -0
data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
data/lib/kotoshu/embeddings/protocol.rb +83 -0
data/lib/kotoshu/embeddings/protocols.rb +17 -0
data/lib/kotoshu/embeddings/registry.rb +182 -0
data/lib/kotoshu/embeddings/search.rb +192 -0
data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
data/lib/kotoshu/embeddings.rb +97 -0
data/lib/kotoshu/fluent_checker.rb +91 -0
data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
data/lib/kotoshu/grammar/rule.rb +95 -0
data/lib/kotoshu/grammar/rule_engine.rb +111 -0
data/lib/kotoshu/grammar/rule_loader.rb +31 -0
data/lib/kotoshu/grammar.rb +18 -0
data/lib/kotoshu/integrity/audit_log.rb +88 -0
data/lib/kotoshu/integrity/manifest.rb +117 -0
data/lib/kotoshu/integrity/net_http.rb +46 -0
data/lib/kotoshu/integrity.rb +25 -0
data/lib/kotoshu/keyboard/layout.rb +115 -0
data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
data/lib/kotoshu/keyboard/registry.rb +146 -0
data/lib/kotoshu/keyboard.rb +60 -0
data/lib/kotoshu/language/detector.rb +242 -0
data/lib/kotoshu/language/identifier.rb +378 -0
data/lib/kotoshu/language/languages/base.rb +256 -0
data/lib/kotoshu/language/normalizer/base.rb +137 -0
data/lib/kotoshu/language/registry.rb +147 -0
data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
data/lib/kotoshu/language/tokenizer/base.rb +170 -0
data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
data/lib/kotoshu/language.rb +99 -0
data/lib/kotoshu/languages/de/language.rb +546 -0
data/lib/kotoshu/languages/en/language.rb +448 -0
data/lib/kotoshu/languages/es/language.rb +459 -0
data/lib/kotoshu/languages/fr/language.rb +493 -0
data/lib/kotoshu/languages/ja/language.rb +477 -0
data/lib/kotoshu/languages/pt/language.rb +423 -0
data/lib/kotoshu/languages/ru/language.rb +404 -0
data/lib/kotoshu/languages.rb +43 -0
data/lib/kotoshu/metrics_collector.rb +222 -0
data/lib/kotoshu/metrics_module.rb +110 -0
data/lib/kotoshu/models/context.rb +119 -0
data/lib/kotoshu/models/embedding_model.rb +182 -0
data/lib/kotoshu/models/fasttext_model.rb +220 -0
data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
data/lib/kotoshu/models/onnx_model.rb +333 -0
data/lib/kotoshu/models/semantic_error.rb +165 -0
data/lib/kotoshu/models/suggestion.rb +106 -0
data/lib/kotoshu/models/word_embedding.rb +107 -0
data/lib/kotoshu/paths.rb +53 -0
data/lib/kotoshu/personal_dictionary.rb +94 -0
data/lib/kotoshu/plugins/plugin.rb +61 -0
data/lib/kotoshu/plugins/registry.rb +120 -0
data/lib/kotoshu/project_config.rb +76 -0
data/lib/kotoshu/readers/aff_data.rb +356 -0
data/lib/kotoshu/readers/aff_reader.rb +375 -0
data/lib/kotoshu/readers/condition_checker.rb +142 -0
data/lib/kotoshu/readers/dic_reader.rb +118 -0
data/lib/kotoshu/readers/file_reader.rb +347 -0
data/lib/kotoshu/readers/lookup_builder.rb +299 -0
data/lib/kotoshu/readers/readers.rb +6 -0
data/lib/kotoshu/readers.rb +9 -0
data/lib/kotoshu/resource_bundle.rb +30 -0
data/lib/kotoshu/resource_manager.rb +295 -0
data/lib/kotoshu/results/result.rb +165 -0
data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
data/lib/kotoshu/source_registry.rb +74 -0
data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
data/lib/kotoshu/spellchecker.rb +298 -0
data/lib/kotoshu/string_metrics.rb +153 -0
data/lib/kotoshu/suggestions/context.rb +55 -0
data/lib/kotoshu/suggestions/generator.rb +175 -0
data/lib/kotoshu/suggestions/pipeline.rb +135 -0
data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
data/lib/kotoshu/suggestions/suggestion.rb +174 -0
data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
data/lib/kotoshu/version.rb +5 -0
data/lib/kotoshu.rb +493 -0
data/script/validate_all_dictionaries.rb +444 -0
data/sig/kotoshu.rbs +4 -0
data/test_oop.rb +79 -0
metadata +298 -0

data/lib/kotoshu/core/exceptions.rb ADDED Viewed

@@ -0,0 +1,165 @@
+# frozen_string_literal: true
+module Kotoshu
+  # Root of the Kotoshu exception hierarchy.
+  #
+  # Every other error class declared in this file inherits from it, so
+  # `rescue Kotoshu::Error` catches any of them.
+  #
+  # @example Raising a generic Kotoshu error
+  #   raise Kotoshu::Error, "Something went wrong"
+  class Error < StandardError
+  end
+  # Error raised when a dictionary file cannot be found.
+  #
+  # The first argument is the missing path, not the message; the message
+  # is derived from the path unless one is supplied explicitly.
+  #
+  # @example Dictionary not found (default message)
+  #   raise DictionaryNotFoundError, "/path/to/dic.dic"
+  #   # message => "Dictionary not found: /path/to/dic.dic"
+  #
+  # @example Dictionary not found (custom message)
+  #   raise DictionaryNotFoundError.new("/path/to/dic.dic", "No English dictionary installed")
+  class DictionaryNotFoundError < Error
+    # @return [String] The path that was not found
+    attr_reader :path
+    # Build the error for a missing dictionary path.
+    #
+    # @param path [String] The path that was not found
+    # @param message [String, nil] Custom message; when omitted the message
+    #   is "Dictionary not found: <path>"
+    def initialize(path, message = nil)
+      text = message
+      text ||= "Dictionary not found: #{path}"
+      super(text)
+      @path = path
+    end
+  end
+  # Error raised when a dictionary file has an invalid format.
+  #
+  # The first argument is the file path; the message is assembled from the
+  # path and the optional details.
+  #
+  # @example Invalid dictionary format
+  #   raise InvalidDictionaryFormatError.new("/path/to/words.dic", "missing word count")
+  #   # message => "Invalid dictionary format: missing word count: /path/to/words.dic"
+  class InvalidDictionaryFormatError < Error
+    # @return [String] The file path
+    attr_reader :path
+    # @return [String, nil] Details about the format issue
+    attr_reader :details
+    # Build the error for a malformed dictionary file.
+    #
+    # @param path [String] The file path
+    # @param details [String, nil] Details about the format issue; left out
+    #   of the message when not given
+    def initialize(path, details = nil)
+      detail_part = details ? ": #{details}" : ""
+      super("Invalid dictionary format#{detail_part}: #{path}")
+      @path = path
+      @details = details
+    end
+  end
+  # Error raised when there is a configuration issue.
+  #
+  # @example Invalid configuration
+  #   raise ConfigurationError, "Invalid dictionary type: unknown_type"
+  #
+  # @example Recording the offending key
+  #   raise ConfigurationError.new("Invalid dictionary type: unknown_type", key: :dictionary_type)
+  class ConfigurationError < Error
+    # @return [String, Symbol, nil] The configuration key, if one was given
+    attr_reader :key
+    # Build the error, optionally remembering the configuration key involved.
+    #
+    # @param message [String] The error message
+    # @param key [String, Symbol, nil] The configuration key (optional)
+    def initialize(message, key: nil)
+      super(message)
+      @key = key
+    end
+  end
+  # Error raised during spell checking operations.
+  #
+  # @example Spell check failure
+  #   raise SpellcheckError, "Failed to check word: encoding error"
+  #
+  # @example Recording the word being checked
+  #   raise SpellcheckError.new("Failed to check word: encoding error", word: "caf\xE9")
+  class SpellcheckError < Error
+    # @return [String, nil] The word being checked, if one was given
+    attr_reader :word
+    # Build the error, optionally remembering the word that was being checked.
+    #
+    # @param message [String] The error message
+    # @param word [String, nil] The word being checked (optional)
+    def initialize(message, word: nil)
+      super(message)
+      @word = word
+    end
+  end
+  # Error raised when an affix rule cannot be parsed.
+  #
+  # @example Invalid affix rule
+  #   raise AffixRuleError, "Invalid affix rule: PFX A Y 1 re"
+  #
+  # @example Recording the rule text
+  #   raise AffixRuleError.new("Invalid affix rule", rule: "PFX A Y 1 re")
+  class AffixRuleError < Error
+    # @return [String, nil] The rule that failed to parse, if one was given
+    attr_reader :rule
+    # Build the error, optionally remembering the rule text that failed.
+    #
+    # @param message [String] The error message
+    # @param rule [String, nil] The rule that failed to parse (optional)
+    def initialize(message, rule: nil)
+      super(message)
+      @rule = rule
+    end
+  end
+  # Error raised when a required resource is not cached and cannot be
+  # downloaded (offline mode or network failure).
+  #
+  # The message names the missing resource and tells the user how to
+  # pre-fetch it or leave offline mode.
+  class ResourceNotCachedError < Error
+    # @return [Object] The language the resource belongs to, as passed in
+    attr_reader :language
+    # @return [Object] The kind of resource that is missing, as passed in
+    attr_reader :resource_type
+    # Build the error for a resource that is missing from the cache.
+    #
+    # @param language [#to_s] Language identifier, interpolated into the message
+    # @param resource_type [#to_s] Resource kind, interpolated into the message
+    def initialize(language, resource_type)
+      hint = "Pre-fetch with `kotoshu cache download language #{language}` " \
+             "or disable offline mode (KOTOSHU_OFFLINE=0)."
+      super("Resource not cached: #{language}:#{resource_type}. #{hint}")
+      @language = language
+      @resource_type = resource_type
+    end
+  end
+  # Error raised by the hot path (Kotoshu.correct?, .suggest, .check,
+  # .check_file, .spellchecker_for) when a language hasn't been set up
+  # via Kotoshu.setup / kotoshu setup. The hot path is cache-only and
+  # never downloads — explicit setup is required.
+  #
+  # The message tells the user which setup command to run.
+  class ResourceNotSetupError < Error
+    # @return [Object] The language that has not been set up, as passed in
+    attr_reader :language
+    # @return [Object] The kind of resource that is missing, as passed in
+    attr_reader :resource_type
+    # Build the error for a language that has not been set up.
+    #
+    # @param language [#to_s] Language identifier, interpolated into the message
+    # @param resource_type [#to_s] Missing resource kind (defaults to "spelling")
+    def initialize(language, resource_type = "spelling")
+      remedy = "Run `kotoshu setup #{language}` or " \
+               "`Kotoshu.setup(:#{language})` first."
+      super("Language '#{language}' is not set up (missing #{resource_type}). #{remedy}")
+      @language = language
+      @resource_type = resource_type
+    end
+  end
+  # Error raised when a resource cannot be resolved for a language
+  # (unsupported language, download failure, etc.).
+  #
+  # Both constructor arguments are kept on the exception so callers can
+  # inspect them without parsing the message.
+  class ResourceResolutionError < Error
+    # @return [Object] The language whose resources could not be resolved
+    attr_reader :language
+    # @return [Object] Why resolution failed, as passed to the constructor
+    attr_reader :reason
+    # Build the error for a failed resource resolution.
+    #
+    # @param language [#to_s] Language identifier, interpolated into the message
+    # @param reason [#to_s] Explanation of the failure, interpolated into the message
+    def initialize(language, reason)
+      @language = language
+      # Previously only interpolated into the message and then discarded.
+      @reason = reason
+      super("Cannot resolve resources for '#{language}': #{reason}")
+    end
+  end
+  # Error raised when a downloaded resource fails integrity verification
+  # (SHA-256 mismatch against manifest, truncated content, etc.).
+  #
+  # The downloaded bytes are never trusted until verified against a known
+  # manifest entry. Mismatch raises this error with both hashes so the
+  # caller can surface them in audit logs and CI output.
+  class IntegrityError < Error
+    # @return [Object] Identifier of the resource that failed verification
+    attr_reader :resource_id
+    # @return [Object] The expected digest, as passed in
+    attr_reader :expected
+    # @return [Object] The digest actually computed, as passed in
+    attr_reader :actual
+    # @return [Object, nil] The download URL, when one was given
+    attr_reader :url
+    # Build the error; the message lists both digests and, if given, the URL.
+    #
+    # @param resource_id [#to_s] Identifier of the resource
+    # @param expected [#to_s] Expected digest
+    # @param actual [#to_s] Digest of the downloaded content
+    # @param url [#to_s, nil] Download URL (optional)
+    def initialize(resource_id, expected:, actual:, url: nil)
+      location = url ? " (url: #{url})" : ""
+      super("Integrity verification failed for #{resource_id}: " \
+            "expected sha256=#{expected}, got sha256=#{actual}#{location}")
+      @resource_id = resource_id
+      @expected = expected
+      @actual = actual
+      @url = url
+    end
+  end
+end

data/lib/kotoshu/core/indexed_dictionary.rb ADDED Viewed

@@ -0,0 +1,291 @@
+# frozen_string_literal: true
+module Kotoshu
+  module Core
+    # In-memory word list with several lookup indexes.
+    #
+    # Every added word is stored as an entry hash
+    # (`{ word:, index:, metadata: }`) in insertion order and registered in:
+    # - an exact (case-sensitive) index: word => entry positions
+    # - a lowercase (case-insensitive) index: word.downcase => entry positions
+    # - a prefix index: prefix => words (every prefix, the whole word included)
+    # - a suffix index: suffix => words (every suffix, the whole word included)
+    #
+    # A `:flag` index is reserved but nothing in this class writes to it.
+    # Duplicates are kept: adding the same word twice yields two entries.
+    class IndexedDictionary
+      # @return [Array<Hash>] Entries (`{ word:, index:, metadata: }`) in insertion order
+      attr_reader :words
+      # @return [Integer] Number of entries, duplicates included
+      attr_reader :size
+      # Create indexed dictionary from a file.
+      #
+      # Reads one word per line; empty lines and lines starting with "#"
+      # are skipped.
+      #
+      # @param path [String] Path to word list file
+      # @return [IndexedDictionary] New dictionary
+      def self.from_file(path)
+        lines = File.foreach(path, chomp: true)
+        new(lines.reject { |line| line.empty? || line.start_with?("#") })
+      end
+      # Create indexed dictionary from a Trie.
+      #
+      # @param trie [#all_words] The trie to convert
+      # @return [IndexedDictionary] New dictionary
+      def self.from_trie(trie)
+        new(trie.all_words)
+      end
+      # @param words [Array<String>] Initial words to add
+      def initialize(words = [])
+        @words = []
+        @indexes = {
+          exact: {},              # case_sensitive: word => [positions]
+          lowercase: {},          # case_insensitive: word.downcase => [positions]
+          prefix: {},             # prefix => [words]
+          suffix: {},             # suffix => [words]
+          flag: {}                # flag => [words] (future: for Hunspell)
+        }
+        @size = 0
+        words.each { |word| add_word(word) }
+      end
+      # Add a word to the dictionary with optional metadata.
+      #
+      # @param word [String] The word to add
+      # @param metadata [Hash] Optional metadata associated with the word
+      # @return [IndexedDictionary] Self for chaining
+      def add_word(word, metadata = {})
+        position = @size
+        @words << { word: word, index: position, metadata: metadata }
+        @size += 1
+        (@indexes[:exact][word] ||= []) << position
+        (@indexes[:lowercase][word.downcase] ||= []) << position
+        index_affixes(word)
+        self
+      end
+      alias << add_word
+      # Add multiple words.
+      #
+      # @param new_words [Array<String>] Words to add
+      # @return [IndexedDictionary] Self for chaining
+      def add_words(new_words)
+        new_words.each { |word| add_word(word) }
+        self
+      end
+      # Check if a word exists (case-sensitive).
+      #
+      # @param word [String] The word to check
+      # @return [Boolean] True if word exists
+      def has_word?(word)
+        @indexes[:exact].key?(word)
+      end
+      alias include? has_word?
+      alias contains? has_word?
+      # Check if a word exists (case-insensitive).
+      #
+      # @param word [String] The word to check
+      # @return [Boolean] True if word exists (any case)
+      def has_word_ignorecase?(word)
+        @indexes[:lowercase].key?(word.downcase)
+      end
+      # Look up a word (case-sensitive).
+      #
+      # @param word [String] The word to look up
+      # @return [Hash, nil] First entry stored for the word, or nil
+      def lookup(word)
+        first_entry(@indexes[:exact][word])
+      end
+      # Look up a word (case-insensitive).
+      #
+      # @param word [String] The word to look up
+      # @return [Hash, nil] First entry whose lowercase form matches, or nil
+      def lookup_ignorecase(word)
+        first_entry(@indexes[:lowercase][word.downcase])
+      end
+      # Find all words with a given prefix.
+      #
+      # A word counts as starting with itself and with the empty string,
+      # in both the case-sensitive and the case-insensitive mode.
+      #
+      # @param prefix [String] The prefix to match
+      # @param ignore_case [Boolean] Whether to ignore case
+      # @return [Array<String>] Words with the prefix, in insertion order
+      def find_by_prefix(prefix, ignore_case: false)
+        # The index has no "" key; every word starts with the empty string.
+        return all_words if prefix.empty?
+        if ignore_case
+          needle = prefix.downcase
+          all_words.select { |w| w.downcase.start_with?(needle) }
+        else
+          @indexes[:prefix].fetch(prefix, []).dup
+        end
+      end
+      # Find all words with a given suffix.
+      #
+      # A word counts as ending with itself and with the empty string,
+      # in both the case-sensitive and the case-insensitive mode.
+      #
+      # @param suffix [String] The suffix to match
+      # @param ignore_case [Boolean] Whether to ignore case
+      # @return [Array<String>] Words with the suffix, in insertion order
+      def find_by_suffix(suffix, ignore_case: false)
+        # The index has no "" key; every word ends with the empty string.
+        return all_words if suffix.empty?
+        if ignore_case
+          needle = suffix.downcase
+          all_words.select { |w| w.downcase.end_with?(needle) }
+        else
+          @indexes[:suffix].fetch(suffix, []).dup
+        end
+      end
+      # Find words matching a pattern.
+      #
+      # @param pattern [Regexp] The pattern to match
+      # @return [Array<String>] Matching words
+      def find_by_pattern(pattern)
+        all_words.select { |w| w.match?(pattern) }
+      end
+      # Find words of a specific length.
+      #
+      # @param length [Integer] The exact length
+      # @return [Array<String>] Words of the given length
+      def find_by_length(length)
+        all_words.select { |w| w.length == length }
+      end
+      # Find words within a length range (both bounds inclusive).
+      #
+      # @param min_length [Integer] Minimum length
+      # @param max_length [Integer] Maximum length
+      # @return [Array<String>] Words within the length range
+      def find_by_length_range(min_length:, max_length:)
+        all_words.select { |w| w.length.between?(min_length, max_length) }
+      end
+      # Get all words in the dictionary.
+      #
+      # @return [Array<String>] All words, in insertion order (new array each call)
+      def all_words
+        @words.map { |entry| entry[:word] }
+      end
+      # Get random words from the dictionary.
+      #
+      # @param count [Integer] Number of random words
+      # @return [Array<String>] Up to `count` words drawn from distinct entries
+      def random_words(count: 1)
+        return [] if @words.empty?
+        @words.sample(count).map { |entry| entry[:word] }
+      end
+      # Count words by their upcased first character.
+      #
+      # @return [Hash] Hash of first character (upcased) => word count
+      def count_by_first_letter
+        all_words.each_with_object(Hash.new(0)) do |word, counts|
+          next if word.empty?
+          counts[word[0].upcase] += 1
+        end
+      end
+      # Get word length distribution.
+      #
+      # @return [Hash] Hash of length => count
+      def count_by_length
+        all_words.each_with_object(Hash.new(0)) { |word, counts| counts[word.length] += 1 }
+      end
+      # Check if the dictionary is empty.
+      #
+      # @return [Boolean] True if empty
+      def empty?
+        @size.zero?
+      end
+      # Iterate over all words.
+      #
+      # @yield [word] Each word
+      # @return [Enumerator] Enumerator if no block given
+      def each_word
+        return enum_for(:each_word) unless block_given?
+        @words.each { |entry| yield entry[:word] }
+      end
+      # Iterate over all words with indices.
+      #
+      # @yield [word, index] Each word and its index
+      # @return [Enumerator] Enumerator if no block given
+      def each_with_index
+        return enum_for(:each_with_index) unless block_given?
+        @words.each { |entry| yield entry[:word], entry[:index] }
+      end
+      # Build a Trie from the dictionary words.
+      #
+      # The trie files are required lazily, on first use.
+      #
+      # @return [Trie] Whatever `Trie::Builder.from_array` returns for the words
+      def to_trie
+        require_relative "trie/trie"
+        require_relative "trie/builder"
+        Trie::Builder.from_array(all_words)
+      end
+      # Get statistics about the dictionary.
+      #
+      # @return [Hash] Statistics (counts, min/max/average length, distributions)
+      def statistics
+        list = all_words
+        lengths = list.map(&:length)
+        {
+          total_words: @size,
+          unique_words: list.uniq.size,
+          min_length: lengths.min || 0,
+          max_length: lengths.max || 0,
+          avg_length: lengths.empty? ? 0 : (lengths.sum.to_f / lengths.size).round(2),
+          count_by_first_letter: count_by_first_letter,
+          count_by_length: count_by_length
+        }
+      end
+      # Convert to string.
+      #
+      # @return [String] String representation
+      def to_s
+        "IndexedDictionary(size: #{@size})"
+      end
+      alias inspect to_s
+      private
+      # Register the word under each of its prefixes and suffixes.
+      #
+      # Lengths run from 1 up to the full word, so a one-letter word and the
+      # word itself are indexed too; that keeps the indexed lookups in line
+      # with the start_with?/end_with? scans used when ignoring case.
+      #
+      # @param word [String] The word being added
+      # @return [void]
+      def index_affixes(word)
+        length = word.length
+        1.upto(length) do |count|
+          (@indexes[:prefix][word[0, count]] ||= []) << word
+          (@indexes[:suffix][word[length - count, count]] ||= []) << word
+        end
+      end
+      # Resolve a list of entry positions to the first stored entry.
+      #
+      # @param positions [Array<Integer>, nil] Positions from an index
+      # @return [Hash, nil] The entry at the first position, or nil
+      def first_entry(positions)
+        return nil if positions.nil? || positions.empty?
+        @words[positions.first]
+      end
+    end
+  end
+end

data/lib/kotoshu/core/models/affix_rule.rb ADDED Viewed

@@ -0,0 +1,260 @@
+# frozen_string_literal: true
+module Kotoshu
+  module Models
+    # Affix rule model for Hunspell-style affix processing.
+    #
+    # Affix rules define how prefixes and suffixes can be added or removed
+    # from words to generate morphological variants.
+    #
+    # This is a value object that represents a single affix rule. The
+    # condition is always held as a Regexp: a condition string is compiled
+    # on construction, a Regexp is stored as given.
+    #
+    # @note Instances are frozen at the end of #initialize.
+    #
+    # @example Creating a prefix rule
+    #   rule = Models::AffixRule.new(
+    #     type: :prefix,
+    #     flag: "A",
+    #     strip: "",
+    #     add: "re",
+    #     condition: "."
+    #   )
+    #   rule.prefix?        # => true
+    #   rule.suffix?        # => false
+    #   rule.apply("do")    # => "redo"
+    #   rule.remove("redo") # => "do"
+    class AffixRule
+      # @return [Symbol] The affix type (:prefix or :suffix)
+      attr_reader :type
+      # @return [String] The flag character identifying this rule
+      attr_reader :flag
+      # @return [String] Characters to strip from the word
+      attr_reader :strip
+      # @return [String] Characters to add to the word
+      attr_reader :add
+      # @return [Regexp] Condition the base word must satisfy
+      attr_reader :condition
+      # @return [Boolean] Whether this is a cross-product rule
+      attr_reader :cross_product
+      # Affix rule types.
+      TYPES = {
+        prefix: "PFX",
+        suffix: "SFX"
+      }.freeze
+      # Create an affix rule from a single-line rule description.
+      #
+      # The expected layout is the one #to_hunspell emits:
+      # `TYPE FLAG CROSS STRIP ADD [CONDITION]`, where CROSS is "Y" or "N",
+      # STRIP is "0" for "strip nothing", and a missing CONDITION means ".".
+      # The first field is not inspected; the rule type comes from `type`.
+      #
+      # NOTE(review): this is not the line layout of a real .aff file, where
+      # the cross-product marker sits on a separate header line — confirm
+      # that callers merge header and rule fields before calling this.
+      #
+      # @param line [String] The rule line
+      # @param type [Symbol] The rule type (:prefix or :suffix)
+      # @return [AffixRule, nil] New affix rule, or nil when the line has
+      #   fewer than five whitespace-separated fields
+      #
+      # @example Parsing a prefix rule
+      #   AffixRule.from_hunspell("PFX A Y 0 re .", :prefix)
+      #
+      # @example Parsing a suffix rule
+      #   AffixRule.from_hunspell("SFX V N e ive e", :suffix)
+      def self.from_hunspell(line, type)
+        parts = line.split
+        return nil if parts.length < 5
+        new(
+          type: type,
+          flag: parts[1],
+          strip: parts[3] == "0" ? "" : parts[3],
+          add: parts[4],
+          condition: parts[5] || ".",
+          cross_product: parts[2] == "Y"
+        )
+      end
+      # Create a new AffixRule.
+      #
+      # @param type [Symbol] The affix type (:prefix or :suffix)
+      # @param flag [String] The flag character
+      # @param strip [String] Characters to strip
+      # @param add [String] Characters to add
+      # @param condition [String, Regexp] Condition for applying; a String is
+      #   compiled from Hunspell condition syntax, a Regexp is used as is
+      # @param cross_product [Boolean] Whether this is cross-product
+      # @raise [ArgumentError] If type is not :prefix/:suffix or flag is nil/empty
+      def initialize(type:, flag:, strip:, add:, condition: ".", cross_product: false)
+        raise ArgumentError, "Invalid type: #{type}" unless %i[prefix suffix].include?(type)
+        raise ArgumentError, "Flag cannot be empty" if flag.nil? || flag.empty?
+        @type = type
+        @flag = flag.dup.freeze
+        @strip = strip.dup.freeze
+        @add = add.dup.freeze
+        # compile_condition reads @type, so it must run after @type is set.
+        @condition = condition.is_a?(Regexp) ? condition : compile_condition(condition)
+        @cross_product = cross_product
+        freeze
+      end
+      # Check if this is a prefix rule.
+      #
+      # @return [Boolean] True if prefix
+      def prefix?
+        @type == :prefix
+      end
+      # Check if this is a suffix rule.
+      #
+      # @return [Boolean] True if suffix
+      def suffix?
+        @type == :suffix
+      end
+      # Check if this rule's condition accepts a (base) word.
+      #
+      # @param word [String, nil] The word to check
+      # @return [Boolean] True if the condition matches; false for nil or ""
+      def applies_to?(word)
+        return false if word.nil? || word.empty?
+        word.match?(@condition)
+      end
+      # Apply this rule to a base word.
+      #
+      # @param word [String] The word to modify
+      # @return [String, nil] The modified word, or nil if the condition does
+      #   not match or the word lacks the characters to strip
+      def apply(word)
+        return nil unless applies_to?(word)
+        if prefix?
+          return nil unless word.start_with?(@strip)
+          @add + word.delete_prefix(@strip)
+        else
+          return nil unless word.end_with?(@strip)
+          # delete_suffix is safe for an empty strip; word[0...-0] is "".
+          word.delete_suffix(@strip) + @add
+        end
+      end
+      # Remove this affix from a word (reverse of #apply).
+      #
+      # The condition describes the base word, so it is tested against the
+      # recovered stem rather than against the affixed input.
+      #
+      # @param word [String] The affixed word
+      # @return [String, nil] The stem, or nil if the affix is absent or the
+      #   stem does not satisfy the condition
+      def remove(word)
+        return nil if word.nil? || word.empty?
+        stem =
+          if prefix?
+            @strip + word.delete_prefix(@add) if word.start_with?(@add)
+          elsif word.end_with?(@add)
+            word.delete_suffix(@add) + @strip
+          end
+        stem if stem&.match?(@condition)
+      end
+      # Get the single-line representation accepted by .from_hunspell.
+      #
+      # @return [String] `TYPE FLAG CROSS STRIP ADD CONDITION`, with "0" for
+      #   an empty strip and "." for a match-all condition
+      def to_hunspell
+        cross = @cross_product ? "Y" : "N"
+        strip = @strip.empty? ? "0" : @strip
+        "#{TYPES[@type]} #{@flag} #{cross} #{strip} #{@add} #{condition_to_s}"
+      end
+      # Convert to hash.
+      #
+      # @return [Hash] Hash representation; :condition is the Regexp source
+      def to_h
+        {
+          type: @type,
+          flag: @flag,
+          strip: @strip,
+          add: @add,
+          condition: @condition.source,
+          cross_product: @cross_product
+        }
+      end
+      # Check equality based on all attributes.
+      #
+      # @param other [Object] The other rule
+      # @return [Boolean] True if other is an AffixRule with equal attributes
+      def ==(other)
+        return false unless other.is_a?(AffixRule)
+        to_key == other.send(:to_key)
+      end
+      alias eql? ==
+      # Hash based on the same attributes #== compares.
+      #
+      # @return [Integer] Hash code
+      def hash
+        to_key.hash
+      end
+      # Compare rules by flag.
+      #
+      # @param other [Object] The other rule
+      # @return [Integer, nil] Comparison of the flags, nil for a non-AffixRule
+      def <=>(other)
+        return nil unless other.is_a?(AffixRule)
+        @flag <=> other.flag
+      end
+      private
+      # Attribute list shared by #== and #hash so the two cannot drift apart.
+      #
+      # @return [Array] All attributes, in a fixed order
+      def to_key
+        [@type, @flag, @strip, @add, @condition, @cross_product]
+      end
+      # Compile a Hunspell condition string to an anchored regex.
+      #
+      # Hunspell conditions use '.' for any character, '[...]' for a
+      # character set, '[^...]' for a negated set, and literal characters
+      # otherwise. Suffix conditions are anchored to the end of the word,
+      # prefix conditions to the beginning.
+      #
+      # @param condition [String] The condition string
+      # @return [Regexp] The compiled regex (`//` for the match-all ".")
+      def compile_condition(condition)
+        return // if condition == "."
+        body = condition.scan(/\[[^\]]+\]|./m).map { |token| condition_token_to_regex(token) }.join
+        @type == :suffix ? Regexp.new("#{body}\\z") : Regexp.new("\\A#{body}")
+      end
+      # Translate one condition token (a bracket set or a single character).
+      #
+      # @param token [String] "[...]", "[^...]", "." or one literal character
+      # @return [String] Regex source fragment for the token
+      def condition_token_to_regex(token)
+        return "." if token == "."
+        return Regexp.escape(token) if token.length == 1
+        members = token[1...-1]
+        negation = members.start_with?("^") ? "^" : ""
+        members = members.delete_prefix("^")
+        # "[^]" has no members left; treat the text literally rather than
+        # building an invalid empty character class.
+        return Regexp.escape(token) if members.empty?
+        # Members are escaped so '-' and friends are literal, not ranges.
+        "[#{negation}#{Regexp.escape(members)}]"
+      end
+      # Convert the regex condition back to a condition string.
+      #
+      # Exact for conditions compiled by #compile_condition; best effort for
+      # a Regexp handed to the constructor.
+      #
+      # @return [String] The condition string ("." for match-all)
+      def condition_to_s
+        source = @condition.source
+        return "." if source.empty?
+        # Drop the anchors, then undo Regexp.escape.
+        source.delete_prefix("\\A").delete_suffix("\\z").gsub(/\\(.)/, "\\1")
+      end
+    end
+  end
+end