RubyGems - kotoshu - Versions diffs - 0.3.0 - Mend

kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (210) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.rubocop.yml +18 -0
data/CHANGELOG.md +182 -0
data/CLAUDE.md +172 -0
data/CODE_OF_CONDUCT.md +132 -0
data/LICENSE +31 -0
data/README.adoc +955 -0
data/Rakefile +12 -0
data/SECURITY.md +93 -0
data/examples/01_basic_word_checking.rb +38 -0
data/examples/02_text_document_checking.rb +77 -0
data/examples/03_dictionary_backends.rb +137 -0
data/examples/04_trie_data_structure.rb +146 -0
data/examples/05_suggestion_algorithms.rb +239 -0
data/examples/06_configuration_advanced.rb +287 -0
data/examples/07_multi_language_dictionaries.rb +278 -0
data/exe/kotoshu +6 -0
data/lib/kotoshu/algorithms/capitalization.rb +276 -0
data/lib/kotoshu/algorithms/lookup.rb +876 -0
data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
data/lib/kotoshu/algorithms/permutations.rb +283 -0
data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
data/lib/kotoshu/algorithms/suggest.rb +575 -0
data/lib/kotoshu/algorithms.rb +14 -0
data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
data/lib/kotoshu/cache/base_cache.rb +596 -0
data/lib/kotoshu/cache/cache.rb +91 -0
data/lib/kotoshu/cache/frequency_cache.rb +224 -0
data/lib/kotoshu/cache/language_cache.rb +454 -0
data/lib/kotoshu/cache/lookup_cache.rb +166 -0
data/lib/kotoshu/cache/model_cache.rb +513 -0
data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
data/lib/kotoshu/cache.rb +40 -0
data/lib/kotoshu/cli/auto_setup.rb +71 -0
data/lib/kotoshu/cli/batch_reporter.rb +315 -0
data/lib/kotoshu/cli/cache_command.rb +356 -0
data/lib/kotoshu/cli/display_formatter.rb +431 -0
data/lib/kotoshu/cli/errors.rb +36 -0
data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
data/lib/kotoshu/cli/language_resolver.rb +91 -0
data/lib/kotoshu/cli/navigation_manager.rb +272 -0
data/lib/kotoshu/cli/progress_reporter.rb +114 -0
data/lib/kotoshu/cli/status_report.rb +130 -0
data/lib/kotoshu/cli.rb +627 -0
data/lib/kotoshu/commands/cache_command.rb +424 -0
data/lib/kotoshu/commands/check_command.rb +312 -0
data/lib/kotoshu/commands/model_command.rb +295 -0
data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
data/lib/kotoshu/components/pos_tagger.rb +98 -0
data/lib/kotoshu/components/spell_checker.rb +73 -0
data/lib/kotoshu/components/synthesizer.rb +60 -0
data/lib/kotoshu/components/tokenizer.rb +58 -0
data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
data/lib/kotoshu/configuration/builder.rb +209 -0
data/lib/kotoshu/configuration/resolver.rb +124 -0
data/lib/kotoshu/configuration.rb +702 -0
data/lib/kotoshu/core/exceptions.rb +165 -0
data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
data/lib/kotoshu/core/models/affix_rule.rb +260 -0
data/lib/kotoshu/core/models/result/document_result.rb +263 -0
data/lib/kotoshu/core/models/result/word_result.rb +203 -0
data/lib/kotoshu/core/models/word.rb +142 -0
data/lib/kotoshu/core/trie/builder.rb +119 -0
data/lib/kotoshu/core/trie/node.rb +94 -0
data/lib/kotoshu/core/trie/trie.rb +249 -0
data/lib/kotoshu/core.rb +28 -0
data/lib/kotoshu/data/common_words/de.yml +1800 -0
data/lib/kotoshu/data/common_words/en.yml +1215 -0
data/lib/kotoshu/data/common_words/es.yml +750 -0
data/lib/kotoshu/data/common_words/fr.yml +1015 -0
data/lib/kotoshu/data/common_words/pt.yml +870 -0
data/lib/kotoshu/data/common_words/ru.yml +484 -0
data/lib/kotoshu/data/common_words_loader.rb +152 -0
data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
data/lib/kotoshu/debug_logger.rb +146 -0
data/lib/kotoshu/debug_mode.rb +134 -0
data/lib/kotoshu/defaults.rb +86 -0
data/lib/kotoshu/dictionaries/catalog.rb +817 -0
data/lib/kotoshu/dictionary/base.rb +237 -0
data/lib/kotoshu/dictionary/cspell.rb +254 -0
data/lib/kotoshu/dictionary/custom.rb +224 -0
data/lib/kotoshu/dictionary/hunspell.rb +526 -0
data/lib/kotoshu/dictionary/plain_text.rb +282 -0
data/lib/kotoshu/dictionary/repository.rb +248 -0
data/lib/kotoshu/dictionary/unified.rb +260 -0
data/lib/kotoshu/dictionary/unix_words.rb +218 -0
data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
data/lib/kotoshu/documents/document.rb +229 -0
data/lib/kotoshu/documents/location.rb +139 -0
data/lib/kotoshu/documents/markdown_document.rb +389 -0
data/lib/kotoshu/documents/plain_text_document.rb +147 -0
data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
data/lib/kotoshu/embeddings/protocol.rb +83 -0
data/lib/kotoshu/embeddings/protocols.rb +17 -0
data/lib/kotoshu/embeddings/registry.rb +182 -0
data/lib/kotoshu/embeddings/search.rb +192 -0
data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
data/lib/kotoshu/embeddings.rb +97 -0
data/lib/kotoshu/fluent_checker.rb +91 -0
data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
data/lib/kotoshu/grammar/rule.rb +95 -0
data/lib/kotoshu/grammar/rule_engine.rb +111 -0
data/lib/kotoshu/grammar/rule_loader.rb +31 -0
data/lib/kotoshu/grammar.rb +18 -0
data/lib/kotoshu/integrity/audit_log.rb +88 -0
data/lib/kotoshu/integrity/manifest.rb +117 -0
data/lib/kotoshu/integrity/net_http.rb +46 -0
data/lib/kotoshu/integrity.rb +25 -0
data/lib/kotoshu/keyboard/layout.rb +115 -0
data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
data/lib/kotoshu/keyboard/registry.rb +146 -0
data/lib/kotoshu/keyboard.rb +60 -0
data/lib/kotoshu/language/detector.rb +242 -0
data/lib/kotoshu/language/identifier.rb +378 -0
data/lib/kotoshu/language/languages/base.rb +256 -0
data/lib/kotoshu/language/normalizer/base.rb +137 -0
data/lib/kotoshu/language/registry.rb +147 -0
data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
data/lib/kotoshu/language/tokenizer/base.rb +170 -0
data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
data/lib/kotoshu/language.rb +99 -0
data/lib/kotoshu/languages/de/language.rb +546 -0
data/lib/kotoshu/languages/en/language.rb +448 -0
data/lib/kotoshu/languages/es/language.rb +459 -0
data/lib/kotoshu/languages/fr/language.rb +493 -0
data/lib/kotoshu/languages/ja/language.rb +477 -0
data/lib/kotoshu/languages/pt/language.rb +423 -0
data/lib/kotoshu/languages/ru/language.rb +404 -0
data/lib/kotoshu/languages.rb +43 -0
data/lib/kotoshu/metrics_collector.rb +222 -0
data/lib/kotoshu/metrics_module.rb +110 -0
data/lib/kotoshu/models/context.rb +119 -0
data/lib/kotoshu/models/embedding_model.rb +182 -0
data/lib/kotoshu/models/fasttext_model.rb +220 -0
data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
data/lib/kotoshu/models/onnx_model.rb +333 -0
data/lib/kotoshu/models/semantic_error.rb +165 -0
data/lib/kotoshu/models/suggestion.rb +106 -0
data/lib/kotoshu/models/word_embedding.rb +107 -0
data/lib/kotoshu/paths.rb +53 -0
data/lib/kotoshu/personal_dictionary.rb +94 -0
data/lib/kotoshu/plugins/plugin.rb +61 -0
data/lib/kotoshu/plugins/registry.rb +120 -0
data/lib/kotoshu/project_config.rb +76 -0
data/lib/kotoshu/readers/aff_data.rb +356 -0
data/lib/kotoshu/readers/aff_reader.rb +375 -0
data/lib/kotoshu/readers/condition_checker.rb +142 -0
data/lib/kotoshu/readers/dic_reader.rb +118 -0
data/lib/kotoshu/readers/file_reader.rb +347 -0
data/lib/kotoshu/readers/lookup_builder.rb +299 -0
data/lib/kotoshu/readers/readers.rb +6 -0
data/lib/kotoshu/readers.rb +9 -0
data/lib/kotoshu/resource_bundle.rb +30 -0
data/lib/kotoshu/resource_manager.rb +295 -0
data/lib/kotoshu/results/result.rb +165 -0
data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
data/lib/kotoshu/source_registry.rb +74 -0
data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
data/lib/kotoshu/spellchecker.rb +298 -0
data/lib/kotoshu/string_metrics.rb +153 -0
data/lib/kotoshu/suggestions/context.rb +55 -0
data/lib/kotoshu/suggestions/generator.rb +175 -0
data/lib/kotoshu/suggestions/pipeline.rb +135 -0
data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
data/lib/kotoshu/suggestions/suggestion.rb +174 -0
data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
data/lib/kotoshu/version.rb +5 -0
data/lib/kotoshu.rb +493 -0
data/script/validate_all_dictionaries.rb +444 -0
data/sig/kotoshu.rbs +4 -0
data/test_oop.rb +79 -0
metadata +298 -0

data/lib/kotoshu/dictionary/base.rb ADDED Viewed

@@ -0,0 +1,237 @@
+# frozen_string_literal: true
module Kotoshu
  module Dictionary
    # Base class for all dictionary backends.
    #
    # This abstract class defines the interface that all dictionary
    # implementations must follow.
    #
    # @note Subclasses must implement the abstract methods: {#lookup},
    #       {#suggest}, {#add_word}, {#remove_word}, and {#words}.
    #
    # @note The convenience names ({#has_word?}, {#include?}, {#contains?},
    #       {#<<}, {#all_words}, {#count}, {#length}, {#inspect}) are ordinary
    #       methods that dispatch at call time. They are deliberately NOT
    #       declared with +alias+: an alias captures the method body that
    #       exists when the alias is created, which here is the abstract
    #       implementation, so subclasses would raise NotImplementedError.
    #
    # @example Implementing a custom dictionary
    #   class MyDictionary < Base
    #     def initialize(path, language_code:, locale: nil)
    #       super(language_code, locale: locale)
    #       @words = load_words(path)
    #     end
    #
    #     def lookup(word)
    #       @words.include?(word.downcase)
    #     end
    #
    #     # ... implement other abstract methods
    #   end
    class Base
      # @return [String] The language code (e.g., "en-US", "en-GB")
      attr_reader :language_code

      # @return [String, nil] The locale (e.g., "en", "en_US")
      attr_reader :locale

      # @return [Hash] Additional metadata
      attr_reader :metadata

      # Create a new dictionary.
      #
      # The arguments are duplicated and frozen so later mutation of the
      # caller's objects cannot change the dictionary's identity.
      #
      # @param language_code [String] The language code (e.g., "en-US")
      # @param locale [String, nil] The locale (optional)
      # @param metadata [Hash] Additional metadata (optional)
      # @raise [ArgumentError] If the language code is nil or empty
      def initialize(language_code, locale: nil, metadata: {})
        raise ArgumentError, "Language code cannot be empty" if language_code.nil? || language_code.empty?

        @language_code = language_code.dup.freeze
        @locale = locale&.dup&.freeze
        @metadata = metadata.dup.freeze
      end

      # Check if a word exists in the dictionary.
      #
      # @abstract Subclasses must implement this method.
      # @param word [String] The word to look up
      # @return [Boolean] True if the word exists
      # @raise [NotImplementedError] Subclass must implement
      def lookup(word)
        raise NotImplementedError, "#{self.class} must implement #lookup"
      end

      # Check if a word exists in the dictionary (predicate form of {#lookup}).
      #
      # @param word [String] The word to look up
      # @return [Boolean] True if the word exists
      def lookup?(word)
        lookup(word)
      end

      # Same as {#lookup}; resolved at call time so subclass overrides apply.
      #
      # @param word [String] The word to look up
      # @return [Boolean] True if the word exists
      def has_word?(word)
        lookup(word)
      end

      # Same as {#lookup}; resolved at call time so subclass overrides apply.
      #
      # @param word [String] The word to look up
      # @return [Boolean] True if the word exists
      def include?(word)
        lookup(word)
      end

      # Same as {#lookup}; resolved at call time so subclass overrides apply.
      #
      # @param word [String] The word to look up
      # @return [Boolean] True if the word exists
      def contains?(word)
        lookup(word)
      end

      # Generate spelling suggestions for a word.
      #
      # @abstract Subclasses must implement this method.
      # @param word [String] The misspelled word
      # @param max_suggestions [Integer] Maximum number of suggestions
      # @return [Array<String>] List of suggested words
      # @raise [NotImplementedError] Subclass must implement
      def suggest(word, max_suggestions: 10)
        raise NotImplementedError, "#{self.class} must implement #suggest"
      end

      # Add a word to the dictionary.
      #
      # @abstract Subclasses must implement this method.
      # @param word [String] The word to add
      # @param flags [Array<String>] Morphological flags (optional)
      # @return [Boolean] True if the word was added
      # @raise [NotImplementedError] Subclass must implement
      def add_word(word, flags: [])
        raise NotImplementedError, "#{self.class} must implement #add_word"
      end

      # Operator form of {#add_word} (without flags).
      #
      # @param word [String] The word to add
      # @return [Boolean] Whatever {#add_word} returns
      def <<(word)
        add_word(word)
      end

      # Remove a word from the dictionary.
      #
      # @abstract Subclasses must implement this method.
      # @param word [String] The word to remove
      # @return [Boolean] True if the word was removed
      # @raise [NotImplementedError] Subclass must implement
      def remove_word(word)
        raise NotImplementedError, "#{self.class} must implement #remove_word"
      end

      # Get all words in the dictionary.
      #
      # @abstract Subclasses must implement this method.
      # @return [Array<String>] All words
      # @raise [NotImplementedError] Subclass must implement
      def words
        raise NotImplementedError, "#{self.class} must implement #words"
      end

      # Same as {#words}; resolved at call time so subclass overrides apply.
      #
      # @return [Array<String>] All words
      def all_words
        words
      end

      # Get the number of words in the dictionary.
      #
      # @return [Integer] Word count
      def size
        words.length
      end

      # Same as {#size}.
      #
      # @return [Integer] Word count
      def count
        size
      end

      # Same as {#size}.
      #
      # @return [Integer] Word count
      def length
        size
      end

      # Check if the dictionary is empty.
      #
      # @return [Boolean] True if empty
      def empty?
        size.zero?
      end

      # Iterate over all words.
      #
      # @yield [word] Each word
      # @return [Enumerator] Enumerator if no block given
      def each_word(&block)
        return enum_for(:each_word) unless block_given?

        words.each(&block)
      end

      # Get words starting with a prefix.
      #
      # @param prefix [String] The prefix
      # @return [Array<String>] Words with the prefix
      def words_with_prefix(prefix)
        words.select { |candidate| candidate.start_with?(prefix) }
      end

      # Get words matching a pattern.
      #
      # @param pattern [Regexp] The pattern
      # @return [Array<String>] Matching words
      def words_matching(pattern)
        words.select { |candidate| candidate.match?(pattern) }
      end

      # Convert to string.
      #
      # @return [String] String representation
      def to_s
        "#{self.class.name}(language: #{@language_code}, size: #{size})"
      end

      # Same as {#to_s}.
      #
      # @return [String] String representation
      def inspect
        to_s
      end

      # Dictionary type identifier, derived from the unqualified class name
      # (e.g. UnixWords => :unix_words).
      #
      # NOTE(review): an underscore is inserted before every capital that
      # follows any character, so a class named CSpell yields :c_spell, which
      # differs from the :cspell key that class registers itself under.
      # Confirm which spelling callers expect before changing the pattern.
      #
      # @return [Symbol] The dictionary type
      def type
        self.class.name.split("::").last.gsub(/(.)([A-Z])/, '\1_\2').downcase.to_sym
      end

      # Register this dictionary type.
      #
      # @param type_key [Symbol] The type key to register as
      #
      # @example Registering a custom dictionary type
      #   MyDictionary.register_type(:my_custom)
      def self.register_type(type_key)
        Kotoshu::Dictionary.register_type(type_key, self)
      end

      # Registry of dictionary types.
      #
      # This is the shared module-level registry: {.register_type} writes
      # there, so reading it here keeps {.load} able to find what was
      # registered.
      #
      # @return [Hash] Registry of type keys to classes
      def self.registry
        Kotoshu::Dictionary.registry
      end

      # Load a dictionary by type.
      #
      # @param type [Symbol] The dictionary type
      # @param args [Array] Positional arguments to pass to the constructor
      # @param options [Hash] Keyword arguments to pass to the constructor
      # @return [Base] The loaded dictionary
      # @raise [ConfigurationError] If type is not registered
      def self.load(type, *args, **options, &block)
        Kotoshu::Dictionary.load(type, *args, **options, &block)
      end
    end

    # Module-level registry for dictionary types.
    #
    # @return [Hash] Registry of type keys to classes
    def self.registry
      @registry ||= {}
    end

    # Register a dictionary type.
    #
    # @param type [Symbol] The type key
    # @param klass [Class] The dictionary class
    # @return [Class] The registered class
    #
    # @example Registering a custom dictionary type
    #   Dictionary.register_type(:my_custom, MyDictionary)
    def self.register_type(type, klass)
      registry[type] = klass
    end

    # Load a dictionary by type.
    #
    # Keyword arguments are forwarded explicitly: with a bare +*args+ they
    # would be collected into a positional Hash and rejected by constructors
    # that declare required keywords.
    #
    # @param type [Symbol] The dictionary type
    # @param args [Array] Positional arguments to pass to the constructor
    # @param options [Hash] Keyword arguments to pass to the constructor
    # @return [Base] The loaded dictionary
    # @raise [ConfigurationError] If type is not registered
    #
    # @example Loading a dictionary
    #   dict = Dictionary.load(:unix_words, "/usr/share/dict/words",
    #                          language_code: "en-US")
    def self.load(type, *args, **options, &block)
      klass = registry[type]
      raise ConfigurationError, "Unknown dictionary type: #{type}" unless klass

      klass.new(*args, **options, &block)
    end
  end
end

data/lib/kotoshu/dictionary/cspell.rb ADDED Viewed

@@ -0,0 +1,254 @@
+# frozen_string_literal: true
+require_relative "base"
module Kotoshu
  module Dictionary
    # CSpell dictionary backend.
    #
    # This dictionary reads CSpell-formatted dictionary files (plain text .txt
    # or compressed .trie files). CSpell is the spell checker used by VS Code.
    #
    # File format:
    # - .txt: Plain text with one word per line, # comments supported
    # - .trie: Compressed trie format (DAFSA - Deterministic Acyclic Finite State Automaton)
    #
    # @note Real .trie decoding is not implemented; such files are currently
    #       read as if they were plain text (see #load_trie_file).
    #
    # @example Creating from a text file
    #   dict = CSpell.new("words.txt", language_code: "en-US")
    #   dict.lookup?("hello")  # => true
    #
    # @example Creating from a trie file
    #   dict = CSpell.new("words.trie", language_code: "en")
    class CSpell < Base
      # Largest edit distance (and therefore largest length difference) for
      # which a dictionary word is still offered as a suggestion.
      MAX_EDIT_DISTANCE = 2
      private_constant :MAX_EDIT_DISTANCE

      # @return [String, nil] The path to the dictionary file (nil when built
      #   with {.from_words})
      attr_reader :path

      # @return [Boolean] Whether lookups are case-sensitive
      attr_reader :case_sensitive

      # @return [Core::Trie::Trie] The trie data structure
      attr_reader :trie

      # Register at load time so the type can be resolved through the
      # registry before any instance exists. An existing :cspell entry is
      # left alone.
      register_type(:cspell) unless Dictionary.registry.key?(:cspell)

      # Create a new CSpell dictionary.
      #
      # @param path [String] Path to the dictionary file (.txt or .trie)
      # @param language_code [String] The language code
      # @param locale [String, nil] The locale (optional)
      # @param case_sensitive [Boolean] Whether lookups are case-sensitive
      # @param metadata [Hash] Additional metadata (optional)
      # @raise [ArgumentError] If the language code is nil or empty
      # @raise [DictionaryNotFoundError] If path is not an existing regular file
      def initialize(path, language_code:, locale: nil, case_sensitive: false, metadata: {})
        super(language_code, locale: locale, metadata: metadata)
        @path = File.expand_path(path)
        @case_sensitive = case_sensitive
        # File.file? (not File.exist?) so a directory is reported as a missing
        # dictionary instead of failing later while reading it.
        raise DictionaryNotFoundError, @path unless File.file?(@path)

        @trie = @path.end_with?(".trie") ? load_trie_file(@path) : load_text_file(@path)

        # Kept from the original constructor: re-register if the entry is absent.
        self.class.register_type(:cspell) unless Dictionary.registry.key?(:cspell)
      end

      # Check if a word exists in the dictionary.
      #
      # @param word [String] The word to look up
      # @return [Boolean] True if the word exists
      def lookup(word)
        return false if word.nil? || word.empty?

        @trie.lookup(fold_case(word))
      end

      # Check if the dictionary has words with a prefix.
      #
      # @param prefix [String] The prefix
      # @return [Boolean] True if words exist with the prefix
      def has_prefix?(prefix)
        return false if prefix.nil? || prefix.empty?

        @trie.has_prefix?(fold_case(prefix))
      end

      # Generate spelling suggestions.
      #
      # Asks the trie for suggestions first. When that yields fewer than
      # +max_suggestions+, dictionary words within MAX_EDIT_DISTANCE edits are
      # appended, closest first (ties broken alphabetically). Duplicates are
      # removed before the list is cut to size, so the result is only shorter
      # than +max_suggestions+ when there really are no more candidates.
      #
      # @param word [String] The misspelled word
      # @param max_suggestions [Integer] Maximum suggestions
      # @return [Array<String>] List of suggested words
      def suggest(word, max_suggestions: 10)
        return [] if word.nil? || word.empty?

        target = fold_case(word)
        from_trie = @trie.suggestions(target, max_results: max_suggestions)
        return from_trie if from_trie.length >= max_suggestions

        (from_trie + near_matches(target)).uniq.first(max_suggestions)
      end

      # Add a word to the dictionary.
      #
      # @param word [String] The word to add
      # @param flags [Array<String>] Flags (ignored for CSpell)
      # @return [Boolean] True if added, false if blank or already present
      def add_word(word, flags: [])
        return false if word.nil? || word.empty?

        entry = fold_case(word)
        return false if @trie.lookup(entry)

        @trie.insert(entry)
        true
      end

      # Remove a word from the dictionary.
      #
      # Removal is not implemented for this backend; nothing is changed.
      #
      # @param _word [String] The word to remove (unused)
      # @return [Boolean] Always false
      # @note CSpell dictionaries are typically immutable after loading
      def remove_word(_word)
        false
      end

      # Get all words in the dictionary.
      #
      # @return [Array<String>] All words
      def words
        @trie.all_words
      end

      # Get words with a prefix.
      #
      # @param prefix [String] The prefix
      # @return [Array<String>] Words with the prefix (empty for a blank prefix)
      def words_with_prefix(prefix)
        return [] if prefix.nil? || prefix.empty?

        @trie.words_with_prefix(fold_case(prefix))
      end

      # Create a dictionary from an array of words.
      #
      # The instance is built without going through #initialize because there
      # is no file to load, so the language-code check is repeated here. Nil
      # and empty entries are skipped, mirroring how blank lines are skipped
      # when reading a file.
      #
      # @param words [Array<String>] The words
      # @param language_code [String] The language code
      # @param locale [String, nil] The locale (optional)
      # @param case_sensitive [Boolean] Whether lookups are case-sensitive
      # @return [CSpell] New dictionary
      # @raise [ArgumentError] If the language code is nil or empty
      #
      # @example
      #   dict = CSpell.from_words(%w[hello world test], language_code: "en")
      def self.from_words(words, language_code:, locale: nil, case_sensitive: false)
        raise ArgumentError, "Language code cannot be empty" if language_code.nil? || language_code.empty?

        entries = words.compact.reject(&:empty?)
        entries = entries.map(&:downcase) unless case_sensitive

        state = {
          :@language_code => language_code.dup.freeze,
          :@locale => locale&.dup&.freeze,
          :@path => nil,
          :@case_sensitive => case_sensitive,
          :@trie => Core::Trie::Builder.from_array(entries.uniq),
          :@metadata => {}.freeze
        }

        dict = allocate
        state.each { |ivar, value| dict.instance_variable_set(ivar, value) }

        # Register this dictionary type (unless already registered)
        register_type(:cspell) unless Dictionary.registry.key?(:cspell)
        dict
      end

      private

      # Apply the dictionary's case policy to a word or prefix.
      #
      # @param text [String] Text to normalize
      # @return [String] The text itself when case-sensitive, else lower-cased
      def fold_case(text)
        @case_sensitive ? text : text.downcase
      end

      # Dictionary words within MAX_EDIT_DISTANCE edits of +target+, excluding
      # +target+ itself, ordered by distance and then alphabetically.
      #
      # @param target [String] Case-folded word to compare against
      # @return [Array<String>] Matching words, closest first
      def near_matches(target)
        shortest = target.length - MAX_EDIT_DISTANCE
        longest = target.length + MAX_EDIT_DISTANCE

        scored = @trie.all_words.each_with_object([]) do |candidate, found|
          # The distance can never be smaller than the length difference.
          next unless candidate.length.between?(shortest, longest)

          distance = edit_distance(target, candidate)
          found << [distance, candidate] if distance.positive? && distance <= MAX_EDIT_DISTANCE
        end

        # Sorting [distance, word] pairs makes the order deterministic.
        scored.sort.map(&:last)
      end

      # Load a text dictionary file.
      #
      # Blank lines and lines whose first non-space character is "#" are
      # skipped. The file is read as UTF-8 regardless of the process locale so
      # stored words compare equal to UTF-8 input.
      #
      # @param path [String] The file path
      # @return [Core::Trie::Trie] The loaded trie
      def load_text_file(path)
        entries = File.foreach(path, chomp: true, encoding: "UTF-8").each_with_object([]) do |line, kept|
          entry = line.strip
          next if entry.empty? || entry.start_with?("#")

          kept << fold_case(entry)
        end
        Core::Trie::Builder.from_array(entries.uniq)
      end

      # Load a compressed trie file.
      #
      # @param path [String] The file path
      # @return [Core::Trie::Trie] The loaded trie
      #
      # @note For now, this falls back to treating the file as text.
      #       Full .trie format support would require implementing DAFSA decompression.
      def load_trie_file(path)
        load_text_file(path)
      end

      # Calculate Levenshtein edit distance.
      #
      # Row-by-row dynamic programming that keeps only the previous row.
      #
      # @param str1 [String] First string
      # @param str2 [String] Second string
      # @return [Integer] Edit distance
      def edit_distance(str1, str2)
        return str2.length if str1.empty?
        return str1.length if str2.empty?

        # Keep the shorter string on the inner loop so each row stays small.
        str1, str2 = str2, str1 if str1.length > str2.length

        inner_chars = str1.chars
        previous = (0..inner_chars.length).to_a
        str2.each_char.with_index(1) do |outer_char, row|
          current = [row]
          inner_chars.each_with_index do |inner_char, col|
            substitution = previous[col] + (inner_char == outer_char ? 0 : 1)
            current << [current[col] + 1, previous[col + 1] + 1, substitution].min
          end
          previous = current
        end
        previous.last
      end
    end
  end
end

data/lib/kotoshu/dictionary/custom.rb ADDED Viewed

@@ -0,0 +1,224 @@
+# frozen_string_literal: true
+require_relative "base"
module Kotoshu
  module Dictionary
    # Custom in-memory dictionary.
    #
    # This is a simple dictionary that stores words in memory,
    # designed for runtime customization and user-defined words.
    #
    # Words are kept in insertion order in an array, with a companion hash
    # mapping each word to its array index for constant-time membership
    # checks. Every mutation keeps the two structures in step.
    #
    # @example Creating an empty dictionary
    #   dict = Custom.new(language_code: "en-US")
    #   dict.add_word("Kotoshu")
    #   dict.lookup?("Kotoshu")  # => true
    #
    # @example Creating with initial words
    #   dict = Custom.new(words: %w[hello world], language_code: "en")
    #   dict.lookup?("hello")  # => true
    class Custom < Base
      # Largest edit distance for which a word is still suggested.
      MAX_EDIT_DISTANCE = 2
      # Number of leading characters a candidate must share with the input.
      SUGGESTION_PREFIX_LENGTH = 2
      private_constant :MAX_EDIT_DISTANCE, :SUGGESTION_PREFIX_LENGTH

      # @return [Boolean] Whether lookups are case-sensitive
      attr_reader :case_sensitive

      # Register at load time so the type can be resolved through the
      # registry before any instance exists. An existing :custom entry is
      # left alone.
      register_type(:custom) unless Dictionary.registry.key?(:custom)

      # Create a new Custom dictionary.
      #
      # @param words [Array<String>] Initial words (optional); blank entries
      #   and duplicates are dropped
      # @param language_code [String] The language code
      # @param locale [String, nil] The locale (optional)
      # @param case_sensitive [Boolean] Whether lookups are case-sensitive
      # @param metadata [Hash] Additional metadata (optional)
      # @raise [ArgumentError] If the language code is nil or empty
      def initialize(language_code:, words: [], locale: nil, case_sensitive: false, metadata: {})
        super(language_code, locale: locale, metadata: metadata)
        @case_sensitive = case_sensitive
        @words = normalize_words(words)
        @word_set = build_word_set

        # Kept from the original constructor: re-register if the entry is absent.
        self.class.register_type(:custom) unless Dictionary.registry.key?(:custom)
      end

      # Check if a word exists in the dictionary.
      #
      # @param word [String] The word to look up
      # @return [Boolean] True if the word exists
      def lookup(word)
        return false if word.nil? || word.empty?

        @word_set.key?(@case_sensitive ? word : word.downcase)
      end

      # Generate spelling suggestions.
      #
      # Candidates are stored words that share the first
      # SUGGESTION_PREFIX_LENGTH characters with the input (fewer for very
      # short input, so at least the last character is always free to
      # differ). Those within MAX_EDIT_DISTANCE edits are returned closest
      # first, ties broken alphabetically. The input itself is never returned.
      #
      # @param word [String] The misspelled word
      # @param max_suggestions [Integer] Maximum suggestions
      # @return [Array<String>] List of suggested words
      def suggest(word, max_suggestions: 10)
        return [] if word.nil? || word.empty?

        target = @case_sensitive ? word : word.downcase
        # min, not max: the prefix is a coarse filter. Requiring all but the
        # last character to match would hide every typo made earlier in the word.
        prefix = target[0, [target.length - 1, SUGGESTION_PREFIX_LENGTH].min]

        scored = @words.each_with_object([]) do |candidate, found|
          next unless candidate.start_with?(prefix)
          # The distance can never be smaller than the length difference.
          next if (candidate.length - target.length).abs > MAX_EDIT_DISTANCE

          distance = edit_distance(target, candidate)
          found << [distance, candidate] if distance.positive? && distance <= MAX_EDIT_DISTANCE
        end

        # Sorting [distance, word] pairs makes the order deterministic.
        scored.sort.first(max_suggestions).map(&:last)
      end

      # Add a word to the dictionary.
      #
      # @param word [String] The word to add
      # @param flags [Array<String>] Flags (ignored for Custom)
      # @return [Boolean] True if added; false if blank (nil, empty or
      #   whitespace only) or already present
      def add_word(word, flags: [])
        entry = normalize_word(word)
        # normalize_word returns nil for blank input; never store that.
        return false if entry.nil? || @word_set.key?(entry)

        @words << entry
        @word_set[entry] = @words.length - 1
        true
      end

      # Remove a word from the dictionary.
      #
      # @param word [String] The word to remove
      # @return [Boolean] True if removed
      def remove_word(word)
        entry = normalize_word(word)
        return false if entry.nil? || !@word_set.key?(entry)

        @words.delete_at(@word_set[entry])
        # Every word stored after the removed one has moved one slot left, so
        # the index map must be rebuilt or later removals hit the wrong slot.
        @word_set = build_word_set
        true
      end

      # Get all words in the dictionary.
      #
      # @return [Array<String>] A copy of the stored words, in insertion order
      def words
        @words.dup
      end

      # Clear all words from the dictionary.
      #
      # @return [self] Self for chaining
      def clear
        @words.clear
        @word_set.clear
        self
      end

      # Check if the dictionary is read-only.
      #
      # @return [Boolean] Always false for Custom dictionary
      def readonly?
        false
      end

      # Merge another dictionary into this one.
      #
      # Anything that is neither a dictionary nor an Array is ignored.
      #
      # @param other [Base, Array<String>] Dictionary or words to merge
      # @return [self] Self for chaining
      #
      # @example Merging another dictionary
      #   dict1 = Custom.new(words: %w[hello], language_code: "en")
      #   dict2 = Custom.new(words: %w[world], language_code: "en")
      #   dict1.merge(dict2)
      #
      # @example Merging an array of words
      #   dict.merge(%w[test example])
      def merge(other)
        incoming =
          case other
          when Base then other.words
          when Array then other
          else []
          end
        incoming.each { |word| add_word(word) }
        self
      end

      private

      # Normalize words for storage.
      #
      # @param words [Array<String>] Words to normalize
      # @return [Array<String>] Normalized words without blanks or duplicates
      def normalize_words(words)
        # uniq keeps the array and the word => index map one-to-one.
        words.map { |word| normalize_word(word) }.compact.uniq
      end

      # Normalize a single word.
      #
      # @param word [String, nil] The word to normalize
      # @return [String, nil] Stripped (and, unless case-sensitive,
      #   lower-cased) word, or nil if blank
      def normalize_word(word)
        return nil if word.nil?

        stripped = word.strip
        return nil if stripped.empty?

        @case_sensitive ? stripped : stripped.downcase
      end

      # Build the lookup hash for the current word list.
      #
      # @return [Hash] Word to index mapping
      def build_word_set
        @words.each_with_index.to_h
      end

      # Calculate Levenshtein edit distance.
      #
      # Row-by-row dynamic programming that keeps only the previous row.
      #
      # @param str1 [String] First string
      # @param str2 [String] Second string
      # @return [Integer] Edit distance
      def edit_distance(str1, str2)
        return str2.length if str1.empty?
        return str1.length if str2.empty?

        # Keep the shorter string on the inner loop so each row stays small.
        str1, str2 = str2, str1 if str1.length > str2.length

        inner_chars = str1.chars
        previous = (0..inner_chars.length).to_a
        str2.each_char.with_index(1) do |outer_char, row|
          current = [row]
          inner_chars.each_with_index do |inner_char, col|
            substitution = previous[col] + (inner_char == outer_char ? 0 : 1)
            current << [current[col] + 1, previous[col + 1] + 1, substitution].min
          end
          previous = current
        end
        previous.last
      end
    end
  end
end