RubyGems - kotoshu - Versions diffs - 0.3.0 - Mend

kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (210) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.rubocop.yml +18 -0
data/CHANGELOG.md +182 -0
data/CLAUDE.md +172 -0
data/CODE_OF_CONDUCT.md +132 -0
data/LICENSE +31 -0
data/README.adoc +955 -0
data/Rakefile +12 -0
data/SECURITY.md +93 -0
data/examples/01_basic_word_checking.rb +38 -0
data/examples/02_text_document_checking.rb +77 -0
data/examples/03_dictionary_backends.rb +137 -0
data/examples/04_trie_data_structure.rb +146 -0
data/examples/05_suggestion_algorithms.rb +239 -0
data/examples/06_configuration_advanced.rb +287 -0
data/examples/07_multi_language_dictionaries.rb +278 -0
data/exe/kotoshu +6 -0
data/lib/kotoshu/algorithms/capitalization.rb +276 -0
data/lib/kotoshu/algorithms/lookup.rb +876 -0
data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
data/lib/kotoshu/algorithms/permutations.rb +283 -0
data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
data/lib/kotoshu/algorithms/suggest.rb +575 -0
data/lib/kotoshu/algorithms.rb +14 -0
data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
data/lib/kotoshu/cache/base_cache.rb +596 -0
data/lib/kotoshu/cache/cache.rb +91 -0
data/lib/kotoshu/cache/frequency_cache.rb +224 -0
data/lib/kotoshu/cache/language_cache.rb +454 -0
data/lib/kotoshu/cache/lookup_cache.rb +166 -0
data/lib/kotoshu/cache/model_cache.rb +513 -0
data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
data/lib/kotoshu/cache.rb +40 -0
data/lib/kotoshu/cli/auto_setup.rb +71 -0
data/lib/kotoshu/cli/batch_reporter.rb +315 -0
data/lib/kotoshu/cli/cache_command.rb +356 -0
data/lib/kotoshu/cli/display_formatter.rb +431 -0
data/lib/kotoshu/cli/errors.rb +36 -0
data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
data/lib/kotoshu/cli/language_resolver.rb +91 -0
data/lib/kotoshu/cli/navigation_manager.rb +272 -0
data/lib/kotoshu/cli/progress_reporter.rb +114 -0
data/lib/kotoshu/cli/status_report.rb +130 -0
data/lib/kotoshu/cli.rb +627 -0
data/lib/kotoshu/commands/cache_command.rb +424 -0
data/lib/kotoshu/commands/check_command.rb +312 -0
data/lib/kotoshu/commands/model_command.rb +295 -0
data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
data/lib/kotoshu/components/pos_tagger.rb +98 -0
data/lib/kotoshu/components/spell_checker.rb +73 -0
data/lib/kotoshu/components/synthesizer.rb +60 -0
data/lib/kotoshu/components/tokenizer.rb +58 -0
data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
data/lib/kotoshu/configuration/builder.rb +209 -0
data/lib/kotoshu/configuration/resolver.rb +124 -0
data/lib/kotoshu/configuration.rb +702 -0
data/lib/kotoshu/core/exceptions.rb +165 -0
data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
data/lib/kotoshu/core/models/affix_rule.rb +260 -0
data/lib/kotoshu/core/models/result/document_result.rb +263 -0
data/lib/kotoshu/core/models/result/word_result.rb +203 -0
data/lib/kotoshu/core/models/word.rb +142 -0
data/lib/kotoshu/core/trie/builder.rb +119 -0
data/lib/kotoshu/core/trie/node.rb +94 -0
data/lib/kotoshu/core/trie/trie.rb +249 -0
data/lib/kotoshu/core.rb +28 -0
data/lib/kotoshu/data/common_words/de.yml +1800 -0
data/lib/kotoshu/data/common_words/en.yml +1215 -0
data/lib/kotoshu/data/common_words/es.yml +750 -0
data/lib/kotoshu/data/common_words/fr.yml +1015 -0
data/lib/kotoshu/data/common_words/pt.yml +870 -0
data/lib/kotoshu/data/common_words/ru.yml +484 -0
data/lib/kotoshu/data/common_words_loader.rb +152 -0
data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
data/lib/kotoshu/debug_logger.rb +146 -0
data/lib/kotoshu/debug_mode.rb +134 -0
data/lib/kotoshu/defaults.rb +86 -0
data/lib/kotoshu/dictionaries/catalog.rb +817 -0
data/lib/kotoshu/dictionary/base.rb +237 -0
data/lib/kotoshu/dictionary/cspell.rb +254 -0
data/lib/kotoshu/dictionary/custom.rb +224 -0
data/lib/kotoshu/dictionary/hunspell.rb +526 -0
data/lib/kotoshu/dictionary/plain_text.rb +282 -0
data/lib/kotoshu/dictionary/repository.rb +248 -0
data/lib/kotoshu/dictionary/unified.rb +260 -0
data/lib/kotoshu/dictionary/unix_words.rb +218 -0
data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
data/lib/kotoshu/documents/document.rb +229 -0
data/lib/kotoshu/documents/location.rb +139 -0
data/lib/kotoshu/documents/markdown_document.rb +389 -0
data/lib/kotoshu/documents/plain_text_document.rb +147 -0
data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
data/lib/kotoshu/embeddings/protocol.rb +83 -0
data/lib/kotoshu/embeddings/protocols.rb +17 -0
data/lib/kotoshu/embeddings/registry.rb +182 -0
data/lib/kotoshu/embeddings/search.rb +192 -0
data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
data/lib/kotoshu/embeddings.rb +97 -0
data/lib/kotoshu/fluent_checker.rb +91 -0
data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
data/lib/kotoshu/grammar/rule.rb +95 -0
data/lib/kotoshu/grammar/rule_engine.rb +111 -0
data/lib/kotoshu/grammar/rule_loader.rb +31 -0
data/lib/kotoshu/grammar.rb +18 -0
data/lib/kotoshu/integrity/audit_log.rb +88 -0
data/lib/kotoshu/integrity/manifest.rb +117 -0
data/lib/kotoshu/integrity/net_http.rb +46 -0
data/lib/kotoshu/integrity.rb +25 -0
data/lib/kotoshu/keyboard/layout.rb +115 -0
data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
data/lib/kotoshu/keyboard/registry.rb +146 -0
data/lib/kotoshu/keyboard.rb +60 -0
data/lib/kotoshu/language/detector.rb +242 -0
data/lib/kotoshu/language/identifier.rb +378 -0
data/lib/kotoshu/language/languages/base.rb +256 -0
data/lib/kotoshu/language/normalizer/base.rb +137 -0
data/lib/kotoshu/language/registry.rb +147 -0
data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
data/lib/kotoshu/language/tokenizer/base.rb +170 -0
data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
data/lib/kotoshu/language.rb +99 -0
data/lib/kotoshu/languages/de/language.rb +546 -0
data/lib/kotoshu/languages/en/language.rb +448 -0
data/lib/kotoshu/languages/es/language.rb +459 -0
data/lib/kotoshu/languages/fr/language.rb +493 -0
data/lib/kotoshu/languages/ja/language.rb +477 -0
data/lib/kotoshu/languages/pt/language.rb +423 -0
data/lib/kotoshu/languages/ru/language.rb +404 -0
data/lib/kotoshu/languages.rb +43 -0
data/lib/kotoshu/metrics_collector.rb +222 -0
data/lib/kotoshu/metrics_module.rb +110 -0
data/lib/kotoshu/models/context.rb +119 -0
data/lib/kotoshu/models/embedding_model.rb +182 -0
data/lib/kotoshu/models/fasttext_model.rb +220 -0
data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
data/lib/kotoshu/models/onnx_model.rb +333 -0
data/lib/kotoshu/models/semantic_error.rb +165 -0
data/lib/kotoshu/models/suggestion.rb +106 -0
data/lib/kotoshu/models/word_embedding.rb +107 -0
data/lib/kotoshu/paths.rb +53 -0
data/lib/kotoshu/personal_dictionary.rb +94 -0
data/lib/kotoshu/plugins/plugin.rb +61 -0
data/lib/kotoshu/plugins/registry.rb +120 -0
data/lib/kotoshu/project_config.rb +76 -0
data/lib/kotoshu/readers/aff_data.rb +356 -0
data/lib/kotoshu/readers/aff_reader.rb +375 -0
data/lib/kotoshu/readers/condition_checker.rb +142 -0
data/lib/kotoshu/readers/dic_reader.rb +118 -0
data/lib/kotoshu/readers/file_reader.rb +347 -0
data/lib/kotoshu/readers/lookup_builder.rb +299 -0
data/lib/kotoshu/readers/readers.rb +6 -0
data/lib/kotoshu/readers.rb +9 -0
data/lib/kotoshu/resource_bundle.rb +30 -0
data/lib/kotoshu/resource_manager.rb +295 -0
data/lib/kotoshu/results/result.rb +165 -0
data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
data/lib/kotoshu/source_registry.rb +74 -0
data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
data/lib/kotoshu/spellchecker.rb +298 -0
data/lib/kotoshu/string_metrics.rb +153 -0
data/lib/kotoshu/suggestions/context.rb +55 -0
data/lib/kotoshu/suggestions/generator.rb +175 -0
data/lib/kotoshu/suggestions/pipeline.rb +135 -0
data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
data/lib/kotoshu/suggestions/suggestion.rb +174 -0
data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
data/lib/kotoshu/version.rb +5 -0
data/lib/kotoshu.rb +493 -0
data/script/validate_all_dictionaries.rb +444 -0
data/sig/kotoshu.rbs +4 -0
data/test_oop.rb +79 -0
metadata +298 -0

data/lib/kotoshu/languages/ru/language.rb ADDED Viewed

@@ -0,0 +1,404 @@
+# frozen_string_literal: true
+require_relative '../../readers/lookup_builder'
+require_relative '../../components/spell_checker'
+require_relative '../../components/pos_tagger'
+require_relative '../../language/normalizer/base'
+module Kotoshu
+  module Languages
+    # Russian language implementation.
+    #
+    # Supports multiple dialects: ru-RU, ru-BY, ru-KZ, ru-KG, ru-MD
+    #
+    # Full Hunspell integration with spell checking, POS tagging, and grammar rules
+    # specifically handling Russian Cyrillic script and case system.
+    class Russian < Language::Base
+      # Russian spell checker with Hunspell integration.
+      class SpellChecker < Components::SpellChecker
+        attr_reader :aff_path, :dic_path, :script
+        def initialize(aff_path:, dic_path:, script: :cyrillic, encoding: 'UTF-8')
+          @aff_path = aff_path
+          @dic_path = dic_path
+          @script = script
+          @encoding = encoding
+          @lookuper = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script).build
+        end
+        def check(word)
+          return { found: false, stem: nil, flags: [] } if word.nil? || word.empty?
+          first_form = @lookuper.good_forms(word).first
+          if first_form
+            { found: true, stem: first_form.stem || word, flags: first_form.flags&.to_a || [] }
+          else
+            { found: false, stem: nil, flags: [] }
+          end
+        end
+        def suggest(word, max_suggestions: 10)
+          return [] if word.nil? || word.empty?
+          first_form = @lookuper.good_forms(word).first
+          return [] if first_form
+          generate_suggestions(word, max_suggestions).take(max_suggestions)
+        end
+        def correct?(word)
+          check(word)[:found]
+        end
+        def lookuper
+          @lookuper
+        end
+        private
+        def calculate_distance(a, b)
+          return a.length if b.empty?
+          return b.length if a.empty?
+          matrix = Array.new(a.length + 1) { |i| [i] + [0] * b.length }
+          (1..b.length).each { |j| matrix[0][j] = j }
+          (1..a.length).each do |i|
+            (1..b.length).each do |j|
+              cost = a[i - 1] == b[j - 1] ? 0 : 1
+              matrix[i][j] = [matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + cost].min
+            end
+          end
+          matrix[a.length][b.length]
+        end
+        def calculate_score(original, suggestion, rank)
+          distance = calculate_distance(original, suggestion)
+          max_len = [original.length, suggestion.length].max
+          distance_score = 1.0 - (distance.to_f / max_len)
+          rank_penalty = rank * 0.05
+          [distance_score - rank_penalty, 0.0].max
+        end
+        def generate_suggestions(word, max_suggestions)
+          variations = []
+          # Russian character substitutions (common Cyrillic errors)
+          cyrillic_substitutions = {
+            'а' => %w[о и е я],
+            'о' => %w[а е и],
+            'е' => %w[и э а],
+            'и' => %w[е е],
+            'п' => %w[т к],
+            'т' => %w[п д],
+            'к' => %w[г х],
+            'н' => %w[т п],
+            'с' => %w[з ш],
+            'ш' => %w[с щ],
+            'щ' => %w[ш],
+            'б' => %w[п в],
+            'в' => %w[б ф],
+            'ф' => %w[в в],
+            'д' => %w[т],
+            'г' => %w[к х],
+            'х' => %w[г к],
+            'я' => %w[а е],
+            'ю' => %w[у],
+            'ё' => %w[е],
+            'ж' => %w[з ш],
+            'з' => %w[с ж],
+            'ь' => %w[ъ],
+            'ъ' => %w[ь],
+          }
+          word.chars.each_with_index do |char, i|
+            next unless cyrillic_substitutions.key?(char.downcase)
+            cyrillic_substitutions[char.downcase].each do |sub|
+              substituted = word.dup
+              substituted[i] = sub
+              variations << substituted if @lookuper.good_forms(substituted).first
+            end
+          end
+          # Doubled and deleted letters
+          word.chars.each_with_index do |char, i|
+            next if i == 0
+            doubled = word.dup
+            doubled.insert(i, char)
+            variations << doubled if @lookuper.good_forms(doubled).first
+          end
+          (0...word.length).each do |i|
+            deleted = word.dup
+            deleted.slice!(i)
+            next if deleted.empty?
+            variations << deleted if @lookuper.good_forms(deleted).first
+          end
+          variations.uniq!
+          variations.map do |suggestion|
+            { word: suggestion, distance: calculate_distance(word, suggestion), score: calculate_score(word, suggestion, 0) }
+          end.sort_by { |s| s[:distance] }
+        end
+      end
+      # Russian tokenizer.
+      #
+      # Adds no behaviour of its own; everything is inherited from
+      # Language::Tokenizer::RussianTokenizer. The subclass lets callers refer
+      # to the tokenizer as Russian::Tokenizer.
+      # NOTE(review): abbreviation handling is presumably implemented in the
+      # parent class — confirm there.
+      class Tokenizer < Language::Tokenizer::RussianTokenizer
+      end
+      # Russian POS tagger.
+      class POSTagger < Components::PosTagger
+        FLAG_TO_POS = {
+          'N' => 'NOUN', 'NN' => 'NOUN', 'NNS' => 'NOUN', 'NNP' => 'NOUN_PROPER',
+          'S' => 'NOUN', 'Sub' => 'NOUN',
+          'V' => 'VERB', 'VB' => 'VERB', 'VBD' => 'VERB', 'VBG' => 'VERB', 'VBN' => 'VERB',
+          'VBP' => 'VERB', 'VBZ' => 'VERB',
+          'A' => 'ADJ', 'JJ' => 'ADJ', 'JJR' => 'ADJ', 'JJS' => 'ADJ',
+          'Adj' => 'ADJ',
+          'R' => 'ADV', 'RB' => 'ADV', 'RBR' => 'ADV', 'RBS' => 'ADV',
+          'Adv' => 'ADV',
+          'D' => 'DET', 'DT' => 'DET', 'PDT' => 'DET',
+          'P' => 'PRON', 'PP' => 'PRON', 'PRP' => 'PRON', 'PRP$' => 'PRON_POSS',
+          'WP' => 'PRON', 'WP$' => 'PRON_POSS',
+          'Pro' => 'PRON',
+          'I' => 'PREP', 'IN' => 'PREP',
+          'Präp' => 'PREP',
+          'C' => 'CONJ', 'CC' => 'CONJ',
+          'Conj' => 'CONJ',
+          'U' => 'PART', 'RP' => 'PART',
+          'Pt' => 'PART',
+          'INTJ' => 'INTJ', 'UH' => 'INTJ',
+          'Int' => 'INTJ',
+          'CD' => 'NUM',
+          'FW' => 'X',
+          'PUNCT' => 'PUNCT', '.' => 'PUNCT', ',' => 'PUNCT', '!' => 'PUNCT',
+          '?' => 'PUNCT', ';' => 'PUNCT', ':' => 'PUNCT'
+        }.freeze
+        attr_reader :aff_path, :dic_path, :script
+        def initialize(aff_path:, dic_path:, script: :cyrillic, encoding: 'UTF-8', flag_mapping: FLAG_TO_POS)
+          @aff_path = aff_path
+          @dic_path = dic_path
+          @script = script
+          @encoding = encoding
+          @flag_mapping = flag_mapping
+          @lookuper = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script).build
+          @lookup_cache = {}
+        end
+        def tag(tokens)
+          return [] if tokens.nil? || tokens.empty?
+          tokens.map do |token|
+            word = token[:token]
+            if word.nil? || word.empty?
+              token.merge(pos_tag: nil, lemma: nil)
+            else
+              lookup_result = lookup_with_pos(word)
+              token.merge(pos_tag: lookup_result[:pos_tag], lemma: lookup_result[:lemma] || word)
+            end
+          end
+        end
+        def flag_mapping
+          @flag_mapping
+        end
+        def flag_mapping=(mapping)
+          @flag_mapping = mapping
+        end
+        def clear_cache
+          @lookup_cache.clear
+        end
+        private
+        def lookup_with_pos(word)
+          return { pos_tag: nil, lemma: nil } if word.nil? || word.empty?
+          return @lookup_cache[word] if @lookup_cache.key?(word)
+          first_form = @lookuper.good_forms(word).first
+          pos_tag = derive_pos_tag(first_form)
+          cache_result = { pos_tag: pos_tag, lemma: first_form&.stem }
+          @lookup_cache[word] = cache_result
+          cache_result
+        end
+        def derive_pos_tag(result)
+          return nil unless result
+          flags = result.flags&.to_a || []
+          return guess_pos_from_affix(result) if flags.empty?
+          flags.each do |flag|
+            pos_tag = flag_to_pos(flag)
+            return pos_tag if pos_tag
+          end
+          guess_pos_from_affix(result)
+        end
+        def flag_to_pos(flag)
+          return @flag_mapping[flag] if @flag_mapping.key?(flag)
+          first_char = flag[0]
+          @flag_mapping[first_char]
+        end
+        def guess_pos_from_affix(result)
+          suffix = result.suffix
+          return guess_pos_from_suffix(suffix) if suffix
+          nil
+        end
+        def guess_pos_from_suffix(suffix)
+          # Russian suffix patterns
+          return 'VERB' if suffix.match?(/^(ть|ти|чь|л|ла|ло|ли|ют|ют|ешь|ишь|им|ите|ат|ят)$/)
+          return 'ADV' if suffix.match?(/^(о|е|и)$/)
+          return 'NOUN' if suffix.match?(/^(ость|ение|ание|ка|ник|чик|щик|ство|тель|ение|ство)$/)
+          return 'ADJ' if suffix.match?(/^(ый|ий|ой|ое|ая|ое|ые|их|ем|им|ом|ого|ому)$/)
+          nil
+        end
+      end
+      # Russian grammar rules module.
+      module GrammarRules
+        class Rule
+          attr_reader :id, :name, :description
+          def initialize(id, name, description)
+            @id = id
+            @name = name
+            @description = description
+          end
+          def check(tokens)
+            raise NotImplementedError, "#{self.class} must implement #check"
+          end
+        end
+        # Rule: Verbal aspect consistency
+        class VerbalAspectRule < Rule
+          IMPERFECTIVE_SUFFIXES = %w[ать ять].freeze
+          PERFECTIVE_SUFFIXES = %w[ить по].freeze
+          def initialize
+            super('RU_VERBAL_ASPECT', 'Verbal Aspect', 'Russian verbs should use consistent aspect (imperfective/perfective).')
+          end
+          def check(tokens)
+            # Simplified implementation
+            []
+          end
+        end
+        # Rule: Case agreement
+        class CaseAgreementRule < Rule
+          def initialize
+            super('RU_CASE_AGREEMENT', 'Case Agreement', 'Nouns, adjectives, and verbs must agree in case.')
+          end
+          def check(tokens)
+            # Simplified implementation
+            []
+          end
+        end
+        class RuleRegistry
+          class << self
+            def default_rules
+              [VerbalAspectRule.new, CaseAgreementRule.new]
+            end
+            def get_rule(id)
+              default_rules.find { |rule| rule.id == id }
+            end
+          end
+        end
+      end
+      # Registration: the bare language code plus each supported regional tag.
+      # NOTE(review): `register` is not defined in this file; presumably it is
+      # a class-level macro inherited from Language::Base — confirm there.
+      register "ru"
+      register "ru-RU"
+      register "ru-BY"
+      register "ru-KZ"
+      register "ru-KG"
+      register "ru-MD"
+      # Hunspell .aff/.dic pair per language tag. Only ru-RU is listed; every
+      # other tag falls back to it in #resolve_hunspell_paths.
+      # NOTE(review): these are relative paths into the spec fixtures
+      # directory, so they presumably only resolve when the process runs from
+      # a source checkout — confirm how installed gems locate dictionaries.
+      HUNSPELL_DICTIONARIES = {
+        'ru-RU' => {
+          aff: 'spec/integrational/fixtures/ru_RU.aff',
+          dic: 'spec/integrational/fixtures/ru_RU.dic'
+        }
+      }.freeze
+      # Display names used by #description, keyed by the upcased region code
+      # that #extract_region_code produces.
+      VARIANT_NAMES = {
+        'RU' => 'Russian',
+        'BY' => 'Belarusian',
+        'KZ' => 'Kazakh',
+        'KG' => 'Kyrgyz',
+        'MD' => 'Moldovan'
+      }.freeze
+      # Build a Russian language instance.
+      #
+      # @param code [String] language tag such as "ru" or "ru-BY"
+      # @param name [String] display name
+      # @param variant [String, nil] region code; when nil it is derived from
+      #   the part of +code+ after the first "-" (upcased), and stays nil for
+      #   a code without "-"
+      def initialize(code: "ru", name: "Russian", variant: nil)
+        variant ||= extract_region_code(code)
+        super(code: code, name: name, variant: variant)
+        # Tags without their own entry fall back to the ru-RU dictionary pair.
+        @hunspell_paths = resolve_hunspell_paths(code)
+      end
+      def description
+        return name unless variant
+        variant_name = VARIANT_NAMES[variant] || variant
+        "#{name} (#{variant_name})"
+      end
+      # Memoized tokenizer: built on the first call and reused afterwards.
+      #
+      # @return [Tokenizer]
+      def tokenizer
+        @tokenizer ||= Tokenizer.new
+      end
+      # Memoized normalizer: built on the first call and reused afterwards.
+      # Uses the base normalizer class; this file defines no Russian-specific one.
+      #
+      # @return [Language::Normalizer::Base]
+      def normalizer
+        @normalizer ||= Language::Normalizer::Base.new
+      end
+      # Dictionary backend class for this language.
+      #
+      # @return [Class] Dictionary::UnixWords
+      def dictionary_class
+        Dictionary::UnixWords
+      end
+      def default_dictionary_paths
+        case code
+        when "ru-RU"
+          ["/usr/share/dict/russian"]
+        else
+          ["/usr/share/dict/words"]
+        end
+      end
+      # Writing system of the language.
+      #
+      # @return [Symbol] always :cyrillic
+      def script_type
+        :cyrillic
+      end
+      # Build a new spell checker from the Hunspell paths resolved in
+      # #initialize. A fresh instance is returned on every call.
+      #
+      # @return [SpellChecker]
+      def create_spell_checker
+        SpellChecker.new(
+          aff_path: @hunspell_paths[:aff],
+          dic_path: @hunspell_paths[:dic],
+          script: :cyrillic
+        )
+      end
+      # Build a new tokenizer. Unlike #tokenizer this is not memoized: each
+      # call returns a fresh instance.
+      #
+      # @return [Tokenizer]
+      def create_tokenizer
+        Tokenizer.new
+      end
+      # Build a new POS tagger from the Hunspell paths resolved in #initialize,
+      # using the tagger's default flag table. A fresh instance is returned on
+      # every call.
+      #
+      # @return [POSTagger]
+      def create_pos_tagger
+        POSTagger.new(
+          aff_path: @hunspell_paths[:aff],
+          dic_path: @hunspell_paths[:dic],
+          script: :cyrillic,
+          flag_mapping: POSTagger::FLAG_TO_POS
+        )
+      end
+      private
+      def extract_region_code(code)
+        return nil unless code.include?("-")
+        code.split("-", 2).last.upcase
+      end
+      def resolve_hunspell_paths(code)
+        HUNSPELL_DICTIONARIES[code] || HUNSPELL_DICTIONARIES['ru-RU']
+      end
+    end
+  end
+end

data/lib/kotoshu/languages.rb ADDED Viewed

@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+# Load all language-specific modules
+require_relative 'languages/en/language'
+require_relative 'languages/fr/language'
+require_relative 'languages/de/language'
+require_relative 'languages/ja/language'
+require_relative 'languages/pt/language'
+require_relative 'languages/ru/language'
+require_relative 'languages/es/language'
+module Kotoshu
+  # Languages module for language-specific implementations.
+  #
+  # Each language has its own namespace under this module,
+  # allowing for clean organization and scalability.
+  #
+  # The module body here is intentionally empty: the language classes are
+  # defined by the files required at the top of this file, which reopen this
+  # module.
+  #
+  # @example English components
+  #   Kotoshu::Languages::English::SpellChecker
+  #   Kotoshu::Languages::English::Tokenizer
+  #   Kotoshu::Languages::English::POSTagger
+  #   Kotoshu::Languages::English::GrammarRules
+  #
+  # @example French components
+  #   Kotoshu::Languages::French::Tokenizer
+  #
+  # @example German components
+  #   Kotoshu::Languages::German::Tokenizer
+  #
+  # @example Japanese components
+  #   Kotoshu::Languages::Japanese::Tokenizer
+  #
+  # @example Portuguese components
+  #   Kotoshu::Languages::Portuguese::Tokenizer
+  #
+  # @example Russian components
+  #   Kotoshu::Languages::Russian::Tokenizer
+  #
+  # @example Spanish components
+  #   Kotoshu::Languages::Spanish::Tokenizer
+  module Languages
+  end
+end

data/lib/kotoshu/metrics_collector.rb ADDED Viewed

@@ -0,0 +1,222 @@
+# frozen_string_literal: true
+module Kotoshu
+  module Metrics
+    # Thread-safe metrics collector.
+    #
+    # Tracks performance metrics for spellchecking operations:
+    # - Lookup counts and timing
+    # - Cache hit/miss rates
+    # - Suggestion generation stats
+    #
+    # @example
+    #   collector = Kotoshu::Metrics::Collector.new
+    #   collector.record_lookup("hello", result: true, time: 0.5)
+    #   collector.stats
+    #   # => { lookups: 1, correct_lookups: 1, avg_lookup_time: 0.5, ... }
+    class Collector
+      # Initialize a new collector.
+      def initialize
+        @mutex = Mutex.new
+        reset
+      end
+      # Record a lookup operation.
+      #
+      # @param word [String] The word being looked up
+      # @param result [Boolean] The lookup result
+      # @param time [Float] Time taken in milliseconds
+      def record_lookup(_word, result:, time:)
+        @mutex.synchronize do
+          @metrics[:lookups] += 1
+          @metrics[:correct_lookups] += 1 if result
+          @metrics[:misspelled_lookups] += 1 unless result
+          @metrics[:lookup_times] << time
+        end
+      end
+      # Record a cache operation.
+      #
+      # @param cache_type [String] Type of cache (lookup, suggestion)
+      # @param hit [Boolean] True if cache hit
+      def record_cache(cache_type, hit:)
+        @mutex.synchronize do
+          key = "#{cache_type}_cache_hits".to_sym
+          miss_key = "#{cache_type}_cache_misses".to_sym
+          if hit
+            @metrics[key] += 1
+          else
+            @metrics[miss_key] += 1
+          end
+        end
+      end
+      # Record suggestion generation.
+      #
+      # @param word [String] The input word
+      # @param count [Integer] Number of suggestions generated
+      # @param time [Float] Time taken in milliseconds
+      def record_suggestions(_word, count:, time:)
+        @mutex.synchronize do
+          @metrics[:suggestion_requests] += 1
+          @metrics[:suggestions_generated] += count
+          @metrics[:suggestion_times] << time
+        end
+      end
+      # Get current metrics statistics.
+      #
+      # @return [Hash] Current statistics with computed averages
+      def stats
+        @mutex.synchronize do
+          calculate_stats
+        end
+      end
+      # Reset all metrics.
+      def reset
+        @mutex.synchronize do
+          @metrics = {
+            lookups: 0,
+            correct_lookups: 0,
+            misspelled_lookups: 0,
+            lookup_times: [],
+            lookup_cache_hits: 0,
+            lookup_cache_misses: 0,
+            suggestion_cache_hits: 0,
+            suggestion_cache_misses: 0,
+            suggestion_requests: 0,
+            suggestions_generated: 0,
+            suggestion_times: [],
+            started_at: Time.now
+          }
+        end
+      end
+      # Export metrics in StatsD format.
+      #
+      # @return [String] StatsD protocol lines
+      def to_statsd
+        s = stats
+        prefix = "kotoshu"
+        lines = []
+        lines << "#{prefix}.lookups:#{s[:lookups]}|c"
+        lines << "#{prefix}.correct_lookups:#{s[:correct_lookups]}|c"
+        lines << "#{prefix}.misspelled_lookups:#{s[:misspelled_lookups]}|c"
+        lines << "#{prefix}.avg_lookup_time:#{s[:avg_lookup_time]}|ms"
+        lines << "#{prefix}.lookup_cache_hits:#{s[:lookup_cache_hits]}|c"
+        lines << "#{prefix}.lookup_cache_misses:#{s[:lookup_cache_misses]}|c"
+        lines << "#{prefix}.suggestion_requests:#{s[:suggestion_requests]}|c"
+        lines << "#{prefix}.suggestions_generated:#{s[:suggestions_generated]}|c"
+        lines << "#{prefix}.avg_suggestion_time:#{s[:avg_suggestion_time]}|ms"
+        lines.join("\n")
+      end
+      # Export metrics in Prometheus exposition format.
+      #
+      # @return [String] Prometheus format
+      def to_prometheus
+        s = stats
+        lines = []
+        lines << "# HELP kotoshu_lookups Total number of word lookups"
+        lines << "# TYPE kotoshu_lookups counter"
+        lines << "kotoshu_lookups #{s[:lookups]}"
+        lines << "# HELP kotoshu_correct_lookups Number of correct word lookups"
+        lines << "# TYPE kotoshu_correct_lookups counter"
+        lines << "kotoshu_correct_lookups #{s[:correct_lookups]}"
+        lines << "# HELP kotoshu_misspelled_lookups Number of misspelled word lookups"
+        lines << "# TYPE kotoshu_misspelled_lookups counter"
+        lines << "kotoshu_misspelled_lookups #{s[:misspelled_lookups]}"
+        lines << "# HELP kotoshu_avg_lookup_time Average lookup time in milliseconds"
+        lines << "# TYPE kotoshu_avg_lookup_time gauge"
+        lines << "kotoshu_avg_lookup_time #{s[:avg_lookup_time]}"
+        lines << "# HELP kotoshu_lookup_cache_hits Number of lookup cache hits"
+        lines << "# TYPE kotoshu_lookup_cache_hits counter"
+        lines << "kotoshu_lookup_cache_hits #{s[:lookup_cache_hits]}"
+        lines << "# HELP kotoshu_lookup_cache_misses Number of lookup cache misses"
+        lines << "# TYPE kotoshu_lookup_cache_misses counter"
+        lines << "kotoshu_lookup_cache_misses #{s[:lookup_cache_misses]}"
+        lines << "# HELP kotoshu_suggestion_requests Number of suggestion requests"
+        lines << "# TYPE kotoshu_suggestion_requests counter"
+        lines << "kotoshu_suggestion_requests #{s[:suggestion_requests]}"
+        lines << "# HELP kotoshu_suggestions_generated Total number of suggestions generated"
+        lines << "# TYPE kotoshu_suggestions_generated counter"
+        lines << "kotoshu_suggestions_generated #{s[:suggestions_generated]}"
+        lines << "# HELP kotoshu_avg_suggestion_time Average suggestion generation time in milliseconds"
+        lines << "# TYPE kotoshu_avg_suggestion_time gauge"
+        lines << "kotoshu_avg_suggestion_time #{s[:avg_suggestion_time]}"
+        lines.join("\n")
+      end
+      private
+      # Calculate computed statistics.
+      #
+      # @return [Hash] Statistics with computed values
+      def calculate_stats
+        lookup_times = @metrics[:lookup_times]
+        suggestion_times = @metrics[:suggestion_times]
+        avg_lookup = lookup_times.empty? ? 0 : lookup_times.sum / lookup_times.size
+        avg_suggestion = suggestion_times.empty? ? 0 : suggestion_times.sum / suggestion_times.size
+        lookup_hit_rate = calculate_hit_rate(@metrics[:lookup_cache_hits], @metrics[:lookup_cache_misses])
+        suggestion_hit_rate = calculate_hit_rate(@metrics[:suggestion_cache_hits], @metrics[:suggestion_cache_misses])
+        {
+          lookups: @metrics[:lookups],
+          correct_lookups: @metrics[:correct_lookups],
+          misspelled_lookups: @metrics[:misspelled_lookups],
+          avg_lookup_time: avg_lookup.round(3),
+          lookup_cache_hits: @metrics[:lookup_cache_hits],
+          lookup_cache_misses: @metrics[:lookup_cache_misses],
+          lookup_cache_hit_rate: lookup_hit_rate,
+          suggestion_cache_hits: @metrics[:suggestion_cache_hits],
+          suggestion_cache_misses: @metrics[:suggestion_cache_misses],
+          suggestion_cache_hit_rate: suggestion_hit_rate,
+          suggestion_requests: @metrics[:suggestion_requests],
+          suggestions_generated: @metrics[:suggestions_generated],
+          avg_suggestions_per_request: if @metrics[:suggestion_requests].positive?
+                                         (@metrics[:suggestions_generated].to_f / @metrics[:suggestion_requests]).round(2)
+                                       else
+                                         0
+                                       end,
+          avg_suggestion_time: avg_suggestion.round(3),
+          uptime_seconds: (Time.now - @metrics[:started_at]).round(2)
+        }
+      end
+      # Calculate cache hit rate.
+      #
+      # @param hits [Integer] Number of hits
+      # @param misses [Integer] Number of misses
+      # @return [Float] Hit rate (0-1)
+      def calculate_hit_rate(hits, misses)
+        total = hits + misses
+        total.positive? ? (hits.to_f / total).round(4) : 0.0
+      end
+    end
+  end
+end