RubyGems - kotoshu - Versions diffs - 0.3.0 - Mend

kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (210) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.rubocop.yml +18 -0
data/CHANGELOG.md +182 -0
data/CLAUDE.md +172 -0
data/CODE_OF_CONDUCT.md +132 -0
data/LICENSE +31 -0
data/README.adoc +955 -0
data/Rakefile +12 -0
data/SECURITY.md +93 -0
data/examples/01_basic_word_checking.rb +38 -0
data/examples/02_text_document_checking.rb +77 -0
data/examples/03_dictionary_backends.rb +137 -0
data/examples/04_trie_data_structure.rb +146 -0
data/examples/05_suggestion_algorithms.rb +239 -0
data/examples/06_configuration_advanced.rb +287 -0
data/examples/07_multi_language_dictionaries.rb +278 -0
data/exe/kotoshu +6 -0
data/lib/kotoshu/algorithms/capitalization.rb +276 -0
data/lib/kotoshu/algorithms/lookup.rb +876 -0
data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
data/lib/kotoshu/algorithms/permutations.rb +283 -0
data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
data/lib/kotoshu/algorithms/suggest.rb +575 -0
data/lib/kotoshu/algorithms.rb +14 -0
data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
data/lib/kotoshu/cache/base_cache.rb +596 -0
data/lib/kotoshu/cache/cache.rb +91 -0
data/lib/kotoshu/cache/frequency_cache.rb +224 -0
data/lib/kotoshu/cache/language_cache.rb +454 -0
data/lib/kotoshu/cache/lookup_cache.rb +166 -0
data/lib/kotoshu/cache/model_cache.rb +513 -0
data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
data/lib/kotoshu/cache.rb +40 -0
data/lib/kotoshu/cli/auto_setup.rb +71 -0
data/lib/kotoshu/cli/batch_reporter.rb +315 -0
data/lib/kotoshu/cli/cache_command.rb +356 -0
data/lib/kotoshu/cli/display_formatter.rb +431 -0
data/lib/kotoshu/cli/errors.rb +36 -0
data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
data/lib/kotoshu/cli/language_resolver.rb +91 -0
data/lib/kotoshu/cli/navigation_manager.rb +272 -0
data/lib/kotoshu/cli/progress_reporter.rb +114 -0
data/lib/kotoshu/cli/status_report.rb +130 -0
data/lib/kotoshu/cli.rb +627 -0
data/lib/kotoshu/commands/cache_command.rb +424 -0
data/lib/kotoshu/commands/check_command.rb +312 -0
data/lib/kotoshu/commands/model_command.rb +295 -0
data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
data/lib/kotoshu/components/pos_tagger.rb +98 -0
data/lib/kotoshu/components/spell_checker.rb +73 -0
data/lib/kotoshu/components/synthesizer.rb +60 -0
data/lib/kotoshu/components/tokenizer.rb +58 -0
data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
data/lib/kotoshu/configuration/builder.rb +209 -0
data/lib/kotoshu/configuration/resolver.rb +124 -0
data/lib/kotoshu/configuration.rb +702 -0
data/lib/kotoshu/core/exceptions.rb +165 -0
data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
data/lib/kotoshu/core/models/affix_rule.rb +260 -0
data/lib/kotoshu/core/models/result/document_result.rb +263 -0
data/lib/kotoshu/core/models/result/word_result.rb +203 -0
data/lib/kotoshu/core/models/word.rb +142 -0
data/lib/kotoshu/core/trie/builder.rb +119 -0
data/lib/kotoshu/core/trie/node.rb +94 -0
data/lib/kotoshu/core/trie/trie.rb +249 -0
data/lib/kotoshu/core.rb +28 -0
data/lib/kotoshu/data/common_words/de.yml +1800 -0
data/lib/kotoshu/data/common_words/en.yml +1215 -0
data/lib/kotoshu/data/common_words/es.yml +750 -0
data/lib/kotoshu/data/common_words/fr.yml +1015 -0
data/lib/kotoshu/data/common_words/pt.yml +870 -0
data/lib/kotoshu/data/common_words/ru.yml +484 -0
data/lib/kotoshu/data/common_words_loader.rb +152 -0
data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
data/lib/kotoshu/debug_logger.rb +146 -0
data/lib/kotoshu/debug_mode.rb +134 -0
data/lib/kotoshu/defaults.rb +86 -0
data/lib/kotoshu/dictionaries/catalog.rb +817 -0
data/lib/kotoshu/dictionary/base.rb +237 -0
data/lib/kotoshu/dictionary/cspell.rb +254 -0
data/lib/kotoshu/dictionary/custom.rb +224 -0
data/lib/kotoshu/dictionary/hunspell.rb +526 -0
data/lib/kotoshu/dictionary/plain_text.rb +282 -0
data/lib/kotoshu/dictionary/repository.rb +248 -0
data/lib/kotoshu/dictionary/unified.rb +260 -0
data/lib/kotoshu/dictionary/unix_words.rb +218 -0
data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
data/lib/kotoshu/documents/document.rb +229 -0
data/lib/kotoshu/documents/location.rb +139 -0
data/lib/kotoshu/documents/markdown_document.rb +389 -0
data/lib/kotoshu/documents/plain_text_document.rb +147 -0
data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
data/lib/kotoshu/embeddings/protocol.rb +83 -0
data/lib/kotoshu/embeddings/protocols.rb +17 -0
data/lib/kotoshu/embeddings/registry.rb +182 -0
data/lib/kotoshu/embeddings/search.rb +192 -0
data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
data/lib/kotoshu/embeddings.rb +97 -0
data/lib/kotoshu/fluent_checker.rb +91 -0
data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
data/lib/kotoshu/grammar/rule.rb +95 -0
data/lib/kotoshu/grammar/rule_engine.rb +111 -0
data/lib/kotoshu/grammar/rule_loader.rb +31 -0
data/lib/kotoshu/grammar.rb +18 -0
data/lib/kotoshu/integrity/audit_log.rb +88 -0
data/lib/kotoshu/integrity/manifest.rb +117 -0
data/lib/kotoshu/integrity/net_http.rb +46 -0
data/lib/kotoshu/integrity.rb +25 -0
data/lib/kotoshu/keyboard/layout.rb +115 -0
data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
data/lib/kotoshu/keyboard/registry.rb +146 -0
data/lib/kotoshu/keyboard.rb +60 -0
data/lib/kotoshu/language/detector.rb +242 -0
data/lib/kotoshu/language/identifier.rb +378 -0
data/lib/kotoshu/language/languages/base.rb +256 -0
data/lib/kotoshu/language/normalizer/base.rb +137 -0
data/lib/kotoshu/language/registry.rb +147 -0
data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
data/lib/kotoshu/language/tokenizer/base.rb +170 -0
data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
data/lib/kotoshu/language.rb +99 -0
data/lib/kotoshu/languages/de/language.rb +546 -0
data/lib/kotoshu/languages/en/language.rb +448 -0
data/lib/kotoshu/languages/es/language.rb +459 -0
data/lib/kotoshu/languages/fr/language.rb +493 -0
data/lib/kotoshu/languages/ja/language.rb +477 -0
data/lib/kotoshu/languages/pt/language.rb +423 -0
data/lib/kotoshu/languages/ru/language.rb +404 -0
data/lib/kotoshu/languages.rb +43 -0
data/lib/kotoshu/metrics_collector.rb +222 -0
data/lib/kotoshu/metrics_module.rb +110 -0
data/lib/kotoshu/models/context.rb +119 -0
data/lib/kotoshu/models/embedding_model.rb +182 -0
data/lib/kotoshu/models/fasttext_model.rb +220 -0
data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
data/lib/kotoshu/models/onnx_model.rb +333 -0
data/lib/kotoshu/models/semantic_error.rb +165 -0
data/lib/kotoshu/models/suggestion.rb +106 -0
data/lib/kotoshu/models/word_embedding.rb +107 -0
data/lib/kotoshu/paths.rb +53 -0
data/lib/kotoshu/personal_dictionary.rb +94 -0
data/lib/kotoshu/plugins/plugin.rb +61 -0
data/lib/kotoshu/plugins/registry.rb +120 -0
data/lib/kotoshu/project_config.rb +76 -0
data/lib/kotoshu/readers/aff_data.rb +356 -0
data/lib/kotoshu/readers/aff_reader.rb +375 -0
data/lib/kotoshu/readers/condition_checker.rb +142 -0
data/lib/kotoshu/readers/dic_reader.rb +118 -0
data/lib/kotoshu/readers/file_reader.rb +347 -0
data/lib/kotoshu/readers/lookup_builder.rb +299 -0
data/lib/kotoshu/readers/readers.rb +6 -0
data/lib/kotoshu/readers.rb +9 -0
data/lib/kotoshu/resource_bundle.rb +30 -0
data/lib/kotoshu/resource_manager.rb +295 -0
data/lib/kotoshu/results/result.rb +165 -0
data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
data/lib/kotoshu/source_registry.rb +74 -0
data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
data/lib/kotoshu/spellchecker.rb +298 -0
data/lib/kotoshu/string_metrics.rb +153 -0
data/lib/kotoshu/suggestions/context.rb +55 -0
data/lib/kotoshu/suggestions/generator.rb +175 -0
data/lib/kotoshu/suggestions/pipeline.rb +135 -0
data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
data/lib/kotoshu/suggestions/suggestion.rb +174 -0
data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
data/lib/kotoshu/version.rb +5 -0
data/lib/kotoshu.rb +493 -0
data/script/validate_all_dictionaries.rb +444 -0
data/sig/kotoshu.rbs +4 -0
data/test_oop.rb +79 -0
metadata +298 -0

data/lib/kotoshu/analyzers/semantic_analyzer.rb ADDED Viewed

@@ -0,0 +1,295 @@
+# frozen_string_literal: true
+require 'digest'
+require_relative '../models/embedding_model'
+require_relative '../models/semantic_error'
+require_relative '../models/context'
+require_relative '../documents/document'
+module Kotoshu
+  module Analyzers
+    # Unified semantic error analyzer.
+    #
+    # Uses word embeddings for context-aware error detection and suggestions.
+    # Provides unified semantic analysis without artificial spelling/grammar split.
+    #
+    # @example Analyzing a document
+    #   model = FastTextModel.from_github('en')
+    #   analyzer = SemanticAnalyzer.new(model)
+    #   errors = analyzer.analyze(document)
+    #
+    # @example Checking a single word
+    #   suggestions = analyzer.suggest_corrections('helo', context: document.context_for(location))
+    class SemanticAnalyzer
+      # Top-suggestion similarity above which #calculate_confidence reports 1.0
+      HIGH_CONFIDENCE_THRESHOLD = 0.85
+      # Top-suggestion similarity above which #calculate_confidence reports 0.7
+      MEDIUM_CONFIDENCE_THRESHOLD = 0.70
+      # Default floor for neighbor similarity; lower-scoring neighbors are dropped
+      MIN_SIMILARITY = 0.50
+      # Default cap on the number of suggestions returned per word
+      DEFAULT_MAX_SUGGESTIONS = 5
+      # Read access to the embedding model and the suggestion cap given to #initialize
+      attr_reader :model, :max_suggestions
+      # Create a new semantic analyzer.
+      #
+      # @param model [EmbeddingModel] The embedding model to use
+      # @param max_suggestions [Integer] Maximum suggestions per error
+      # @param min_similarity [Float] Minimum similarity threshold
+      def initialize(model, max_suggestions: DEFAULT_MAX_SUGGESTIONS, min_similarity: MIN_SIMILARITY)
+        raise ArgumentError, "Model must be an EmbeddingModel" unless model.is_a?(Models::EmbeddingModel)
+        @model = model
+        @max_suggestions = max_suggestions
+        @min_similarity = min_similarity
+      end
+      # Analyze a document for semantic errors.
+      #
+      # @param document [Document] The document to analyze
+      # @return [Array<Models::SemanticError>] List of errors found
+      def analyze(document)
+        errors = []
+        # Get text nodes from document
+        document.text_nodes.each do |text_node|
+          # Tokenize and check each word
+          words = tokenize_words(text_node.text)
+          words.each do |word|
+            next if valid_word?(word)
+            # Detect error
+            error = detect_error(
+              word: word,
+              location: text_node.location,
+              context: document.context_for(text_node.location)
+            )
+            errors << error if error
+          end
+        end
+        # Sort errors by location and confidence
+        errors.sort
+      end
+      # Detect semantic error for a single word.
+      #
+      # @param word [String] The word to check
+      # @param location [Location] Error location
+      # @param context [Models::Context, nil] Context around the word
+      # @return [Models::SemanticError, nil] Error object or nil if valid
+      def detect_error(word:, location:, context: nil)
+        return nil if valid_word?(word)
+        # Get suggestions
+        suggestions = suggest_corrections(word, context: context)
+        # Determine error type based on analysis
+        error_type = classify_error(word, suggestions, context)
+        # Calculate confidence based on suggestions
+        confidence = calculate_confidence(suggestions)
+        # Create error object
+        Models::SemanticError.new(
+          id: generate_error_id(word, location),
+          location: location,
+          original: word,
+          suggestions: suggestions,
+          error_type: error_type,
+          confidence: confidence,
+          context: context
+        )
+      end
+      # Suggest corrections for a word.
+      #
+      # @param word [String] The misspelled word
+      # @param context [Models::Context, nil] Context for context-aware suggestions
+      # @return [Array<Models::Suggestion>] Suggested corrections
+      def suggest_corrections(word, context: nil)
+        return [] if word.nil? || word.empty?
+        # Get nearest neighbors from embedding model
+        neighbors = @model.nearest_neighbors(word, k: @max_suggestions * 3)
+        # Filter by minimum similarity
+        neighbors = neighbors.select { |n| n.similarity >= @min_similarity }
+        # If we have context, rank by contextual relevance
+        if context && context.respond_to?(:surrounding_words)
+          neighbors = rank_by_context(neighbors, context)
+        end
+        # Convert to Suggestions
+        neighbors.first(@max_suggestions).map do |neighbor|
+          Models::Suggestion.new(
+            word: neighbor.word,
+            confidence: neighbor.similarity,
+            source: :semantic,
+            metadata: {
+              distance: neighbor.distance,
+              similarity: neighbor.similarity
+            }
+          )
+        end
+      end
+      # Check if a word is valid (exists in vocabulary).
+      #
+      # @param word [String] The word to check
+      # @return [Boolean] True if word is valid
+      def valid_word?(word)
+        return false if word.nil? || word.empty?
+        # Skip numbers
+        return true if word =~ /^\d+$/
+        # Skip single characters (likely abbreviations)
+        return true if word.length == 1
+        # Check if word exists in model vocabulary
+        @model.has_word?(word)
+      end
+      # Calculate confidence score for suggestions.
+      #
+      # @param suggestions [Array<Models::Suggestion>] List of suggestions
+      # @return [Float] Confidence score (0.0 to 1.0)
+      def calculate_confidence(suggestions)
+        return 0.0 unless suggestions&.any?
+        # Confidence is based on top suggestion quality
+        top = suggestions.first
+        # High confidence: top suggestion > 0.85 similarity
+        return 1.0 if top.confidence > HIGH_CONFIDENCE_THRESHOLD
+        # Medium confidence: top suggestion > 0.70 similarity
+        return 0.7 if top.confidence > MEDIUM_CONFIDENCE_THRESHOLD
+        # Low confidence: top suggestion < 0.70
+        0.5
+      end
+      private
+      # Tokenize text into words.
+      #
+      # @param text [String] Text to tokenize
+      # @return [Array<String>] Words
+      def tokenize_words(text)
+        return [] unless text
+        # Simple word tokenization (splits on non-word characters)
+        # In full implementation, would use language-specific tokenization
+        text.downcase.scan(/[a-z]+(?:['’-][a-z]+)*/i)
+      end
+      # Classify error type based on word and suggestions.
+      #
+      # @param word [String] The error word
+      # @param suggestions [Array<Models::Suggestion>] Suggestions
+      # @param context [Models::Context, nil] Context
+      # @return [Symbol] Error type
+      def classify_error(word, suggestions, context)
+        return :orthographic if suggestions&.empty?
+        top_suggestion = suggestions.first
+        # Check if it's a capitalization error
+        if word.downcase == top_suggestion.word.downcase
+          return :capitalization
+        end
+        # Check if it's a diacritic/accent error
+        if similar_without_diacritics?(word, top_suggestion.word)
+          return :orthographic
+        end
+        # Check if it's a word choice error (semantic similarity but different word)
+        if suggestions.first&.source == :semantic
+          return :word_choice
+        end
+        # Default to orthographic (spelling)
+        :orthographic
+      end
+      # Check if two words are similar ignoring diacritics.
+      #
+      # @param word1 [String] First word
+      # @param word2 [String] Second word
+      # @return [Boolean] True if similar without diacritics
+      def similar_without_diacritics?(word1, word2)
+        # Remove diacritics and compare
+        normalize_diacritics(word1) == normalize_diacritics(word2)
+      end
+      # Normalize diacritics from a word.
+      #
+      # @param word [String] Word with diacritics
+      # @return [String] Word without diacritics
+      def normalize_diacritics(word)
+        # Simple normalization (transliterate to ASCII)
+        word.encode('ASCII', fallback: ->(c) { c == 'ä' ? 'ae' : c == 'ö' ? 'oe' : c == 'ü' ? 'ue' : c == 'ß' ? 'ss' : c })
+          .downcase
+      end
+      # Rank neighbors by contextual relevance.
+      #
+      # @param neighbors [Array<Models::NearestNeighbor>] Neighbors to rank
+      # @param context [Models::Context] Context for ranking
+      # @return [Array<Models::NearestNeighbor>] Ranked neighbors
+      def rank_by_context(neighbors, context)
+        # Get surrounding words
+        surrounding = context.surrounding_words(3)
+        return neighbors unless surrounding&.any?
+        # Boost neighbors that appear in similar context
+        # In full implementation, would use more sophisticated context modeling
+        neighbors.map do |neighbor|
+          boost = context_boost(neighbor.word, surrounding)
+          # Create boosted neighbor (create new object to avoid mutation)
+          boosted_similarity = [neighbor.similarity + boost, 1.0].min
+          Models::NearestNeighbor.new(
+            word: neighbor.word,
+            similarity: boosted_similarity,
+            embedding: neighbor.embedding
+          )
+        end.sort.reverse
+      end
+      # Calculate context boost for a word.
+      #
+      # @param word [String] Word to boost
+      # @param surrounding [Array<String>] Surrounding words
+      # @return [Float] Boost amount (0.0 to 0.1)
+      def context_boost(word, surrounding)
+        return 0.0 unless surrounding&.any?
+        # Simple boost: if word is semantically similar to surrounding words
+        surrounding.reduce(0.0) do |boost, surrounding_word|
+          sim = @model.similarity(word, surrounding_word)
+          boost + (sim || 0.0) * 0.02  # Small boost for each similar word
+        end
+      end
+      # Generate unique error ID.
+      #
+      # @param word [String] The error word
+      # @param location [Location] Error location
+      # @return [String] Unique ID
+      def generate_error_id(word, location)
+        # Create ID from word and location hash
+        base = "#{word}-#{location}"
+        Digest::SHA256.hexdigest(base)[0...16]
+      end
+    end
+  end
+end