kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
class Spellchecker
|
|
5
|
+
# Fluent checker for chainable configuration.
|
|
6
|
+
#
|
|
7
|
+
# Provides a convenient API for spell checking with method chaining.
|
|
8
|
+
#
|
|
9
|
+
# @example Basic usage
|
|
10
|
+
# result = Kotoshu.fluent.check("Hello wrold")
|
|
11
|
+
#
|
|
12
|
+
# @example With options
|
|
13
|
+
# Kotoshu.fluent
|
|
14
|
+
# .ignore_words(/https?:\/\/\S+/)
|
|
15
|
+
# .max_suggestions(5)
|
|
16
|
+
# .check("Hello wrold")
|
|
17
|
+
class FluentChecker
  # @return [Spellchecker] The underlying spellchecker
  attr_reader :spellchecker

  # @return [Hash] Configuration options accumulated by the chainable setters
  attr_reader :options

  # Create a new fluent checker.
  #
  # @param spellchecker [Spellchecker] The underlying spellchecker
  # @param options [Hash] Configuration options; the hash is stored as-is and
  #   mutated by {#ignore_words} and {#max_suggestions}
  def initialize(spellchecker:, options: {})
    @spellchecker = spellchecker
    @options = options
    @progress_callback = nil
    @error_callback = nil
    @text = nil
    @result = nil
  end

  # Check text for spelling errors and remember the outcome for {#result}.
  #
  # NOTE(review): the collected options and callbacks are only stored by this
  # class; this method hands just the text to the spellchecker. Confirm
  # whether they are meant to be forwarded.
  #
  # @param text [String] Text to check
  # @return [Models::Result::DocumentResult] Check result
  def check(text)
    @text = text
    @result = @spellchecker.check(text)
  end

  # Ignore words matching pattern.
  #
  # @param pattern [Regexp] Pattern to ignore
  # @return [FluentChecker] Self for chaining
  #
  # @example
  #   fluent.ignore_words(/https?:\/\/\S+/)
  def ignore_words(pattern)
    (@options[:ignore_patterns] ||= []) << pattern
    self
  end

  # Set maximum suggestions.
  #
  # @param max [Integer] Maximum suggestions
  # @return [FluentChecker] Self for chaining
  def max_suggestions(max)
    @options[:max_suggestions] = max
    self
  end

  # Set progress callback.
  #
  # @param block [Proc] Callback proc
  # @return [FluentChecker] Self for chaining
  def on_progress(&block)
    @progress_callback = block
    self
  end

  # Set error callback.
  #
  # @param block [Proc] Callback proc
  # @return [FluentChecker] Self for chaining
  def on_error(&block)
    @error_callback = block
    self
  end

  # Get the result of the most recent {#check} call.
  #
  # Previously this re-ran the check with an instance variable that was
  # never assigned, so it always checked nil.
  #
  # @return [Models::Result::DocumentResult, nil] Last check result, or nil
  #   when nothing has been checked yet
  def result
    @result
  end
end
|
|
90
|
+
end
|
|
91
|
+
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Grammar
|
|
5
|
+
module PatternMatchers
|
|
6
|
+
# Base class for pattern matchers.
|
|
7
|
+
#
|
|
8
|
+
# Pattern matchers check token sequences against specific patterns
|
|
9
|
+
# defined in YAML configuration.
|
|
10
|
+
class BaseMatcher
  # @param pattern [Hash] Pattern configuration this matcher was built for
  # @param exceptions [Hash] Exceptions associated with the rule
  def initialize(pattern, exceptions = {})
    @pattern = pattern
    @exceptions = exceptions
  end

  # Match tokens against the pattern.
  #
  # The base implementation reports nothing; subclasses override it.
  #
  # @param _tokens [Array<Hash>] Array of token hashes
  # @param _rule [Rule] The rule being checked
  # @return [Array<Hash>] Array of error hashes (always empty here)
  def match(_tokens, _rule)
    []
  end

  protected

  # Extract target tokens from context specification.
  #
  # Every spec that carries a 'target_token' is compared, ignoring case,
  # with each token's :token text. Results are grouped by spec, in spec
  # order, then token order.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param context_spec [Array<Hash>, Hash, nil] One spec or a list of specs
  #   from the pattern; nil yields no matches
  # @return [Array<Hash>] Array of { token:, index: } hashes
  def extract_tokens_from_context(tokens, context_spec)
    # A bare Hash must be wrapped by hand: Array(hash) would split it into
    # key/value pairs instead of treating it as a single spec.
    specs = context_spec.is_a?(Hash) ? [context_spec] : Array(context_spec)

    specs.flat_map do |spec|
      target = spec['target_token']
      next [] unless target

      wanted = target.to_s.downcase
      tokens.each_with_index
            .select { |token, _idx| token[:token]&.downcase == wanted }
            .map { |token, idx| { token: token, index: idx } }
    end
  end
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'base_matcher'
|
|
4
|
+
|
|
5
|
+
module Kotoshu
|
|
6
|
+
module Grammar
|
|
7
|
+
module PatternMatchers
|
|
8
|
+
# Matcher for double negative rules.
|
|
9
|
+
#
|
|
10
|
+
# This matcher detects when multiple negative words appear
|
|
11
|
+
# within a certain distance.
|
|
12
|
+
class DoubleNegativeMatcher < BaseMatcher
  # Words treated as negatives, in addition to any word ending in "n't".
  NEGATIVE_WORDS = %w[not no neither nobody never nothing nowhere hardly barely scarcely].freeze

  # Distance limit used when the pattern has no 'distance_check' condition
  # or that condition gives no 'max_distance'.
  DEFAULT_MAX_DISTANCE = 15

  # Match tokens against the double negative pattern.
  #
  # Collects the indices of negative words (minus exempted ones), then
  # reports every pair of consecutive negatives whose :position values are
  # at most the configured distance apart.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param rule [Rule] The rule being checked
  # @return [Array<Hash>] Array of error hashes
  def match(tokens, rule)
    exception_phrases = (rule.exceptions || {})['phrases'] || []
    limit = max_distance

    negative_indices = tokens.each_index.select do |idx|
      negative?(tokens[idx][:token]&.downcase) &&
        !in_exception_phrase?(idx, tokens, exception_phrases)
    end

    negative_indices.each_cons(2).each_with_object([]) do |(idx1, idx2), errors|
      distance = tokens[idx2][:position] - tokens[idx1][:position]
      errors << build_error(tokens, idx1, idx2, rule) if distance <= limit
    end
  end

  private

  # Read the distance limit from the pattern's 'distance_check' condition.
  #
  # @return [Integer] Configured limit, or DEFAULT_MAX_DISTANCE
  def max_distance
    conditions = @pattern['conditions'] || []
    distance_condition = conditions.find { |c| c['type'] == 'distance_check' }
    distance_condition&.dig('max_distance') || DEFAULT_MAX_DISTANCE
  end

  # Check if a word is a negative.
  #
  # @param word [String, nil] The (downcased) word to check
  # @return [Boolean] True if the word is a negative
  def negative?(word)
    return false if word.nil? || word.empty?

    NEGATIVE_WORDS.include?(word) || word.end_with?("n't")
  end

  # Check if the negative at idx starts a "not only ... but also" phrase.
  #
  # The exemption is only active when the rule lists exception phrases.
  # The previous version tested the token *before* idx for "not", so the
  # "not" in "not only" itself was never exempted.
  #
  # @param idx [Integer] The token index
  # @param tokens [Array<Hash>] Array of token hashes
  # @param exception_phrases [Array<String>] Exception phrases
  # @return [Boolean] True if in exception phrase
  def in_exception_phrase?(idx, tokens, exception_phrases)
    return false if exception_phrases.empty?

    current_word = tokens[idx][:token]&.downcase
    following_word = tokens[idx + 1]&.dig(:token)&.downcase
    current_word == 'not' && following_word == 'only'
  end

  # Build an error hash.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param idx1 [Integer] First negative index
  # @param idx2 [Integer] Second negative index
  # @param rule [Rule] The rule being checked
  # @return [Hash] Error hash
  def build_error(tokens, idx1, idx2, rule)
    # Context is every token from the first negative through the second.
    words = tokens[idx1..idx2].map { |t| t[:token] }.join(' ')

    {
      rule_id: rule.id,
      position: tokens[idx1][:position],
      message: rule.message,
      suggestion: rule.suggestion,
      context: words,
      suggestions: rule.suggestion ? [rule.suggestion] : []
    }
  end
end
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'base_matcher'
|
|
4
|
+
|
|
5
|
+
module Kotoshu
|
|
6
|
+
module Grammar
|
|
7
|
+
module PatternMatchers
|
|
8
|
+
# Matcher for there/their/they're confusion rules.
|
|
9
|
+
#
|
|
10
|
+
# This matcher detects when "there" is used where "their"
|
|
11
|
+
# (possessive) is intended.
|
|
12
|
+
class PossessiveContextMatcher < BaseMatcher
  # POS tags on the following token that indicate "their" was intended.
  POSSESSIVE_TARGET_TAGS = %w[NOUN NOUN_PROPER ADJ].freeze

  # Match tokens against the there/their pattern.
  #
  # Flags "there" when the next token is not one of the rule's location
  # verbs and is either tagged NOUN/NOUN_PROPER/ADJ or listed among the
  # rule's possessive nouns.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param rule [Rule] The rule being checked
  # @return [Array<Hash>] Array of error hashes
  def match(tokens, rule)
    indicators = (rule.exceptions || {})['location_indicators'] || {}
    location_verbs = indicators['verbs'] || []
    possessive_nouns = indicators['possessive_nouns'] || []

    tokens.each_cons(2).each_with_object([]) do |(token, next_token), errors|
      next unless token[:token]&.downcase == 'there'

      next_word = next_token[:token]&.downcase

      # Skip if followed by verb (location/existence context)
      next if location_verbs.include?(next_word)

      # POS tag is checked first; the word list is the fallback.
      possessive_context = POSSESSIVE_TARGET_TAGS.include?(next_token[:pos_tag]) ||
                           possessive_nouns.include?(next_word)
      errors << build_error(token, next_token, rule) if possessive_context
    end
  end

  private

  # Build an error hash.
  #
  # @param token [Hash] The token with "there"
  # @param next_token [Hash] The next token
  # @param rule [Rule] The rule being checked
  # @return [Hash] Error hash; :suggestions is empty when the rule has no
  #   suggestion instead of holding a nil entry
  def build_error(token, next_token, rule)
    {
      rule_id: rule.id,
      position: token[:position],
      message: rule.message,
      suggestion: rule.suggestion,
      context: "\"#{token[:token]} #{next_token[:token]}\"",
      suggestions: [rule.suggestion].compact
    }
  end
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'base_matcher'
|
|
4
|
+
|
|
5
|
+
module Kotoshu
|
|
6
|
+
module Grammar
|
|
7
|
+
module PatternMatchers
|
|
8
|
+
# Matcher for a/an article usage rules.
|
|
9
|
+
#
|
|
10
|
+
# This matcher checks if "a" or "an" is used correctly before
|
|
11
|
+
# vowel and consonant sounds.
|
|
12
|
+
class VowelSoundMatcher < BaseMatcher
  VOWEL_SOUNDS = %w[a e i o u].freeze

  # Walk adjacent token pairs and report every "a"/"an" that disagrees
  # with the article expected for the word that follows it.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param rule [Rule] The rule being checked
  # @return [Array<Hash>] Array of error hashes
  def match(tokens, rule)
    tokens.each_cons(2).each_with_object([]) do |(article_token, word_token), found|
      article = article_token[:token]&.downcase
      next unless article == 'a' || article == 'an'

      # Untagged tokens are accepted; tagged ones must be determiners.
      tag = article_token[:pos_tag]
      next unless tag.nil? || tag == 'DET'

      following = word_token[:token]
      next if following.nil? || following.empty?

      wanted = article_for(following, rule)
      found << build_error(article_token, word_token, wanted, rule) unless article == wanted
    end
  end

  private

  # Pick the article a word calls for: the rule's exception lists are
  # consulted first, then the word's first letter decides.
  #
  # @param word [String] The word to check
  # @param rule [Rule] The rule with exceptions
  # @return [String] "a" or "an"
  def article_for(word, rule)
    lowered = word.downcase
    overrides = rule.exceptions || {}

    if (overrides['consonant_sound_exceptions'] || []).include?(lowered)
      'a'
    elsif (overrides['silent_consonant_exceptions'] || []).include?(lowered)
      'an'
    elsif VOWEL_SOUNDS.include?(lowered[0])
      'an'
    else
      'a'
    end
  end

  # Assemble the error hash for a mismatched article.
  #
  # @param prev_token [Hash] The previous token (article)
  # @param current_token [Hash] The current token (word)
  # @param expected [String] The expected article
  # @param rule [Rule] The rule being checked
  # @return [Hash] Error hash
  def build_error(prev_token, current_token, expected, rule)
    article_text = prev_token[:token]
    word_text = current_token[:token]

    # Fill the rule's message template placeholders.
    filled = rule.message.gsub('{expected}', expected)
    filled = filled.gsub('{word}', word_text)

    {
      rule_id: rule.id,
      position: prev_token[:position],
      message: filled,
      suggestion: expected,
      context: "#{article_text} #{word_text}",
      suggestions: [expected]
    }
  end
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
end
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Grammar
|
|
5
|
+
# Base class for grammar rules.
|
|
6
|
+
#
|
|
7
|
+
# All grammar rules inherit from this class and implement
|
|
8
|
+
# the #check method to validate tokens.
|
|
9
|
+
class Rule
  # Condition type => [file to lazy-load, matcher class name]. Insertion
  # order is the selection priority when a pattern has several condition
  # types.
  MATCHER_FOR_CONDITION = {
    'vowel_check' => ['pattern_matchers/vowel_sound_matcher', :VowelSoundMatcher],
    'context_check' => ['pattern_matchers/possessive_context_matcher', :PossessiveContextMatcher],
    'distance_check' => ['pattern_matchers/double_negative_matcher', :DoubleNegativeMatcher]
  }.freeze
  private_constant :MATCHER_FOR_CONDITION

  attr_reader :id, :name, :category, :severity, :description,
              :exceptions, :message, :suggestion

  # @param id [String] Rule identifier
  # @param name [String] Rule name
  # @param category [String] Rule category
  # @param severity [String] Rule severity
  # @param description [String] Rule description
  # @param patterns [Array<Hash>, nil] Pattern configurations; nil is
  #   treated as "no patterns"
  # @param exceptions [Hash, nil] Exceptions handed to the matchers; nil is
  #   treated as an empty hash
  # @param message [String] Message reported with each error
  # @param suggestion [String, nil] Suggested replacement
  def initialize(id:, name:, category:, severity:, description:,
                 patterns:, exceptions: {}, message:, suggestion:)
    @id = id
    @name = name
    @category = category
    @severity = severity
    @description = description
    # from_yaml passes config['patterns'] through unchanged, so a rule
    # without a 'patterns' key arrives here as nil.
    @patterns = patterns || []
    @exceptions = exceptions || {}
    @message = message
    @suggestion = suggestion
  end

  # Factory method to create Rule from YAML configuration.
  #
  # @param config [Hash] YAML configuration hash (string keys)
  # @return [Rule] A new rule instance
  def self.from_yaml(config)
    new(
      id: config['id'],
      name: config['name'],
      category: config['category'],
      severity: config['severity'],
      description: config['description'],
      patterns: config['patterns'],
      exceptions: config['exceptions'] || {},
      message: config['message'],
      suggestion: config['suggestion']
    )
  end

  # Check tokens against this rule.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @return [Array<Hash>] Array of error hashes, in pattern order
  def check(tokens)
    @patterns.flat_map { |pattern| check_pattern(tokens, pattern) }
  end

  private

  # Check a single pattern against tokens.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param pattern [Hash] Pattern configuration hash
  # @return [Array<Hash>] Array of error hashes
  def check_pattern(tokens, pattern)
    create_matcher(pattern).match(tokens, self)
  end

  # Create appropriate pattern matcher based on pattern type.
  #
  # Matcher files are loaded lazily. The base matcher is loaded up front
  # so the fallback paths work even when only this file was required.
  #
  # @param pattern [Hash] Pattern configuration hash
  # @return [PatternMatchers::BaseMatcher] A pattern matcher instance
  def create_matcher(pattern)
    require_relative 'pattern_matchers/base_matcher'

    condition_types = (pattern['conditions'] || []).map { |c| c['type'] }
    entry = MATCHER_FOR_CONDITION.find { |type, _target| condition_types.include?(type) }
    # The fallback matcher is deliberately built without exceptions.
    return PatternMatchers::BaseMatcher.new(pattern) unless entry

    file, class_name = entry.last
    require_relative file
    PatternMatchers.const_get(class_name).new(pattern, exceptions)
  end
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'rule_loader'
|
|
4
|
+
require_relative '../configuration'
|
|
5
|
+
|
|
6
|
+
module Kotoshu
|
|
7
|
+
module Grammar
|
|
8
|
+
# Engine for loading and executing grammar rules from YAML configuration.
|
|
9
|
+
#
|
|
10
|
+
# This implements configuration-driven design where all linguistic data
|
|
11
|
+
# (rules, patterns, exceptions) is stored in YAML files, not hardcoded.
|
|
12
|
+
#
|
|
13
|
+
# @example Loading rules for English
|
|
14
|
+
# engine = RuleEngine.new(language: 'en')
|
|
15
|
+
# errors = engine.check(tokens)
|
|
16
|
+
#
|
|
17
|
+
class RuleEngine
  attr_reader :language, :rules

  # Create a new rule engine for a language and load its rules.
  #
  # @param language [String] Language code (e.g., 'en', 'de', 'fr')
  # @param rules_path [String, nil] Optional custom path to grammar rules
  # @param dictionaries_path [String, nil] Optional custom path to dictionaries directory
  def initialize(language:, rules_path: nil, dictionaries_path: nil)
    @language = language
    @rules_path = rules_path || default_rules_path(dictionaries_path)
    @loader = RuleLoader.new(@rules_path)
    @rules = @loader.load_rules
  end

  # Check tokens against all loaded rules.
  #
  # @param tokens [Array<Hash>] Array of token hashes with :token, :pos_tag, :position keys
  # @return [Array<Hash>] Array of error hashes, in rule order
  def check(tokens)
    @rules.flat_map { |rule| rule.check(tokens) }
  end

  # Get list of rule IDs.
  #
  # Despite the name, this returns each rule's id, not its name.
  #
  # @return [Array<String>] Array of rule IDs
  def rule_names
    @rules.map(&:id)
  end

  # Get a specific rule by ID.
  #
  # @param id [String] Rule ID
  # @return [Rule, nil] The rule if found, nil otherwise
  def get_rule(id)
    @rules.find { |r| r.id == id }
  end

  # Check if a rule exists.
  #
  # @param id [String] Rule ID
  # @return [Boolean] True if rule exists
  def rule_exists?(id)
    @rules.any? { |r| r.id == id }
  end

  private

  # Get default path to grammar rules for a language.
  #
  # @param dictionaries_path [String, nil] Optional custom dictionaries path
  # @return [String] Path to grammar rules directory
  def default_rules_path(dictionaries_path = nil)
    base_path = dictionaries_path || default_dictionaries_path
    # to_s lets a Symbol language code (:en) work; File.join rejects Symbols.
    File.join(base_path, @language.to_s, 'grammar')
  end

  # Get default dictionaries path.
  #
  # Checks in order:
  # 1. Environment variable KOTOSHU_DICTIONARIES_PATH (ignored when empty)
  # 2. Configuration.dictionaries_path
  # 3. Default: dictionaries/ four directory levels above this file
  #
  # @return [String] Path to dictionaries directory
  def default_dictionaries_path
    # An empty value would otherwise produce the root-anchored path
    # "/<language>/grammar" once joined.
    env_path = ENV['KOTOSHU_DICTIONARIES_PATH']
    return env_path unless env_path.nil? || env_path.empty?

    config = Configuration.instance
    if config.respond_to?(:dictionaries_path) && config.dictionaries_path
      return config.dictionaries_path
    end

    File.expand_path('../../../../dictionaries', __dir__)
  end
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'yaml'
|
|
4
|
+
require_relative 'rule'
|
|
5
|
+
|
|
6
|
+
module Kotoshu
|
|
7
|
+
module Grammar
|
|
8
|
+
# Loads grammar rules from YAML configuration files.
|
|
9
|
+
#
|
|
10
|
+
# This class reads rule definitions from YAML files in the
|
|
11
|
+
# dictionaries/{language}/grammar/ directory.
|
|
12
|
+
class RuleLoader
  # Name of the rule definition file expected inside the rules directory.
  RULES_FILENAME = 'rules.yaml'

  # @param rules_path [String] Directory that contains the rules file
  def initialize(rules_path)
    @rules_path = rules_path
  end

  # Load all rules from the rules.yaml file.
  #
  # Returns an empty list when the file is missing, is not a YAML mapping,
  # or has no 'rules' list.
  #
  # @return [Array<Rule>] Array of rule instances
  def load_rules
    rules_file = File.join(@rules_path, RULES_FILENAME)
    return [] unless File.exist?(rules_file)

    # safe_load refuses arbitrary object deserialisation; YAML.load_file
    # only does so on newer Psych versions. Symbols and aliases stay
    # enabled so plain rule files keep loading.
    config = YAML.safe_load(
      File.read(rules_file, mode: 'r:bom|utf-8'),
      permitted_classes: [Symbol],
      aliases: true,
      filename: rules_file
    )
    return [] unless config.is_a?(Hash)

    rule_configs = config['rules']
    return [] unless rule_configs.is_a?(Array)

    rule_configs.map { |rule_config| Rule.from_yaml(rule_config) }
  end
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'grammar/rule_engine'
|
|
4
|
+
require_relative 'grammar/rule_loader'
|
|
5
|
+
require_relative 'grammar/rule'
|
|
6
|
+
require_relative 'grammar/pattern_matchers/base_matcher'
|
|
7
|
+
require_relative 'grammar/pattern_matchers/vowel_sound_matcher'
|
|
8
|
+
require_relative 'grammar/pattern_matchers/possessive_context_matcher'
|
|
9
|
+
require_relative 'grammar/pattern_matchers/double_negative_matcher'
|
|
10
|
+
|
|
11
|
+
module Kotoshu
  # Namespace for Kotoshu's grammar-rule infrastructure: configuration-driven
  # grammar checking whose linguistic data is kept in YAML files.
  module Grammar; end
end
|