kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ class Spellchecker
5
# Fluent checker for chainable configuration.
#
# Wraps a spellchecker and exposes setters that return +self+, so a check
# can be configured and executed in a single expression.
#
# NOTE(review): within this class the recorded options and callbacks are
# only stored; #check forwards nothing but the text to the spellchecker.
# Confirm whether the spellchecker is expected to read them elsewhere.
#
# @example Basic usage
#   result = Kotoshu.fluent.check("Hello wrold")
#
# @example With options
#   Kotoshu.fluent
#     .ignore_words(/https?:\/\/\S+/)
#     .max_suggestions(5)
#     .check("Hello wrold")
class FluentChecker
  # @return [Spellchecker] The underlying spellchecker
  attr_reader :spellchecker

  # @return [Hash] Configuration options
  attr_reader :options

  # Create a new fluent checker.
  #
  # @param spellchecker [Spellchecker] The underlying spellchecker
  # @param options [Hash] Configuration options (mutated in place by the
  #   chainable setters)
  def initialize(spellchecker:, options: {})
    @spellchecker = spellchecker
    @options = options
    @progress_callback = nil
    @error_callback = nil
    @text = nil
  end

  # Check text for spelling errors.
  #
  # The text is remembered so that #result can re-run the same check.
  #
  # @param text [String] Text to check
  # @return [Models::Result::DocumentResult] Check result
  def check(text)
    @text = text
    @spellchecker.check(text)
  end

  # Ignore words matching pattern.
  #
  # Appends the pattern to +options[:ignore_patterns]+.
  #
  # @param pattern [Regexp] Pattern to ignore
  # @return [FluentChecker] Self for chaining
  #
  # @example
  #   fluent.ignore_words(/https?:\/\/\S+/)
  def ignore_words(pattern)
    (@options[:ignore_patterns] ||= []) << pattern
    self
  end

  # Set maximum suggestions.
  #
  # Stores the value under +options[:max_suggestions]+.
  #
  # @param max [Integer] Maximum suggestions
  # @return [FluentChecker] Self for chaining
  def max_suggestions(max)
    @options[:max_suggestions] = max
    self
  end

  # Set progress callback.
  #
  # @param block [Proc] Callback proc
  # @return [FluentChecker] Self for chaining
  def on_progress(&block)
    @progress_callback = block
    self
  end

  # Set error callback.
  #
  # @param block [Proc] Callback proc
  # @return [FluentChecker] Self for chaining
  def on_error(&block)
    @error_callback = block
    self
  end

  # Get the result for the most recently checked text.
  #
  # Re-runs the check on the text last passed to #check.
  #
  # @return [Models::Result::DocumentResult, nil] Check result, or nil when
  #   no text has been checked yet
  def result
    # Without a remembered text there is nothing meaningful to check.
    return nil if @text.nil?

    @spellchecker.check(@text)
  end
end
90
+ end
91
+ end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Grammar
5
+ module PatternMatchers
6
# Base class for pattern matchers.
#
# Pattern matchers check token sequences against specific patterns
# defined in YAML configuration. This base implementation matches nothing;
# subclasses override #match.
class BaseMatcher
  # @param pattern [Hash] Pattern configuration hash
  # @param exceptions [Hash] Rule exceptions
  def initialize(pattern, exceptions = {})
    @pattern = pattern
    @exceptions = exceptions
  end

  # Match tokens against the pattern.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param rule [Rule] The rule being checked
  # @return [Array<Hash>] Array of error hashes (always empty here)
  def match(tokens, rule)
    []
  end

  protected

  # Extract target tokens from context specification.
  #
  # Each spec that carries a 'target_token' selects every token whose
  # :token equals that target, ignoring case. Results are grouped per spec,
  # in spec order, then token order.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param context_spec [Array<Hash>, Hash, nil] Context specification(s)
  #   from the pattern; a single Hash is treated as one spec, nil as none
  # @return [Array<Hash>] Array of `{ token:, index: }` hashes
  def extract_tokens_from_context(tokens, context_spec)
    # Array(hash) would explode the hash into pairs, so wrap it explicitly.
    specs = context_spec.is_a?(Hash) ? [context_spec] : Array(context_spec)

    specs.each_with_object([]) do |spec, found|
      next unless spec.is_a?(Hash) && spec['target_token']

      # Tokens are down-cased before comparing, so the target must be too.
      wanted = spec['target_token'].to_s.downcase
      tokens.each_with_index do |token, idx|
        found << { token: token, index: idx } if token[:token]&.downcase == wanted
      end
    end
  end
end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'base_matcher'
4
+
5
+ module Kotoshu
6
+ module Grammar
7
+ module PatternMatchers
8
# Matcher for double negative rules.
#
# This matcher detects when two consecutive negative words appear
# within a configurable distance of each other, measured as the difference
# between the tokens' :position values.
class DoubleNegativeMatcher < BaseMatcher
  # Words treated as negatives (contractions ending in "n't" also count).
  NEGATIVE_WORDS = %w[not no neither nobody never nothing nowhere hardly barely scarcely].freeze

  # Distance limit used when the pattern has no 'distance_check' condition
  # or that condition has no 'max_distance'.
  DEFAULT_MAX_DISTANCE = 15

  # Match tokens against the double negative pattern.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param rule [Rule] The rule being checked
  # @return [Array<Hash>] Array of error hashes
  def match(tokens, rule)
    exception_phrases = (rule.exceptions || {})['phrases'] || []
    limit = max_distance

    negative_indices = tokens.each_index.select do |idx|
      is_negative?(tokens[idx][:token]&.downcase) &&
        !in_exception_phrase?(idx, tokens, exception_phrases)
    end

    # Only neighbouring negatives are paired: (1st, 2nd), (2nd, 3rd), ...
    negative_indices.each_cons(2).each_with_object([]) do |(first, second), errors|
      distance = tokens[second][:position] - tokens[first][:position]
      errors << build_error(tokens, first, second, rule) if distance <= limit
    end
  end

  private

  # Read the maximum allowed distance from the pattern's conditions.
  #
  # @return [Integer] Configured 'max_distance' or DEFAULT_MAX_DISTANCE
  def max_distance
    conditions = @pattern['conditions'] || []
    distance_condition = conditions.find { |c| c['type'] == 'distance_check' }
    distance_condition&.dig('max_distance') || DEFAULT_MAX_DISTANCE
  end

  # Check if a word is a negative.
  #
  # @param word [String, nil] The (down-cased) word to check
  # @return [Boolean] True if the word is a negative
  def is_negative?(word)
    return false if word.nil? || word.empty?

    NEGATIVE_WORDS.include?(word) || word.end_with?("n't")
  end

  # Check if an index is part of an exception phrase.
  #
  # Only the "not only... but also" construction is recognised, and only
  # when the rule configures at least one exception phrase. The negative at
  # +idx+ is exempt when it is "not" immediately followed by "only"
  # (case-insensitive).
  #
  # @param idx [Integer] The token index
  # @param tokens [Array<Hash>] Array of token hashes
  # @param exception_phrases [Array<String>] Exception phrases
  # @return [Boolean] True if in exception phrase
  def in_exception_phrase?(idx, tokens, exception_phrases)
    return false if exception_phrases.empty?

    # The token under test is itself the "not"; looking at the token before
    # it could never match "not only".
    current = tokens[idx][:token]&.downcase
    following = tokens[idx + 1]&.dig(:token)&.downcase
    current == 'not' && following == 'only'
  end

  # Build an error hash.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param idx1 [Integer] First negative index
  # @param idx2 [Integer] Second negative index
  # @param rule [Rule] The rule being checked
  # @return [Hash] Error hash
  def build_error(tokens, idx1, idx2, rule)
    # Context is every token from the first negative through the second.
    words = tokens[idx1..idx2].map { |t| t[:token] }.join(' ')

    {
      rule_id: rule.id,
      position: tokens[idx1][:position],
      message: rule.message,
      suggestion: rule.suggestion,
      context: words,
      suggestions: rule.suggestion ? [rule.suggestion] : []
    }
  end
end
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'base_matcher'
4
+
5
+ module Kotoshu
6
+ module Grammar
7
+ module PatternMatchers
8
# Matcher for there/their/they're confusion rules.
#
# This matcher detects when "there" is used where "their"
# (possessive) is intended, judging by the token that follows it.
class PossessiveContextMatcher < BaseMatcher
  # POS tags of a following token that indicate a possessive was intended.
  POSSESSED_POS_TAGS = %w[NOUN NOUN_PROPER ADJ].freeze

  # Match tokens against the there/their pattern.
  #
  # A "there" is reported when the next token is not one of the configured
  # location verbs and either carries one of POSSESSED_POS_TAGS or appears
  # in the configured possessive-noun list.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param rule [Rule] The rule being checked
  # @return [Array<Hash>] Array of error hashes
  def match(tokens, rule)
    indicators = (rule.exceptions || {})['location_indicators'] || {}
    location_verbs = indicators['verbs'] || []
    possessive_nouns = indicators['possessive_nouns'] || []

    # each_cons(2) never yields a trailing "there" with no follower.
    tokens.each_cons(2).each_with_object([]) do |(token, next_token), errors|
      next unless token[:token]&.downcase == 'there'

      next_word = next_token[:token]&.downcase

      # Skip if followed by verb (location/existence context)
      next if location_verbs.include?(next_word)

      # POS tag first, word list as the fallback.
      possessive = POSSESSED_POS_TAGS.include?(next_token[:pos_tag]) ||
                   possessive_nouns.include?(next_word)
      errors << build_error(token, next_token, rule) if possessive
    end
  end

  private

  # Build an error hash.
  #
  # @param token [Hash] The token with "there"
  # @param next_token [Hash] The next token
  # @param rule [Rule] The rule being checked
  # @return [Hash] Error hash
  def build_error(token, next_token, rule)
    {
      rule_id: rule.id,
      position: token[:position],
      message: rule.message,
      suggestion: rule.suggestion,
      context: "\"#{token[:token]} #{next_token[:token]}\"",
      # A rule without a suggestion yields no suggestions, not [nil].
      suggestions: rule.suggestion ? [rule.suggestion] : []
    }
  end
end
75
+ end
76
+ end
77
+ end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'base_matcher'
4
+
5
+ module Kotoshu
6
+ module Grammar
7
+ module PatternMatchers
8
# Matcher for a/an article usage rules.
#
# This matcher checks if "a" or "an" is used correctly before the next
# token. The decision is made on the next word's first letter, overridden
# by the exception lists carried by the rule.
class VowelSoundMatcher < BaseMatcher
  VOWEL_SOUNDS = %w[a e i o u].freeze

  # The articles this matcher inspects.
  ARTICLES = %w[a an].freeze

  # Match tokens against the a/an pattern.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param rule [Rule] The rule being checked
  # @return [Array<Hash>] Array of error hashes
  def match(tokens, rule)
    tokens.each_cons(2).each_with_object([]) do |(article_token, word_token), errors|
      article = article_token[:token]&.downcase
      next unless ARTICLES.include?(article)

      # An untagged token is accepted; a tagged one must be a determiner.
      pos_tag = article_token[:pos_tag]
      next unless pos_tag.nil? || pos_tag == 'DET'

      word = word_token[:token]
      next if word.nil? || word.empty?

      expected = article_for(word, rule)
      errors << build_error(article_token, word_token, expected, rule) unless article == expected
    end
  end

  private

  # Determine the correct article for a word.
  #
  # Exception lists win over the first-letter test; the consonant-sound
  # list is consulted before the silent-consonant list.
  #
  # @param word [String] The word to check
  # @param rule [Rule] The rule with exceptions
  # @return [String] "a" or "an"
  def article_for(word, rule)
    lowered = word.downcase
    exceptions = rule.exceptions || {}

    return 'a' if (exceptions['consonant_sound_exceptions'] || []).include?(lowered)
    return 'an' if (exceptions['silent_consonant_exceptions'] || []).include?(lowered)

    VOWEL_SOUNDS.include?(lowered[0]) ? 'an' : 'a'
  end

  # Build an error hash.
  #
  # The rule message may contain the placeholders "{expected}" and "{word}".
  #
  # @param prev_token [Hash] The previous token (article)
  # @param current_token [Hash] The current token (word)
  # @param expected [String] The expected article
  # @param rule [Rule] The rule being checked
  # @return [Hash] Error hash
  def build_error(prev_token, current_token, expected, rule)
    prev_word = prev_token[:token]
    next_word = current_token[:token]

    # Block form inserts the text literally (no backslash interpretation),
    # and &. leaves a missing message as nil instead of raising.
    message = rule.message&.gsub('{expected}') { expected }&.gsub('{word}') { next_word }

    {
      rule_id: rule.id,
      position: prev_token[:position],
      message: message,
      suggestion: expected,
      context: "#{prev_word} #{next_word}",
      suggestions: [expected]
    }
  end
end
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Grammar
5
# A grammar rule.
#
# A rule carries descriptive metadata plus a list of patterns. #check runs
# every pattern through a pattern matcher chosen from the pattern's
# condition types and collects the resulting error hashes.
class Rule
  attr_reader :id, :name, :category, :severity, :description,
              :exceptions, :message, :suggestion

  # @param id [String] Rule ID
  # @param name [String] Rule name
  # @param category [String] Rule category
  # @param severity [String] Rule severity
  # @param description [String] Rule description
  # @param patterns [Array<Hash>, nil] Pattern configuration hashes
  # @param exceptions [Hash] Rule exceptions
  # @param message [String] Error message reported by matchers
  # @param suggestion [String, nil] Suggested replacement
  def initialize(id:, name:, category:, severity:, description:,
                 patterns:, exceptions: {}, message:, suggestion:)
    @id = id
    @name = name
    @category = category
    @severity = severity
    @description = description
    @patterns = patterns
    @exceptions = exceptions
    @message = message
    @suggestion = suggestion
  end

  # Factory method to create Rule from YAML configuration.
  #
  # Missing keys become nil, except 'exceptions', which defaults to {}.
  #
  # @param config [Hash] YAML configuration hash
  # @return [Rule] A new rule instance
  def self.from_yaml(config)
    new(
      id: config['id'],
      name: config['name'],
      category: config['category'],
      severity: config['severity'],
      description: config['description'],
      patterns: config['patterns'],
      exceptions: config['exceptions'] || {},
      message: config['message'],
      suggestion: config['suggestion']
    )
  end

  # Check tokens against this rule.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @return [Array<Hash>] Array of error hashes (empty when the rule has no
  #   patterns)
  def check(tokens)
    # A rule whose YAML has no 'patterns' key arrives here with nil.
    (@patterns || []).flat_map { |pattern| check_pattern(tokens, pattern) }
  end

  private

  # Check a single pattern against tokens.
  #
  # @param tokens [Array<Hash>] Array of token hashes
  # @param pattern [Hash] Pattern configuration hash
  # @return [Array<Hash>] Array of error hashes
  def check_pattern(tokens, pattern)
    create_matcher(pattern).match(tokens, self)
  end

  # Create appropriate pattern matcher based on pattern type.
  #
  # The first matching condition type wins, in this order: 'vowel_check',
  # 'context_check', 'distance_check'. Anything else (including a pattern
  # with no conditions) gets the base matcher. Matcher files are loaded
  # lazily, and every branch loads the file it needs.
  #
  # @param pattern [Hash] Pattern configuration hash
  # @return [PatternMatchers::BaseMatcher] A pattern matcher instance
  def create_matcher(pattern)
    condition_types = (pattern['conditions'] || []).map { |c| c['type'] }

    if condition_types.include?('vowel_check')
      require_relative 'pattern_matchers/vowel_sound_matcher'
      PatternMatchers::VowelSoundMatcher.new(pattern, exceptions)
    elsif condition_types.include?('context_check')
      require_relative 'pattern_matchers/possessive_context_matcher'
      PatternMatchers::PossessiveContextMatcher.new(pattern, exceptions)
    elsif condition_types.include?('distance_check')
      require_relative 'pattern_matchers/double_negative_matcher'
      PatternMatchers::DoubleNegativeMatcher.new(pattern, exceptions)
    else
      require_relative 'pattern_matchers/base_matcher'
      PatternMatchers::BaseMatcher.new(pattern)
    end
  end
end
94
+ end
95
+ end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'rule_loader'
4
+ require_relative '../configuration'
5
+
6
+ module Kotoshu
7
+ module Grammar
8
# Engine for loading and executing grammar rules from YAML configuration.
#
# This implements configuration-driven design where all linguistic data
# (rules, patterns, exceptions) is stored in YAML files, not hardcoded.
# Rules are loaded once, at construction time, through a RuleLoader.
#
# @example Loading rules for English
#   engine = RuleEngine.new(language: 'en')
#   errors = engine.check(tokens)
#
class RuleEngine
  # Environment variable that overrides the dictionaries directory.
  DICTIONARIES_ENV_VAR = 'KOTOSHU_DICTIONARIES_PATH'

  attr_reader :language, :rules

  # Create a new rule engine for a language.
  #
  # @param language [String] Language code (e.g., 'en', 'de', 'fr')
  # @param rules_path [String, nil] Optional custom path to grammar rules
  # @param dictionaries_path [String, nil] Optional custom path to dictionaries directory
  def initialize(language:, rules_path: nil, dictionaries_path: nil)
    @language = language
    @rules_path = rules_path || default_rules_path(dictionaries_path)
    @loader = RuleLoader.new(@rules_path)
    @rules = @loader.load_rules
  end

  # Check tokens against all loaded rules.
  #
  # @param tokens [Array<Hash>] Array of token hashes with :token, :pos_tag, :position keys
  # @return [Array<Hash>] Array of error hashes, in rule order
  def check(tokens)
    @rules.flat_map { |rule| rule.check(tokens) }
  end

  # Get list of rule IDs.
  #
  # @return [Array<String>] Array of rule IDs
  def rule_names
    @rules.map(&:id)
  end

  # Get a specific rule by ID.
  #
  # @param id [String] Rule ID
  # @return [Rule, nil] The rule if found, nil otherwise
  def get_rule(id)
    @rules.find { |rule| rule.id == id }
  end

  # Check if a rule exists.
  #
  # @param id [String] Rule ID
  # @return [Boolean] True if rule exists
  def rule_exists?(id)
    @rules.any? { |rule| rule.id == id }
  end

  private

  # Get default path to grammar rules for a language.
  #
  # @param dictionaries_path [String, nil] Optional custom dictionaries path
  # @return [String] Path to grammar rules directory
  def default_rules_path(dictionaries_path = nil)
    base_path = dictionaries_path || default_dictionaries_path
    # to_s lets a Symbol language code through File.join.
    File.join(base_path, @language.to_s, 'grammar')
  end

  # Get default dictionaries path.
  #
  # Checks in order:
  # 1. Environment variable KOTOSHU_DICTIONARIES_PATH (ignored when empty)
  # 2. Configuration.dictionaries_path
  # 3. Default: dictionaries/ four directory levels above this file
  #
  # @return [String] Path to dictionaries directory
  def default_dictionaries_path
    # An exported-but-empty variable means "not configured", not "".
    env_path = ENV[DICTIONARIES_ENV_VAR]
    return env_path unless env_path.nil? || env_path.empty?

    config = Configuration.instance
    if config.respond_to?(:dictionaries_path) && config.dictionaries_path
      return config.dictionaries_path
    end

    # Deliberately not normalised: the ".." segments are left for the OS to
    # resolve.
    File.join(__dir__, '..', '..', '..', '..', 'dictionaries')
  end
end
110
+ end
111
+ end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'yaml'
4
+ require_relative 'rule'
5
+
6
+ module Kotoshu
7
+ module Grammar
8
# Loads grammar rules from YAML configuration files.
#
# This class reads rule definitions from the rules.yaml file inside the
# directory it is given (by convention dictionaries/{language}/grammar/).
class RuleLoader
  # Name of the rules file looked up inside the rules directory.
  RULES_FILENAME = 'rules.yaml'

  # @param rules_path [String] Directory containing rules.yaml
  def initialize(rules_path)
    @rules_path = rules_path
  end

  # Load all rules from the rules.yaml file.
  #
  # Returns an empty array when the file is missing, is empty, is not a
  # mapping, or has no list under the 'rules' key. Malformed YAML still
  # raises.
  #
  # @return [Array<Rule>] Array of rule instances
  def load_rules
    rules_file = File.join(@rules_path, RULES_FILENAME)
    return [] unless File.file?(rules_file)

    # Rule files may come from a user-supplied path, so parse with the safe
    # loader on every Psych version. Aliases are allowed so YAML anchors can
    # be used in rule files; symbols match what the default loader permits.
    yaml = File.read(rules_file, mode: 'r:bom|utf-8')
    config = YAML.safe_load(yaml, permitted_classes: [Symbol], aliases: true)
    return [] unless config.is_a?(Hash)

    rule_configs = config['rules']
    return [] unless rule_configs.is_a?(Array)

    rule_configs.map { |rule_config| Rule.from_yaml(rule_config) }
  end
end
30
+ end
31
+ end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'grammar/rule_engine'
4
+ require_relative 'grammar/rule_loader'
5
+ require_relative 'grammar/rule'
6
+ require_relative 'grammar/pattern_matchers/base_matcher'
7
+ require_relative 'grammar/pattern_matchers/vowel_sound_matcher'
8
+ require_relative 'grammar/pattern_matchers/possessive_context_matcher'
9
+ require_relative 'grammar/pattern_matchers/double_negative_matcher'
10
+
11
+ module Kotoshu
12
+ # Grammar rules infrastructure for Kotoshu.
13
+ #
14
+ # Namespace for configuration-driven grammar checking (rule data lives in YAML
15
+ # files). The requires above load the rule engine, loader, rule and matchers.
16
+ module Grammar
17
+ end
18
+ end