kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,459 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../readers/lookup_builder'
4
+ require_relative '../../components/spell_checker'
5
+ require_relative '../../components/pos_tagger'
6
+ require_relative '../../language/normalizer/base'
7
+
8
+ module Kotoshu
9
+ module Languages
10
+ # Spanish language implementation.
11
+ #
12
+ # Supports multiple dialects: es-ES, es-MX, es-AR, es-CO, es-PE, es-VE, es-CL, es-EC, plus the other regional codes registered below
13
+ #
14
+ # Full Hunspell integration with spell checking, POS tagging, and grammar rules
15
+ # specifically handling Spanish inverted punctuation and diacritics.
16
+ class Spanish < Language::Base
17
+ # Spanish spell checker with Hunspell integration.
18
+ class SpellChecker < Components::SpellChecker
19
+ attr_reader :aff_path, :dic_path, :script
20
+
21
# Spanish special characters mapped to the plain letters that may stand in
# for them. generate_suggestions uses the table in both directions: plain
# letter -> special character, and special character -> plain letter.
# The inverted punctuation marks have no replacement, hence the empty lists.
SPANISH_SUBSTITUTIONS = {
  'á' => ['a'],
  'é' => ['e'],
  'í' => ['i'],
  'ó' => ['o'],
  'ú' => ['u'],
  'ü' => ['u'],
  'ñ' => ['n'],
  '¿' => [],
  '¡' => []
}.freeze
33
+
34
# Stores the dictionary settings and builds the lookup object used by
# every other method of the checker.
#
# @param aff_path [String] affix file path handed to Readers::LookupBuilder
# @param dic_path [String] word-list file path handed to Readers::LookupBuilder
# @param script [Symbol] script identifier forwarded to the builder
# @param encoding [String] encoding name forwarded to the builder
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8')
  @aff_path, @dic_path = aff_path, dic_path
  @script = script
  @encoding = encoding

  builder = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script)
  @lookuper = builder.build
end
41
+
42
# Looks +word+ up and reports the first accepted form, if any.
#
# @param word [String, nil] word to look up
# @return [Hash] +{ found:, stem:, flags: }+; for nil, empty or unknown
#   words +found+ is false, +stem+ is nil and +flags+ is empty
def check(word)
  not_found = { found: false, stem: nil, flags: [] }
  return not_found if word.nil? || word.empty?

  form = @lookuper.good_forms(word).first
  return not_found unless form

  flag_list = form.flags&.to_a
  flag_list ||= []
  # A form without a stem is reported with the word itself as its stem.
  { found: true, stem: form.stem || word, flags: flag_list }
end
51
+
52
# Returns up to +max_suggestions+ candidates for a word the lookup object
# does not accept.
#
# @param word [String, nil] possibly misspelled word
# @param max_suggestions [Integer] upper bound on the number of results
# @return [Array<Hash>] candidates from generate_suggestions; empty for
#   nil or empty input and for words that are already accepted
def suggest(word, max_suggestions: 10)
  if word.nil? || word.empty?
    []
  elsif @lookuper.good_forms(word).first
    # An accepted word needs no suggestions.
    []
  else
    candidates = generate_suggestions(word, max_suggestions)
    candidates.take(max_suggestions)
  end
end
58
+
59
# Tells whether +word+ is accepted, by reading the +:found+ entry of the
# hash returned by #check.
#
# @param word [String, nil]
# @return [Boolean]
def correct?(word)
  outcome = check(word)
  outcome[:found]
end
62
+
63
# Exposes the lookup object created in the constructor.
#
# @return [Object] the value returned by Readers::LookupBuilder#build
def lookuper
  @lookuper
end
66
+
67
+ private
68
+
69
# Levenshtein edit distance between two strings: insertions, deletions
# and substitutions each cost 1.
#
# @param a [String]
# @param b [String]
# @return [Integer] number of single-character edits turning +a+ into +b+
def calculate_distance(a, b)
  return a.length if b.empty?
  return b.length if a.empty?

  # Only the previous row of the edit matrix is needed to compute the next.
  previous = (0..b.length).to_a
  a.each_char.with_index(1) do |left, row|
    current = [row]
    b.each_char.with_index(1) do |right, col|
      substitution = previous[col - 1] + (left == right ? 0 : 1)
      current << [previous[col] + 1, current[col - 1] + 1, substitution].min
    end
    previous = current
  end
  previous.last
end
82
+
83
# Scores a suggestion between 0.0 and 1.0: the share of characters that
# did not need editing, minus 0.05 per rank position, floored at 0.0.
#
# @param original [String] the word as typed
# @param suggestion [String] the candidate replacement
# @param rank [Numeric] position penalty multiplier (0 means no penalty)
# @return [Float] score in 0.0..1.0
def calculate_score(original, suggestion, rank)
  longest = [original.length, suggestion.length].max
  # Two empty strings are identical; without this guard 0.0 / 0 yields NaN,
  # which then poisons the max comparison below.
  similarity =
    if longest.zero?
      1.0
    else
      1.0 - (calculate_distance(original, suggestion).to_f / longest)
    end
  [similarity - (rank * 0.05), 0.0].max
end
90
+
91
# Builds candidate corrections for +word+ by applying four kinds of single
# edits and keeping only the results the lookup object accepts.
#
# @param word [String] non-empty word to correct
# @param max_suggestions [Integer] not used here; the caller truncates
# @return [Array<Hash>] +{ word:, distance:, score: }+ entries without
#   duplicates, ordered by ascending distance
def generate_suggestions(word, max_suggestions)
  accepted = []
  consider = ->(candidate) { accepted << candidate if @lookuper.good_forms(candidate).first }

  # 1. Missing accents and ñ: where the lower-cased letter is the plain
  #    form of a special character, try the special character there.
  word.downcase.each_char.with_index do |letter, pos|
    SPANISH_SUBSTITUTIONS.each do |special, plain_forms|
      plain_forms.each do |plain|
        next unless plain == letter

        candidate = word.dup
        candidate[pos] = special
        consider.call(candidate)
      end
    end
  end

  # 2. The reverse direction: replace a special character by its plain form.
  word.each_char.with_index do |letter, pos|
    next unless SPANISH_SUBSTITUTIONS.key?(letter.downcase)

    SPANISH_SUBSTITUTIONS[letter.downcase].reject(&:empty?).each do |plain|
      candidate = word.dup
      candidate[pos] = plain
      consider.call(candidate)
    end
  end

  # 3. Double each letter except the first one.
  (1...word.length).each do |pos|
    consider.call(word.dup.insert(pos, word[pos]))
  end

  # 4. Drop each letter in turn; an empty remainder is never a candidate.
  word.length.times do |pos|
    shorter = word.dup
    shorter.slice!(pos)
    consider.call(shorter) unless shorter.empty?
  end

  scored = accepted.uniq.map do |candidate|
    { word: candidate, distance: calculate_distance(word, candidate), score: calculate_score(word, candidate, 0) }
  end
  scored.sort_by { |entry| entry[:distance] }
end
138
+ end
139
+
140
# Spanish tokenizer. Adds nothing of its own: all behavior comes from
# Language::Tokenizer::SpanishTokenizer, which is defined outside this file
# (the original comment credits it with ordinal and decimal handling).
class Tokenizer < Language::Tokenizer::SpanishTokenizer; end
143
+
144
+ # Spanish POS tagger.
145
+ class POSTagger < Components::PosTagger
146
# Default table from dictionary flag strings to POS tag names, used by
# flag_to_pos (exact key first, then the flag's first character).
# NOTE(review): many keys look like Penn Treebank tags (NN, VBD, JJ, ...);
# confirm that the Spanish dictionaries actually emit these flags.
FLAG_TO_POS = {
  # nouns
  'N' => 'NOUN', 'NN' => 'NOUN', 'NNS' => 'NOUN', 'NNP' => 'NOUN_PROPER',
  # verbs
  'V' => 'VERB', 'VB' => 'VERB', 'VBD' => 'VERB', 'VBG' => 'VERB', 'VBN' => 'VERB',
  'VBP' => 'VERB', 'VBZ' => 'VERB',
  # adjectives
  'A' => 'ADJ', 'JJ' => 'ADJ', 'JJR' => 'ADJ', 'JJS' => 'ADJ',
  # adverbs
  'R' => 'ADV', 'RB' => 'ADV', 'RBR' => 'ADV', 'RBS' => 'ADV',
  # determiners
  'D' => 'DET', 'DT' => 'DET', 'PDT' => 'DET',
  # pronouns
  'P' => 'PRON', 'PP' => 'PRON', 'PRP' => 'PRON', 'PRP$' => 'PRON_POSS',
  'WP' => 'PRON', 'WP$' => 'PRON_POSS',
  # prepositions
  'I' => 'PREP', 'IN' => 'PREP',
  # conjunctions
  'C' => 'CONJ', 'CC' => 'CONJ',
  # particles
  'U' => 'PART', 'RP' => 'PART',
  # interjections
  'INTJ' => 'INTJ', 'UH' => 'INTJ',
  # numerals and foreign words
  'CD' => 'NUM',
  'FW' => 'X',
  # punctuation, including the Spanish inverted marks
  'PUNCT' => 'PUNCT', '.' => 'PUNCT', ',' => 'PUNCT', '!' => 'PUNCT',
  '¿' => 'PUNCT', '¡' => 'PUNCT', '?' => 'PUNCT', ';' => 'PUNCT', ':' => 'PUNCT'
}.freeze
164
+
165
+ attr_reader :aff_path, :dic_path, :script
166
+
167
# Stores the settings, builds the lookup object and starts with an empty
# per-word result cache.
#
# @param aff_path [String] affix file path handed to Readers::LookupBuilder
# @param dic_path [String] word-list file path handed to Readers::LookupBuilder
# @param script [Symbol] script identifier forwarded to the builder
# @param encoding [String] encoding name forwarded to the builder
# @param flag_mapping [Hash] flag => POS tag table (defaults to FLAG_TO_POS)
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8', flag_mapping: FLAG_TO_POS)
  @aff_path, @dic_path = aff_path, dic_path
  @script = script
  @encoding = encoding
  @flag_mapping = flag_mapping

  builder = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script)
  @lookuper = builder.build
  @lookup_cache = {}
end
176
+
177
# Adds +:pos_tag+ and +:lemma+ entries to every token hash.
#
# @param tokens [Array<Hash>, nil] token hashes carrying the text under +:token+
# @return [Array<Hash>] new hashes (the inputs are merged, not mutated);
#   +[]+ for nil or empty input. Tokens without text get nil for both
#   entries; otherwise the lemma falls back to the token text itself.
def tag(tokens)
  return [] if tokens.nil? || tokens.empty?

  tokens.map do |token|
    surface = token[:token]
    next token.merge(pos_tag: nil, lemma: nil) if surface.nil? || surface.empty?

    info = lookup_with_pos(surface)
    token.merge(pos_tag: info[:pos_tag], lemma: info[:lemma] || surface)
  end
end
189
+
190
# @return [Hash] the flag => POS tag table currently in use
def flag_mapping
  @flag_mapping
end
193
+
194
# Replaces the flag => POS tag table.
#
# Cached lookup results embed tags derived from the previous table, so the
# cache is emptied as well; otherwise already-seen words would keep their
# old tags.
#
# @param mapping [Hash] new flag => POS tag table
def flag_mapping=(mapping)
  @lookup_cache&.clear
  @flag_mapping = mapping
end
197
+
198
# Empties the per-word lookup cache in place.
#
# @return [Hash] the (now empty) cache
def clear_cache
  @lookup_cache.clear
end
201
+
202
+ private
203
+
204
# Resolves the POS tag and lemma of a word, caching the answer per word.
#
# @param word [String, nil]
# @return [Hash] +{ pos_tag:, lemma: }+; both nil for nil or empty input
#   (those inputs are not cached) and for unknown words
def lookup_with_pos(word)
  return { pos_tag: nil, lemma: nil } if word.nil? || word.empty?

  @lookup_cache.fetch(word) do
    form = @lookuper.good_forms(word).first
    tag = derive_pos_tag(form)
    @lookup_cache[word] = { pos_tag: tag, lemma: form&.stem }
  end
end
213
+
214
# Picks a POS tag for a lookup result: the first flag that maps to a tag
# wins; without such a flag the tag is guessed from the affix.
#
# @param result [Object, nil] lookup form responding to +flags+
# @return [String, nil] POS tag, or nil when +result+ is nil or nothing matches
def derive_pos_tag(result)
  return nil unless result

  flags = result.flags&.to_a || []
  from_flags = nil
  # +find+ stops at the first flag whose mapping is truthy.
  flags.find { |flag| from_flags = flag_to_pos(flag) }
  from_flags || guess_pos_from_affix(result)
end
224
+
225
# Maps one flag to a POS tag: an exact key match first, otherwise the
# entry for the flag's first character.
#
# @param flag [String]
# @return [String, nil] POS tag, or nil when neither key exists
def flag_to_pos(flag)
  @flag_mapping.fetch(flag) { @flag_mapping[flag[0]] }
end
230
+
231
# Guesses a POS tag from the suffix recorded on a lookup result.
#
# @param result [Object] lookup form responding to +suffix+
# @return [String, nil] guessed tag, or nil when there is no suffix
def guess_pos_from_affix(result)
  ending = result.suffix
  ending ? guess_pos_from_suffix(ending) : nil
end
236
+
237
# Guesses a POS tag from a Spanish suffix. The suffix must equal one of
# the listed endings exactly; the groups are tried in the order below.
#
# The adjective pattern used to start with a literal "%" instead of an
# anchor, so no real suffix could ever be tagged ADJ, and it listed "oso"
# twice; the second is taken to be the feminine "osa" (like "ico"/"ica").
# All patterns are anchored with \A and \z so they test the whole string.
#
# @param suffix [String]
# @return [String, nil] 'VERB', 'ADV', 'NOUN', 'ADJ', or nil when unknown
def guess_pos_from_suffix(suffix)
  return 'VERB' if suffix.match?(/\A(ar|er|ir|ando|iendo|ado|ido|ó)\z/)
  return 'ADV' if suffix.match?(/\A(mente)\z/)
  return 'NOUN' if suffix.match?(/\A(ción|sión|miento|dad|eza|ismo|ista|or|nte|aje)\z/)
  return 'ADJ' if suffix.match?(/\A(oso|osa|able|ible|ble|ico|ica|ante)\z/)

  nil
end
245
+ end
246
+
247
# Spanish grammar rules module.
module GrammarRules
  # Base class for grammar rules: carries the identifying metadata and
  # leaves #check to subclasses.
  class Rule
    attr_reader :id, :name, :description

    # @param id [String] rule identifier, reported as +:rule_id+ in errors
    # @param name [String] short rule name
    # @param description [String] explanation of the rule
    def initialize(id, name, description)
      @id = id
      @name = name
      @description = description
    end

    # @param _tokens [Array<Hash>] token hashes
    # @raise [NotImplementedError] always; subclasses must override
    def check(_tokens)
      raise NotImplementedError, "#{self.class} must implement #check"
    end
  end

  # Rule: Inverted punctuation (¡, ¿)
  class InvertedPunctuationRule < Rule
    # Closing mark => the opening mark it requires.
    OPENING_MARKS = { '?' => '¿', '!' => '¡' }.freeze
    # Tokens that end a sentence and therefore end the search for an opener.
    SENTENCE_ENDERS = %w[. ? !].freeze

    def initialize
      super('ES_INVERTED_PUNCTUATION', 'Inverted Punctuation', 'Spanish requires inverted punctuation marks (¡, ¿) at the start of exclamations/questions.')
    end

    # Reports every "?" or "!" that has no matching "¿" or "¡" between it
    # and the previous sentence-ending token (".", "?" or "!").
    #
    # @param tokens [Array<Hash>, nil] token hashes with +:token+ and +:position+
    # @return [Array<Hash>] one error hash per unmatched closing mark;
    #   +[]+ for nil or empty input
    def check(tokens)
      return [] if tokens.nil? || tokens.empty?

      # Opening marks seen since the last sentence ender. One forward pass
      # gives the same answer as scanning backwards from each closing mark.
      opened = {}
      tokens.each_with_object([]) do |token, errors|
        mark = token[:token]
        next if mark.nil? || mark.empty?

        if OPENING_MARKS.value?(mark)
          opened[mark] = true
          next
        end
        next unless SENTENCE_ENDERS.include?(mark)

        expected = OPENING_MARKS[mark]
        errors << missing_mark_error(token, mark, expected) if expected && !opened[expected]
        opened.clear
      end
    end

    private

    # Builds the error hash for a closing mark whose opener is missing.
    def missing_mark_error(token, mark, expected)
      wrapped = "#{expected}...#{mark}"
      {
        rule_id: id,
        position: token[:position],
        message: "Missing inverted punctuation mark: use '#{expected}' at the start",
        suggestion: wrapped,
        context: mark,
        suggestions: [wrapped]
      }
    end
  end

  # Rule: Gender agreement
  class GenderAgreementRule < Rule
    def initialize
      super('ES_GENDER_AGREEMENT', 'Gender Agreement', 'Nouns and adjectives must agree in gender.')
    end

    # Simplified implementation: never reports anything.
    #
    # @return [Array] always empty
    def check(_tokens)
      []
    end
  end

  # Lookup helpers for the built-in rules.
  class RuleRegistry
    class << self
      # @return [Array<Rule>] fresh instances of every built-in rule
      def default_rules
        [InvertedPunctuationRule.new, GenderAgreementRule.new]
      end

      # @param id [String] rule identifier
      # @return [Rule, nil] a fresh rule instance with that id, or nil
      def get_rule(id)
        default_rules.find { |rule| rule.id == id }
      end
    end
  end
end
329
+
330
# Registration: calls the class-level `register` hook (defined outside this
# file) once for the bare language code and once per regional code, in
# this order.
%w[
  es
  es-ES es-MX es-AR es-CO es-PE es-VE es-CL es-EC
  es-GT es-CU es-BO es-DO es-HN es-PY es-SV es-NI
  es-CR es-PA es-UY es-PR
].each { |locale| register locale }
352
+
353
# Affix (:aff) and word-list (:dic) file locations per locale code. Codes
# without an entry fall back to es-ES in resolve_hunspell_paths.
# NOTE(review): these are relative paths under spec/; confirm they resolve
# from an installed gem and not only from a source checkout.
HUNSPELL_DICTIONARIES = {
  'es-ES' => { aff: 'spec/integrational/fixtures/es_ES.aff', dic: 'spec/integrational/fixtures/es_ES.dic' },
  'es-MX' => { aff: 'spec/integrational/fixtures/es_MX.aff', dic: 'spec/integrational/fixtures/es_MX.dic' }
}.freeze
363
+
364
# Human-readable names for the region codes, used by #description.
VARIANT_NAMES = {
  'ES' => 'European',    'MX' => 'Mexican',    'AR' => 'Argentinian',
  'CO' => 'Colombian',   'PE' => 'Peruvian',   'VE' => 'Venezuelan',
  'CL' => 'Chilean',     'EC' => 'Ecuadorian', 'GT' => 'Guatemalan',
  'CU' => 'Cuban',       'BO' => 'Bolivian',   'DO' => 'Dominican',
  'HN' => 'Honduran',    'PY' => 'Paraguayan', 'SV' => 'Salvadoran',
  'NI' => 'Nicaraguan',  'CR' => 'Costa Rican', 'PA' => 'Panamanian',
  'UY' => 'Uruguayan',   'PR' => 'Puerto Rican'
}.freeze
386
+
387
# @param code [String] locale code such as "es" or "es-MX"
# @param name [String] display name of the language
# @param variant [String, nil] region code; when omitted it is taken from
#   the part of +code+ after the first "-"
def initialize(code: "es", name: "Spanish", variant: nil)
  region = variant || extract_region_code(code)
  super(code: code, name: name, variant: region)
  @hunspell_paths = resolve_hunspell_paths(code)
end
392
+
393
# Display text for the language: the plain name, or the name followed by
# the variant in parentheses. Region codes missing from VARIANT_NAMES are
# shown as they are.
#
# @return [String]
def description
  region = variant
  return name unless region

  label = VARIANT_NAMES[region] || region
  "#{name} (#{label})"
end
398
+
399
# Tokenizer for this language, created on first use and reused afterwards.
#
# @return [Tokenizer]
def tokenizer
  return @tokenizer if @tokenizer

  @tokenizer = Tokenizer.new
end
402
+
403
# Normalizer for this language, created on first use and reused afterwards.
#
# @return [Language::Normalizer::Base]
def normalizer
  return @normalizer if @normalizer

  @normalizer = Language::Normalizer::Base.new
end
406
+
407
# @return [Class] the dictionary class used for this language
def dictionary_class
  Dictionary::UnixWords
end
410
+
411
# Word-list location for the current locale code: dedicated lists for
# es-ES and es-MX, the generic list for every other code.
#
# @return [Array<String>] a one-element list of paths
def default_dictionary_paths
  word_list = {
    "es-ES" => "/usr/share/dict/spanish",
    "es-MX" => "/usr/share/dict/mexican"
  }.fetch(code, "/usr/share/dict/words")
  [word_list]
end
421
+
422
# @return [Symbol] the script this language is written in, always +:latin+
def script_type
  :latin
end
425
+
426
# Builds a new spell checker from the dictionary paths resolved in the
# constructor.
#
# @return [SpellChecker]
def create_spell_checker
  options = { aff_path: @hunspell_paths[:aff], dic_path: @hunspell_paths[:dic], script: :latin }
  SpellChecker.new(**options)
end
433
+
434
# Builds a new tokenizer; unlike #tokenizer, nothing is memoized.
#
# @return [Tokenizer]
def create_tokenizer
  Tokenizer.new
end
437
+
438
# Builds a new POS tagger from the dictionary paths resolved in the
# constructor, using the default flag table.
#
# @return [POSTagger]
def create_pos_tagger
  options = {
    aff_path: @hunspell_paths[:aff],
    dic_path: @hunspell_paths[:dic],
    script: :latin,
    flag_mapping: POSTagger::FLAG_TO_POS
  }
  POSTagger.new(**options)
end
446
+
447
+ private
448
+
449
# Extracts the upper-cased region from a locale code: everything after
# the first "-".
#
# @param code [String] locale code such as "es" or "es-mx"
# @return [String, nil] region part, or nil when +code+ contains no "-"
def extract_region_code(code)
  _language, separator, region = code.partition("-")
  separator.empty? ? nil : region.upcase
end
453
+
454
# Finds the dictionary file paths for a locale code, falling back to the
# es-ES entry for codes without one.
#
# @param code [String] locale code
# @return [Hash] +{ aff:, dic: }+ entry from HUNSPELL_DICTIONARIES
def resolve_hunspell_paths(code)
  HUNSPELL_DICTIONARIES.fetch(code) { HUNSPELL_DICTIONARIES['es-ES'] }
end
457
+ end
458
+ end
459
+ end