kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,423 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../readers/lookup_builder'
4
+ require_relative '../../components/spell_checker'
5
+ require_relative '../../components/pos_tagger'
6
+ require_relative '../../language/normalizer/base'
7
+
8
+ module Kotoshu
9
+ module Languages
10
+ # Portuguese language implementation.
11
+ #
12
+ # Supports multiple dialects: pt-BR, pt-PT, pt-AO, pt-MZ, pt-GW, pt-CV
13
+ #
14
+ # Full Hunspell integration with spell checking, POS tagging, and grammar rules
15
+ # specifically handling Portuguese accents and Brazilian vs European differences.
16
+ class Portuguese < Language::Base
17
+ # Portuguese spell checker with Hunspell integration.
18
+ class SpellChecker < Components::SpellChecker
19
+ attr_reader :aff_path, :dic_path, :script
20
+
21
+ # Portuguese-specific character substitutions
22
+ PORTUGUESE_SUBSTITUTIONS = {
23
+ 'á' => %w[a],
24
+ 'â' => %w[a],
25
+ 'ã' => %w[a],
26
+ 'à' => %w[a],
27
+ 'é' => %w[e],
28
+ 'ê' => %w[e],
29
+ 'í' => %w[i],
30
+ 'ó' => %w[o],
31
+ 'ô' => %w[o],
32
+ 'õ' => %w[o],
33
+ 'ú' => %w[u],
34
+ 'ü' => %w[u],
35
+ 'ç' => %w[c],
36
+ }.freeze
37
+
38
+ def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8')
39
+ @aff_path = aff_path
40
+ @dic_path = dic_path
41
+ @script = script
42
+ @encoding = encoding
43
+ @lookuper = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script).build
44
+ end
45
+
46
# Checks a single word against the lookup object held in +@lookuper+.
#
# @param word [String, nil] word to verify
# @return [Hash] +{ found:, stem:, flags: }+; +stem+ falls back to the word
#   itself and +flags+ to an empty array when the matched form has none
def check(word)
  miss = { found: false, stem: nil, flags: [] }
  return miss if word.nil? || word.empty?

  form = @lookuper.good_forms(word).first
  return miss unless form

  stem = form.stem || word
  flags = form.flags&.to_a || []
  { found: true, stem: stem, flags: flags }
end
55
+
56
# Returns spelling suggestions for a word.
#
# @param word [String, nil] word to find alternatives for
# @param max_suggestions [Integer] maximum number of entries returned
# @return [Array<Hash>] empty when the word is blank or already accepted by
#   the lookup; otherwise at most +max_suggestions+ generated entries
def suggest(word, max_suggestions: 10)
  blank = word.nil? || word.empty?
  return [] if blank || @lookuper.good_forms(word).first

  candidates = generate_suggestions(word, max_suggestions)
  candidates.take(max_suggestions)
end
62
+
63
# Tells whether the word is accepted, i.e. whether #check reports it as found.
#
# @param word [String, nil]
# @return [Boolean]
def correct?(word)
  outcome = check(word)
  outcome[:found]
end
66
+
67
+ def lookuper
68
+ @lookuper
69
+ end
70
+
71
+ private
72
+
73
# Computes the Levenshtein edit distance between two strings, counted in
# characters (not bytes).
#
# Only two rows of the DP table are kept, and both strings are split into
# character arrays once: indexing a String by character position is linear
# for multibyte UTF-8 text such as accented Portuguese words.
#
# @param a [String] first string
# @param b [String] second string
# @return [Integer] minimum number of single-character insertions,
#   deletions and substitutions turning +a+ into +b+
def calculate_distance(a, b)
  return a.length if b.empty?
  return b.length if a.empty?

  source = a.chars
  target = b.chars
  previous = (0..target.length).to_a

  source.each_with_index do |source_char, row|
    current = [row + 1]
    target.each_with_index do |target_char, col|
      cost = source_char == target_char ? 0 : 1
      current << [
        previous[col + 1] + 1, # deletion
        current[col] + 1,      # insertion
        previous[col] + cost   # substitution / match
      ].min
    end
    previous = current
  end

  previous.last
end
86
+
87
# Scores a suggestion between 0.0 and 1.0 from its edit distance to the
# original word, minus a penalty of 0.05 per rank position.
#
# @param original [String] the word as typed
# @param suggestion [String] the candidate replacement
# @param rank [Integer] position of the candidate; each step costs 0.05
# @return [Float] similarity score, never below 0.0
def calculate_score(original, suggestion, rank)
  distance = calculate_distance(original, suggestion)
  max_len = [original.length, suggestion.length].max
  # Two empty strings are identical; without this guard 0/0 would yield NaN.
  distance_score = max_len.zero? ? 1.0 : 1.0 - (distance.to_f / max_len)
  rank_penalty = rank * 0.05
  [distance_score - rank_penalty, 0.0].max
end
94
+
95
# Builds spelling suggestions for a word the lookup did not accept.
#
# Four families of single-edit candidates are produced: adding an accent,
# replacing an accented letter with its plain form, doubling a letter and
# dropping a letter. Only candidates accepted by +@lookuper+ are kept.
#
# Positions are taken from +word+ itself (each character is downcased for
# comparison) so the index used for replacement always refers to the same
# character, even if downcasing the whole word would change its length.
#
# @param word [String] the misspelled word
# @param max_suggestions [Integer] upper bound on the number of entries
# @return [Array<Hash>] +{ word:, distance:, score: }+ entries ordered by
#   ascending distance; ties are broken alphabetically so the result is
#   deterministic
def generate_suggestions(word, max_suggestions)
  candidates = []

  word.chars.each_with_index do |char, index|
    lowered = char.downcase

    # Missing accent: try every accented letter whose plain form is this one.
    PORTUGUESE_SUBSTITUTIONS.each do |accented, plain_forms|
      next unless plain_forms.include?(lowered)

      candidates << word.dup.tap { |copy| copy[index] = accented }
    end

    # Accented letter: fall back to its plain form(s).
    PORTUGUESE_SUBSTITUTIONS.fetch(lowered, []).each do |plain|
      candidates << word.dup.tap { |copy| copy[index] = plain }
    end

    # Doubled letter; as before, the first letter is never doubled.
    candidates << word.dup.insert(index, char) unless index.zero?

    # Dropped letter; an empty remainder is not a candidate.
    shortened = word.dup.tap { |copy| copy.slice!(index) }
    candidates << shortened unless shortened.empty?
  end

  # De-duplicate first so every candidate costs exactly one lookup.
  accepted = candidates.uniq.select { |candidate| @lookuper.good_forms(candidate).first }

  accepted
    .map do |suggestion|
      { word: suggestion, distance: calculate_distance(word, suggestion), score: calculate_score(word, suggestion, 0) }
    end
    .sort_by { |entry| [entry[:distance], entry[:word]] }
    .first(max_suggestions)
end
141
+ end
142
+
143
+ # Portuguese tokenizer with number and date handling.
144
+ class Tokenizer < Language::Tokenizer::PortugueseTokenizer
145
+ end
146
+
147
+ # Portuguese POS tagger.
148
+ class POSTagger < Components::PosTagger
149
+ FLAG_TO_POS = {
150
+ 'N' => 'NOUN', 'NN' => 'NOUN', 'NNS' => 'NOUN', 'NNP' => 'NOUN_PROPER',
151
+ 'V' => 'VERB', 'VB' => 'VERB', 'VBD' => 'VERB', 'VBG' => 'VERB', 'VBN' => 'VERB',
152
+ 'VBP' => 'VERB', 'VBZ' => 'VERB',
153
+ 'A' => 'ADJ', 'JJ' => 'ADJ', 'JJR' => 'ADJ', 'JJS' => 'ADJ',
154
+ 'R' => 'ADV', 'RB' => 'ADV', 'RBR' => 'ADV', 'RBS' => 'ADV',
155
+ 'D' => 'DET', 'DT' => 'DET', 'PDT' => 'DET',
156
+ 'P' => 'PRON', 'PP' => 'PRON', 'PRP' => 'PRON', 'PRP$' => 'PRON_POSS',
157
+ 'WP' => 'PRON', 'WP$' => 'PRON_POSS',
158
+ 'I' => 'PREP', 'IN' => 'PREP',
159
+ 'C' => 'CONJ', 'CC' => 'CONJ',
160
+ 'U' => 'PART', 'RP' => 'PART',
161
+ 'INTJ' => 'INTJ', 'UH' => 'INTJ',
162
+ 'CD' => 'NUM',
163
+ 'FW' => 'X',
164
+ 'PUNCT' => 'PUNCT', '.' => 'PUNCT', ',' => 'PUNCT', '!' => 'PUNCT',
165
+ '?' => 'PUNCT', ';' => 'PUNCT', ':' => 'PUNCT'
166
+ }.freeze
167
+
168
+ attr_reader :aff_path, :dic_path, :script
169
+
170
+ def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8', flag_mapping: FLAG_TO_POS)
171
+ @aff_path = aff_path
172
+ @dic_path = dic_path
173
+ @script = script
174
+ @encoding = encoding
175
+ @flag_mapping = flag_mapping
176
+ @lookuper = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script).build
177
+ @lookup_cache = {}
178
+ end
179
+
180
# Annotates each token hash with +:pos_tag+ and +:lemma+.
#
# @param tokens [Array<Hash>, nil] token hashes carrying the text under +:token+
# @return [Array<Hash>] copies of the tokens with the two keys merged in;
#   blank tokens get nil for both, and the lemma falls back to the token text
def tag(tokens)
  return [] if tokens.nil? || tokens.empty?

  tokens.map do |entry|
    text = entry[:token]
    next entry.merge(pos_tag: nil, lemma: nil) if text.nil? || text.empty?

    info = lookup_with_pos(text)
    entry.merge(pos_tag: info[:pos_tag], lemma: info[:lemma] || text)
  end
end
192
+
193
+ def flag_mapping
194
+ @flag_mapping
195
+ end
196
+
197
+ def flag_mapping=(mapping)
198
+ @flag_mapping = mapping
199
+ end
200
+
201
+ def clear_cache
202
+ @lookup_cache.clear
203
+ end
204
+
205
+ private
206
+
207
# Resolves the POS tag and lemma for a word, memoising the answer in
# +@lookup_cache+ (misses are cached too).
#
# @param word [String, nil]
# @return [Hash] +{ pos_tag:, lemma: }+; both nil for a blank word
def lookup_with_pos(word)
  return { pos_tag: nil, lemma: nil } if word.nil? || word.empty?

  @lookup_cache.fetch(word) do
    form = @lookuper.good_forms(word).first
    @lookup_cache[word] = { pos_tag: derive_pos_tag(form), lemma: form&.stem }
  end
end
216
+
217
# Picks a POS tag for a lookup result: the first flag that maps to a tag
# wins; with no flags, or none that map, the affix heuristic is used.
#
# @param result [#flags, nil] lookup form, or nil when the word was not found
# @return [String, nil]
def derive_pos_tag(result)
  return nil unless result

  flag_list = result.flags&.to_a || []
  mapped = flag_list.lazy.map { |flag| flag_to_pos(flag) }.find(&:itself)
  mapped || guess_pos_from_affix(result)
end
227
+
228
# Maps a flag to a POS tag via +@flag_mapping+: an exact key match is
# preferred, otherwise the flag's first character is tried as the key.
#
# @param flag [String]
# @return [String, nil]
def flag_to_pos(flag)
  @flag_mapping.fetch(flag) { @flag_mapping[flag[0]] }
end
233
+
234
# Falls back to the suffix heuristic when a result carries a suffix.
#
# @param result [#suffix] lookup form
# @return [String, nil] guessed tag, or nil when there is no suffix
def guess_pos_from_affix(result)
  ending = result.suffix
  ending ? guess_pos_from_suffix(ending) : nil
end
239
+
240
# Guesses a POS tag from a Portuguese suffix.
#
# The verb, noun and adjective checks require the whole suffix to equal one
# of the listed endings; the adverb check only requires it to end in "mente".
# The adjective pattern previously began with a literal "%" instead of an
# anchor, so it could never match an ordinary suffix.
#
# @param suffix [String]
# @return [String, nil] 'VERB', 'ADV', 'NOUN', 'ADJ', or nil when unknown
def guess_pos_from_suffix(suffix)
  return 'VERB' if suffix.match?(/\A(ar|er|ir|ando|endo|indo|ado|ido)\z/)
  return 'ADV' if suffix.end_with?('mente')
  return 'NOUN' if suffix.match?(/\A(ção|são|mento|dade|eza|ismo|ista|or|nte)\z/)
  return 'ADJ' if suffix.match?(/\A(oso|ável|ível|ico|ica|ante)\z/)

  nil
end
247
+ end
248
+
249
# Portuguese grammar rules module.
module GrammarRules
  # Base class for grammar rules: holds identifying metadata and requires
  # subclasses to implement #check.
  class Rule
    attr_reader :id, :name, :description

    # @param id [String] rule identifier
    # @param name [String] short rule name
    # @param description [String] explanation of the rule
    def initialize(id, name, description)
      @id = id
      @name = name
      @description = description
    end

    # @param tokens [Array<Hash>] token hashes to inspect
    # @return [Array<Hash>] detected errors (in subclasses)
    # @raise [NotImplementedError] always, in this base class
    def check(tokens)
      raise NotImplementedError, "#{self.class} must implement #check"
    end
  end

  # Rule: Personal infinitive agreement
  class PersonalInfinitiveRule < Rule
    def initialize
      super('PT_PERSONAL_INFINITIVE', 'Personal Infinitive', 'Personal infinitive must agree with the subject.')
    end

    # Simplified implementation: currently reports no errors.
    #
    # @param tokens [Array<Hash>]
    # @return [Array] always empty
    def check(tokens)
      []
    end
  end

  # Rule: Crase (à vs a)
  class CraseRule < Rule
    # Only the bare word "a" can contract into "à".
    TRIGGER_WORD = 'a'
    # Following word must begin with an "a" sound.
    A_INITIAL = /\A[aáãâä]/i

    def initialize
      super('PT_CRASE', 'Crase Usage', 'Use crase (à) before feminine nouns indicating place/time.')
    end

    # Flags every "a" that is directly followed by a word starting with an
    # "a" sound. This is a heuristic: it does not verify gender or context.
    #
    # @param tokens [Array<Hash>, nil] token hashes with +:token+ and +:position+
    # @return [Array<Hash>] one error hash per flagged "a"; empty for nil input
    def check(tokens)
      return [] if tokens.nil?

      tokens.each_cons(2).each_with_object([]) do |(prev_token, current_token), errors|
        prev_word = prev_token[:token]&.downcase
        next unless prev_word == TRIGGER_WORD

        next_word = current_token[:token]
        next if next_word.nil? || !next_word.match?(A_INITIAL)

        errors << {
          rule_id: @id,
          position: prev_token[:position],
          message: "Possible crase usage needed: '#{prev_word}' -> 'à'",
          suggestion: 'à',
          context: "#{prev_word} #{next_word}",
          suggestions: ['à']
        }
      end
    end
  end

  # Lookup of the rules that ship with the Portuguese language.
  class RuleRegistry
    class << self
      # @return [Array<Rule>] freshly built instances of every default rule
      def default_rules
        [PersonalInfinitiveRule.new, CraseRule.new]
      end

      # @param id [String] rule identifier
      # @return [Rule, nil] the default rule with that id, if any
      def get_rule(id)
        default_rules.find { |rule| rule.id == id }
      end
    end
  end
end
321
+
322
+ # Registration
323
+ register "pt"
324
+ register "pt-BR"
325
+ register "pt-PT"
326
+ register "pt-AO"
327
+ register "pt-MZ"
328
+ register "pt-GW"
329
+ register "pt-CV"
330
+
331
+ HUNSPELL_DICTIONARIES = {
332
+ 'pt-BR' => {
333
+ aff: 'spec/integrational/fixtures/pt_BR.aff',
334
+ dic: 'spec/integrational/fixtures/pt_BR.dic'
335
+ },
336
+ 'pt-PT' => {
337
+ aff: 'spec/integrational/fixtures/pt_PT.aff',
338
+ dic: 'spec/integrational/fixtures/pt_PT.dic'
339
+ }
340
+ }.freeze
341
+
342
+ VARIANT_NAMES = {
343
+ 'BR' => 'Brazilian',
344
+ 'PT' => 'European',
345
+ 'AO' => 'Angolan',
346
+ 'MZ' => 'Mozambican',
347
+ 'GW' => 'Guinea-Bissau',
348
+ 'CV' => 'Cape Verdean'
349
+ }.freeze
350
+
351
+ def initialize(code: "pt", name: "Portuguese", variant: nil)
352
+ variant ||= extract_region_code(code)
353
+ super(code: code, name: name, variant: variant)
354
+ @hunspell_paths = resolve_hunspell_paths(code)
355
+ end
356
+
357
# Human-readable language name, with the regional variant in parentheses
# when one is set (e.g. "Portuguese (Brazilian)"). Variant codes missing
# from VARIANT_NAMES are shown as-is.
#
# @return [String]
def description
  region = variant
  return name unless region

  label = VARIANT_NAMES.fetch(region, region)
  "#{name} (#{label})"
end
362
+
363
+ def tokenizer
364
+ @tokenizer ||= Tokenizer.new
365
+ end
366
+
367
+ def normalizer
368
+ @normalizer ||= Language::Normalizer::Base.new
369
+ end
370
+
371
+ def dictionary_class
372
+ Dictionary::UnixWords
373
+ end
374
+
375
# System word-list locations for the current language code.
#
# @return [Array<String>] a single-element list: the Brazilian or European
#   word list for "pt-BR" / "pt-PT", otherwise the generic system word list
def default_dictionary_paths
  regional = {
    "pt-BR" => "/usr/share/dict/brazilian",
    "pt-PT" => "/usr/share/dict/portuguese"
  }
  [regional.fetch(code, "/usr/share/dict/words")]
end
385
+
386
+ def script_type
387
+ :latin
388
+ end
389
+
390
+ def create_spell_checker
391
+ SpellChecker.new(
392
+ aff_path: @hunspell_paths[:aff],
393
+ dic_path: @hunspell_paths[:dic],
394
+ script: :latin
395
+ )
396
+ end
397
+
398
+ def create_tokenizer
399
+ Tokenizer.new
400
+ end
401
+
402
+ def create_pos_tagger
403
+ POSTagger.new(
404
+ aff_path: @hunspell_paths[:aff],
405
+ dic_path: @hunspell_paths[:dic],
406
+ script: :latin,
407
+ flag_mapping: POSTagger::FLAG_TO_POS
408
+ )
409
+ end
410
+
411
+ private
412
+
413
# Extracts the upper-cased part after the first "-" of a language code.
#
# @param code [String] e.g. "pt-BR"
# @return [String, nil] e.g. "BR"; nil when the code contains no "-"
def extract_region_code(code)
  _language, region = code.split("-", 2)
  region&.upcase
end
417
+
418
# Looks up the Hunspell .aff/.dic pair for a language code, defaulting to
# the 'pt-BR' entry for codes without their own dictionary.
#
# @param code [String] language code such as "pt-PT"
# @return [Hash] +{ aff:, dic: }+ paths
def resolve_hunspell_paths(code)
  HUNSPELL_DICTIONARIES.fetch(code) { HUNSPELL_DICTIONARIES['pt-BR'] }
end
421
+ end
422
+ end
423
+ end