kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,493 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../readers/lookup_builder'
4
+ require_relative '../../components/spell_checker'
5
+ require_relative '../../components/pos_tagger'
6
+ require_relative '../../language/normalizer/base'
7
+
8
+ module Kotoshu
9
+ module Languages
10
+ # French language implementation.
11
+ #
12
+ # Supports multiple dialects: fr-FR, fr-CA, fr-BE, fr-CH, fr-LU, fr-MC
13
+ #
14
+ # Full Hunspell integration with spell checking, POS tagging, and grammar rules.
15
+ class French < Language::Base
16
+ # French spell checker with Hunspell integration.
17
+ #
18
+ # Uses the Lookup algorithm with Hunspell-format dictionaries
19
+ # and French-specific character handling (accents, ligatures).
20
+ class SpellChecker < Components::SpellChecker
21
+ attr_reader :aff_path, :dic_path, :script
22
+
23
+ # French-specific character substitutions for suggestions
24
+ FRENCH_SUBSTITUTIONS = {
25
+ 'à' => %w[a],
26
+ 'â' => %w[a],
27
+ 'ä' => %w[a],
28
+ 'é' => %w[e],
29
+ 'è' => %w[e],
30
+ 'ê' => %w[e],
31
+ 'ë' => %w[e],
32
+ 'î' => %w[i],
33
+ 'ï' => %w[i],
34
+ 'ô' => %w[o],
35
+ 'ö' => %w[o],
36
+ 'ù' => %w[u],
37
+ 'û' => %w[u],
38
+ 'ü' => %w[u],
39
+ 'ç' => %w[c],
40
+ 'œ' => %w[oe],
41
+ 'æ' => %w[ae],
42
+ # Common French errors
43
+ 'c' => %w[ç], # garçon vs garcon
44
+ 'e' => %w[é è ê], # café vs caffe
45
+ 'a' => %w[à], # à vs a
46
+ }.freeze
47
+
48
+ def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8')
49
+ @aff_path = aff_path
50
+ @dic_path = dic_path
51
+ @script = script
52
+ @encoding = encoding
53
+ @lookuper = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script).build
54
+ end
55
+
56
+ def check(word)
57
+ return { found: false, stem: nil, flags: [] } if word.nil? || word.empty?
58
+ first_form = @lookuper.good_forms(word).first
59
+ if first_form
60
+ { found: true, stem: first_form.stem || word, flags: first_form.flags&.to_a || [] }
61
+ else
62
+ { found: false, stem: nil, flags: [] }
63
+ end
64
+ end
65
+
66
+ def suggest(word, max_suggestions: 10)
67
+ return [] if word.nil? || word.empty?
68
+ first_form = @lookuper.good_forms(word).first
69
+ return [] if first_form
70
+ generate_suggestions(word, max_suggestions).take(max_suggestions)
71
+ end
72
+
73
+ def correct?(word)
74
+ check(word)[:found]
75
+ end
76
+
77
+ def lookuper
78
+ @lookuper
79
+ end
80
+
81
+ private
82
+
83
# Computes the Levenshtein edit distance (insertions, deletions and
# substitutions) between two strings, counted in characters.
#
# @param a [String] first string
# @param b [String] second string
# @return [Integer] minimum number of single-character edits turning a into b
def calculate_distance(a, b)
  return a.length if b.empty?
  return b.length if a.empty?

  # Split into characters once so the inner loop does plain array reads
  # instead of repeated String#[] lookups on multibyte text.
  source = a.chars
  target = b.chars

  # Each cell depends only on the row above and the cell to its left, so
  # two rows are enough; the full matrix is never needed.
  previous = (0..target.length).to_a
  source.each_with_index do |source_char, i|
    current = [i + 1]
    target.each_with_index do |target_char, j|
      cost = source_char == target_char ? 0 : 1
      current << [previous[j + 1] + 1, current[j] + 1, previous[j] + cost].min
    end
    previous = current
  end
  previous.last
end
96
+
97
# Scores a suggestion against the original word.
#
# The score is the normalized similarity (1.0 minus edit distance divided
# by the longer length) reduced by 0.05 per rank step, floored at 0.0.
#
# @param original [String] the word as typed
# @param suggestion [String] the candidate correction
# @param rank [Numeric] position of the candidate; each step costs 0.05
# @return [Float] score, never below 0.0
def calculate_score(original, suggestion, rank)
  penalty = rank * 0.05
  longest = [original.length, suggestion.length].max

  # Two empty strings are identical. Answer directly instead of dividing
  # by zero, which would produce NaN rather than a usable score.
  return [1.0 - penalty, 0.0].max if longest.zero?

  similarity = 1.0 - (calculate_distance(original, suggestion).to_f / longest)
  [similarity - penalty, 0.0].max
end
104
+
105
# Builds ranked correction candidates for a word the dictionary rejected.
#
# Candidates come from four edit families (accent/ligature restoration,
# doubling a letter, deleting a letter, FRENCH_SUBSTITUTIONS replacements).
# Each distinct candidate is looked up once and kept only if @lookuper
# reports at least one good form for it.
#
# @param word [String] the misspelled word
# @param max_suggestions [Integer] upper bound on the number of results
# @return [Array<Hash>] hashes with :word, :distance and :score, ordered by
#   ascending distance; ties keep the order in which they were generated
def generate_suggestions(word, max_suggestions)
  accepted = suggestion_candidates(word).uniq.select do |candidate|
    @lookuper.good_forms(candidate).first
  end

  scored = accepted.map do |candidate|
    { word: candidate, distance: calculate_distance(word, candidate), score: calculate_score(word, candidate, 0) }
  end

  # sort_by alone does not promise a stable order; adding the generation
  # index as a tie-breaker makes equal-distance results deterministic.
  ordered = scored.sort_by.with_index { |entry, position| [entry[:distance], position] }
  ordered.first(max_suggestions)
end

# Produces every raw edit of +word+ (duplicates included, not yet checked
# against the dictionary), in a fixed generation order.
def suggestion_candidates(word)
  candidates = []
  lowered = word.downcase

  # Missing accents and ligatures. Variants are matched as substrings so
  # that the two-letter spellings "oe"/"ae" can be restored to "œ"/"æ";
  # comparing a single character against them could never succeed.
  (0...word.length).each do |index|
    FRENCH_SUBSTITUTIONS.each do |replacement, variants|
      variants.each do |variant|
        next unless lowered[index, variant.length] == variant

        restored = word.dup
        restored[index, variant.length] = replacement
        candidates << restored
      end
    end
  end

  # Doubled letters (the first letter is never doubled).
  (1...word.length).each do |index|
    candidates << word.dup.insert(index, word[index])
  end

  # Deleted letters.
  (0...word.length).each do |index|
    shortened = word.dup
    shortened.slice!(index)
    candidates << shortened unless shortened.empty?
  end

  # Common substitutions keyed by the (lower-cased) character itself.
  word.chars.each_with_index do |char, index|
    FRENCH_SUBSTITUTIONS.fetch(char.downcase, []).each do |substitute|
      swapped = word.dup
      swapped[index] = substitute
      candidates << swapped
    end
  end

  candidates
end
152
+ end
153
+
154
+ # French tokenizer with contraction handling.
155
+ class Tokenizer < Language::Tokenizer::FrenchTokenizer
156
+ end
157
+
158
+ # French POS tagger.
159
+ #
160
+ # Derives POS tags from Hunspell flags using French-specific mappings.
161
+ class POSTagger < Components::PosTagger
162
+ # French POS flag mappings based on Hunspell French dictionaries
163
+ FLAG_TO_POS = {
164
+ # Nouns
165
+ 'N' => 'NOUN', 'NN' => 'NOUN', 'NNS' => 'NOUN', 'NNP' => 'NOUN_PROPER',
166
+ # Verbs
167
+ 'V' => 'VERB', 'VB' => 'VERB', 'VBD' => 'VERB', 'VBG' => 'VERB', 'VBN' => 'VERB',
168
+ 'VBP' => 'VERB', 'VBZ' => 'VERB',
169
+ # Adjectives
170
+ 'A' => 'ADJ', 'JJ' => 'ADJ', 'JJR' => 'ADJ', 'JJS' => 'ADJ',
171
+ # Adverbs
172
+ 'R' => 'ADV', 'RB' => 'ADV', 'RBR' => 'ADV', 'RBS' => 'ADV',
173
+ # Determiners
174
+ 'D' => 'DET', 'DT' => 'DET', 'PDT' => 'DET',
175
+ # Pronouns
176
+ 'P' => 'PRON', 'PP' => 'PRON', 'PRP' => 'PRON', 'PRP$' => 'PRON_POSS',
177
+ 'WP' => 'PRON', 'WP$' => 'PRON_POSS',
178
+ # Prepositions
179
+ 'I' => 'PREP', 'IN' => 'PREP',
180
+ # Conjunctions
181
+ 'C' => 'CONJ', 'CC' => 'CONJ',
182
+ # Particles
183
+ 'U' => 'PART', 'RP' => 'PART',
184
+ # Interjections
185
+ 'INTJ' => 'INTJ', 'UH' => 'INTJ',
186
+ # Numbers
187
+ 'CD' => 'NUM',
188
+ # Foreign words
189
+ 'FW' => 'X',
190
+ # Punctuation
191
+ 'PUNCT' => 'PUNCT', '.' => 'PUNCT', ',' => 'PUNCT', '!' => 'PUNCT',
192
+ '?' => 'PUNCT', ';' => 'PUNCT', ':' => 'PUNCT'
193
+ }.freeze
194
+
195
+ attr_reader :aff_path, :dic_path, :script
196
+
197
+ def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8', flag_mapping: FLAG_TO_POS)
198
+ @aff_path = aff_path
199
+ @dic_path = dic_path
200
+ @script = script
201
+ @encoding = encoding
202
+ @flag_mapping = flag_mapping
203
+ @lookuper = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script).build
204
+ @lookup_cache = {}
205
+ end
206
+
207
+ def tag(tokens)
208
+ return [] if tokens.nil? || tokens.empty?
209
+ tokens.map do |token|
210
+ word = token[:token]
211
+ if word.nil? || word.empty?
212
+ token.merge(pos_tag: nil, lemma: nil)
213
+ else
214
+ lookup_result = lookup_with_pos(word)
215
+ token.merge(pos_tag: lookup_result[:pos_tag], lemma: lookup_result[:lemma] || word)
216
+ end
217
+ end
218
+ end
219
+
220
+ def flag_mapping
221
+ @flag_mapping
222
+ end
223
+
224
+ def flag_mapping=(mapping)
225
+ @flag_mapping = mapping
226
+ end
227
+
228
+ def clear_cache
229
+ @lookup_cache.clear
230
+ end
231
+
232
+ private
233
+
234
+ def lookup_with_pos(word)
235
+ return { pos_tag: nil, lemma: nil } if word.nil? || word.empty?
236
+ return @lookup_cache[word] if @lookup_cache.key?(word)
237
+ first_form = @lookuper.good_forms(word).first
238
+ pos_tag = derive_pos_tag(first_form)
239
+ cache_result = { pos_tag: pos_tag, lemma: first_form&.stem }
240
+ @lookup_cache[word] = cache_result
241
+ cache_result
242
+ end
243
+
244
+ def derive_pos_tag(result)
245
+ return nil unless result
246
+ flags = result.flags&.to_a || []
247
+ return guess_pos_from_affix(result) if flags.empty?
248
+ flags.each do |flag|
249
+ pos_tag = flag_to_pos(flag)
250
+ return pos_tag if pos_tag
251
+ end
252
+ guess_pos_from_affix(result)
253
+ end
254
+
255
+ def flag_to_pos(flag)
256
+ return @flag_mapping[flag] if @flag_mapping.key?(flag)
257
+ first_char = flag[0]
258
+ @flag_mapping[first_char]
259
+ end
260
+
261
+ def guess_pos_from_affix(result)
262
+ suffix = result.suffix
263
+ return guess_pos_from_suffix(suffix) if suffix
264
+ nil
265
+ end
266
+
267
# Guesses a coarse part-of-speech tag from an affix suffix.
#
# Checks run in order and the first hit wins, so any suffix ending in
# "ment" is reported as ADV before the noun patterns are consulted.
#
# @param suffix [String, nil] suffix text taken from a lookup result
# @return [String, nil] 'VERB', 'ADV', 'NOUN', 'ADJ', or nil when nothing matches
def guess_pos_from_suffix(suffix)
  return nil if suffix.nil? || suffix.empty?

  # \A and \z anchor the whole string; ^ and $ would also match a single
  # line inside a multi-line value.
  return 'VERB' if suffix.match?(/\A(?:er|ir|re|is|it|issent|issons|issez)\z/)
  return 'ADV' if suffix.end_with?('ment')

  # "ment" is not listed here because the ADV check above already catches
  # it. The "ée" suffix is accepted both precomposed (U+00E9) and as
  # "e" + combining acute (U+0301), since either encoding may appear.
  return 'NOUN' if suffix.match?(/\A(?:tion|sion|age|ure|\u00E9e|e\u0301e)\z/)
  return 'ADJ' if suffix.match?(/\A(?:if|ive|eux|euse|able|ible)\z/)

  nil
end
275
+ end
276
+
277
+ # French grammar rules module.
278
module GrammarRules
  # Base class for French grammar rules.
  #
  # Subclasses override #check. The rules in this module read tokens as
  # hashes through the :token, :pos_tag and :position keys.
  class Rule
    attr_reader :id, :name, :description

    # @param id [String] stable rule identifier
    # @param name [String] short human-readable name
    # @param description [String] one-sentence explanation of the rule
    def initialize(id, name, description)
      @id = id
      @name = name
      @description = description
    end

    # @param tokens [Array<Hash>] token hashes to inspect
    # @return [Array<Hash>] one hash per problem found
    # @raise [NotImplementedError] always, until a subclass overrides it
    def check(tokens)
      raise NotImplementedError, "#{self.class} must implement #check"
    end

    # Whether the rule is relevant at the given token index. The base
    # implementation accepts every position.
    def applies?(tokens, index)
      true
    end
  end

  # Rule: Article agreement with gender/number.
  #
  # Heuristic only: a masculine singular article tagged DET followed by a
  # word ending in "e" is reported so the writer can verify the gender.
  class ArticleAgreementRule < Rule
    MASCULINE_SINGULAR = %w[le un].freeze
    FEMININE_SINGULAR = %w[la une].freeze
    PLURAL = %w[les des].freeze

    # Feminine article matching each masculine one, so "le" is answered
    # with "la" and "un" with "une" rather than offering both for either.
    FEMININE_COUNTERPARTS = { 'le' => 'la', 'un' => 'une' }.freeze

    def initialize
      super('FR_ARTICLE_AGREEMENT', 'Article Agreement', 'Articles must agree with noun gender and number.')
    end

    # @param tokens [Array<Hash>, nil] token hashes; nil is treated as empty
    # @return [Array<Hash>] hashes with :rule_id, :position, :message,
    #   :suggestion, :context and :suggestions
    def check(tokens)
      return [] if tokens.nil?

      errors = []
      tokens.each_cons(2) do |article_token, noun_token|
        article = article_token[:token]&.downcase
        # Only masculine singular articles can trigger this heuristic.
        next unless MASCULINE_SINGULAR.include?(article)
        # This is a simplified check - full implementation would need
        # dictionary lookup for gender/number information.
        next unless article_token[:pos_tag] == 'DET'

        noun = noun_token[:token]
        next unless noun&.end_with?('e')

        # Possibly incorrect: masculine article with feminine-looking noun
        errors << {
          rule_id: @id,
          position: article_token[:position],
          message: "Article agreement: check if '#{noun}' is feminine",
          suggestion: nil,
          context: "#{article} #{noun}",
          suggestions: [FEMININE_COUNTERPARTS.fetch(article)]
        }
      end
      errors
    end
  end

  # Rule: two-part negation in French (ne ... pas and its relatives).
  class FrenchNegationRule < Rule
    # "n'" is listed with both the ASCII and the typographic apostrophe.
    NEGATION_PARTICLES = ["ne", "n'", "n\u2019"].freeze

    # Words that complete a negation opened by "ne". Besides pas/plus/
    # jamais/rien/personne this includes the restrictive "que" and the
    # other standard correlates, so correct sentences such as
    # "je n'ai que deux euros" are not reported.
    SECOND_PARTICLES = [
      'pas', 'plus', 'jamais', 'rien', 'personne',
      'que', "qu'", "qu\u2019", 'guère', 'point', 'nullement',
      'aucun', 'aucune', 'nul', 'nulle', 'ni'
    ].freeze

    # How many tokens after "ne" are searched for the second particle.
    LOOKAHEAD = 4

    def initialize
      super('FR_NEGATION', 'French Negation', 'French uses double negation (ne...pas).')
    end

    # @param tokens [Array<Hash>, nil] token hashes; nil is treated as empty
    # @return [Array<Hash>] hashes with :rule_id, :position, :message,
    #   :suggestion, :context and :suggestions
    def check(tokens)
      return [] if tokens.nil?

      errors = []
      tokens.each_with_index do |token, idx|
        word = token[:token]&.downcase
        next unless NEGATION_PARTICLES.include?(word)

        # Check if second negation particle exists within reasonable distance
        window = tokens[idx + 1, LOOKAHEAD] || []
        next if window.any? { |candidate| SECOND_PARTICLES.include?(candidate[:token]&.downcase) }

        errors << {
          rule_id: @id,
          position: token[:position],
          message: "Incomplete negation: French requires double negation (ne...pas)",
          suggestion: 'Add pas or another negation particle',
          context: word,
          suggestions: ['ne...pas', 'ne...plus', 'ne...jamais']
        }
      end
      errors
    end
  end

  # Rule registry for French.
  class RuleRegistry
    class << self
      # @return [Array<Rule>] a fresh instance of every built-in rule
      def default_rules
        [ArticleAgreementRule.new, FrenchNegationRule.new]
      end

      # @param id [String] rule identifier
      # @return [Rule, nil] the matching rule, or nil when the id is unknown
      def get_rule(id)
        default_rules.find { |rule| rule.id == id }
      end
    end
  end
end
391
+
392
+ # Registration
393
+ register "fr"
394
+ register "fr-FR"
395
+ register "fr-CA"
396
+ register "fr-BE"
397
+ register "fr-CH"
398
+ register "fr-LU"
399
+ register "fr-MC"
400
+
401
+ HUNSPELL_DICTIONARIES = {
402
+ 'fr-FR' => {
403
+ aff: 'spec/integrational/fixtures/fr_FR.aff',
404
+ dic: 'spec/integrational/fixtures/fr_FR.dic'
405
+ },
406
+ 'fr-CA' => {
407
+ aff: 'spec/integrational/fixtures/fr_CA.aff',
408
+ dic: 'spec/integrational/fixtures/fr_CA.dic'
409
+ }
410
+ }.freeze
411
+
412
+ VARIANT_NAMES = {
413
+ 'FR' => 'France',
414
+ 'CA' => 'Canadian',
415
+ 'BE' => 'Belgian',
416
+ 'CH' => 'Swiss',
417
+ 'LU' => 'Luxembourgish',
418
+ 'MC' => 'Monégasque'
419
+ }.freeze
420
+
421
+ def initialize(code: "fr", name: "French", variant: nil)
422
+ variant ||= extract_region_code(code)
423
+ super(code: code, name: name, variant: variant)
424
+ @hunspell_paths = resolve_hunspell_paths(code)
425
+ end
426
+
427
+ def description
428
+ return name unless variant
429
+ variant_name = VARIANT_NAMES[variant] || variant
430
+ "#{name} (#{variant_name})"
431
+ end
432
+
433
+ def tokenizer
434
+ @tokenizer ||= Tokenizer.new
435
+ end
436
+
437
+ def normalizer
438
+ @normalizer ||= Language::Normalizer::Base.new
439
+ end
440
+
441
+ def dictionary_class
442
+ Dictionary::UnixWords
443
+ end
444
+
445
+ def default_dictionary_paths
446
+ case code
447
+ when "fr-FR"
448
+ ["/usr/share/dict/french"]
449
+ when "fr-CA"
450
+ ["/usr/share/dict/french-CA"]
451
+ else
452
+ ["/usr/share/dict/words"]
453
+ end
454
+ end
455
+
456
+ def script_type
457
+ :latin
458
+ end
459
+
460
+ def create_spell_checker
461
+ SpellChecker.new(
462
+ aff_path: @hunspell_paths[:aff],
463
+ dic_path: @hunspell_paths[:dic],
464
+ script: :latin
465
+ )
466
+ end
467
+
468
+ def create_tokenizer
469
+ Tokenizer.new
470
+ end
471
+
472
+ def create_pos_tagger
473
+ POSTagger.new(
474
+ aff_path: @hunspell_paths[:aff],
475
+ dic_path: @hunspell_paths[:dic],
476
+ script: :latin,
477
+ flag_mapping: POSTagger::FLAG_TO_POS
478
+ )
479
+ end
480
+
481
+ private
482
+
483
# Extracts the upper-cased region part of a hyphenated language code.
#
# @param code [String, Symbol, nil] language code such as "fr" or "fr-CA"
# @return [String, nil] everything after the first hyphen, upper-cased, or
#   nil when there is no hyphen or nothing follows it
def extract_region_code(code)
  _language, region = code.to_s.split('-', 2)
  # A trailing hyphen ("fr-") leaves an empty region. Report it as absent
  # so callers do not treat "" as a real variant.
  return nil if region.nil? || region.empty?

  region.upcase
end
487
+
488
+ def resolve_hunspell_paths(code)
489
+ HUNSPELL_DICTIONARIES[code] || HUNSPELL_DICTIONARIES['fr-FR']
490
+ end
491
+ end
492
+ end
493
+ end