kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,546 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../readers/lookup_builder'
4
+ require_relative '../../components/spell_checker'
5
+ require_relative '../../components/pos_tagger'
6
+ require_relative '../../language/normalizer/base'
7
+
8
+ module Kotoshu
9
+ module Languages
10
+ # German language implementation.
11
+ #
12
+ # Supports multiple dialects: de-DE, de-AT, de-CH, de-BE, de-IT, de-LI, de-LU
13
+ #
14
+ # Full Hunspell integration with spell checking, POS tagging, and grammar rules
15
+ # specifically handling German compound words and capitalization.
16
+ class German < Language::Base
17
+ # German spell checker with Hunspell integration.
18
+ #
19
+ # Uses the Lookup algorithm with Hunspell-format dictionaries
20
+ # and handles German-specific features (umlauts, ß, compound words).
21
+ class SpellChecker < Components::SpellChecker
22
+ attr_reader :aff_path, :dic_path, :script
23
+
24
# Substitution table used when generating suggestions: each key is the
# character to write, each value lists the spellings that may have been
# typed in its place.
GERMAN_SUBSTITUTIONS = [
  # Umlauts and sharp s, with their usual transliterations
  ['ä', %w[a ae]],
  ['ö', %w[o oe]],
  ['ü', %w[u ue]],
  ['ß', %w[ss sz]],
  # Reverse direction: an umlaut or ß typed where the plain letter belongs
  ['a', %w[ä]],
  ['o', %w[ö]],
  ['u', %w[ü]],
  ['s', %w[ß]]
].to_h.freeze
37
+
38
# Stores the constructor arguments and builds the lookup object.
#
# @param aff_path [Object] first argument handed to Readers::LookupBuilder
# @param dic_path [Object] second argument handed to Readers::LookupBuilder
# @param script [Symbol] forwarded as +script:+ (defaults to :latin)
# @param encoding [String] forwarded as +encoding:+ (defaults to 'UTF-8')
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8')
  @aff_path, @dic_path = aff_path, dic_path
  @script = script
  @encoding = encoding
  builder = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script)
  @lookuper = builder.build
end
45
+
46
# Looks the word up as written and, when that fails and the word contains
# upper-case letters, once more in lower case.
#
# @param word [String, nil]
# @return [Hash] +{ found:, stem:, flags: }+; +stem+ falls back to the
#   spelling that matched, +flags+ to an empty array
def check(word)
  miss = { found: false, stem: nil, flags: [] }
  return miss if word.nil? || word.empty?

  lowered = word.downcase
  candidates = [word]
  candidates << lowered unless lowered == word

  candidates.each do |candidate|
    form = @lookuper.good_forms(candidate).first
    next unless form

    return { found: true, stem: form.stem || candidate, flags: form.flags&.to_a || [] }
  end

  miss
end
62
+
63
# Returns up to +max_suggestions+ replacement candidates for a misspelled word.
#
# A word that #correct? accepts (including through the lower-case fallback
# in #check) yields no suggestions, so both methods agree on what counts
# as correctly spelled.
#
# @param word [String, nil]
# @param max_suggestions [Integer] upper bound on the number of results
# @return [Array<Hash>] entries as produced by #generate_suggestions
def suggest(word, max_suggestions: 10)
  return [] if word.nil? || word.empty?
  return [] if correct?(word)

  generate_suggestions(word, max_suggestions).take(max_suggestions)
end
69
+
70
# Reports the +:found+ entry of the #check result for the word.
#
# @param word [String, nil]
# @return [Boolean]
def correct?(word)
  outcome = check(word)
  outcome[:found]
end
73
+
74
# Returns the lookup object stored in @lookuper.
+ def lookuper
75
+ @lookuper
76
+ end
77
+
78
+ private
79
+
80
# Edit distance between two strings, counting single-character insertions,
# deletions and substitutions as one step each.
#
# @param a [String]
# @param b [String]
# @return [Integer]
def calculate_distance(a, b)
  return a.length if b.empty?
  return b.length if a.empty?

  # Only the previous row of the distance table is needed to fill the next.
  previous = (0..b.length).to_a
  a.each_char.with_index(1) do |left, row|
    current = [row]
    b.each_char.with_index(1) do |right, col|
      substitution = previous[col - 1] + (left == right ? 0 : 1)
      current << [previous[col] + 1, current[col - 1] + 1, substitution].min
    end
    previous = current
  end
  previous.last
end
93
+
94
# Scores a suggestion between 0.0 and 1.0: one minus the edit distance
# relative to the longer string, reduced by 0.05 per rank position.
#
# @param original [String] the word as typed
# @param suggestion [String] the proposed replacement
# @param rank [Integer] position penalty multiplier
# @return [Float] never negative
def calculate_score(original, suggestion, rank)
  longest = [original.length, suggestion.length].max
  # Two empty strings are identical; dividing by their zero length would
  # produce NaN instead of a usable score.
  similarity =
    if longest.zero?
      1.0
    else
      1.0 - (calculate_distance(original, suggestion).to_f / longest)
    end
  [similarity - (rank * 0.05), 0.0].max
end
101
+
102
# First elements tried when splitting a long word into two dictionary words.
COMPOUND_PREFIXES = %w[
  Arbeits Baum Bau Bauern Berg Buch Dach Dollar Dorf Ein Frauen Feuer Finanz Flug Franz
  Frei Haupt Haus Hoch Jahr Jung Kinder Klein Konsum Land Lehr Leben Leute Mann Markt Mein Milli
  Morgen Mutter Natur Papier Polizei Post Problem Recht Rhein Rot Sache Schule Schiff Schritt
  See Sozial Stadt Stein Steuer Strom Tag Teil Tier Tor Tour Typ Uhr Umwelt Unter Volk
  Wasser Weg Welt Wein Zeit
].freeze

# Builds candidate corrections for +word+ and keeps those the lookup accepts.
#
# Candidates come from, in this order: GERMAN_SUBSTITUTIONS replacements at
# every position (including two-letter spellings such as "ae" or "ss"),
# swapping all "ss" for "ß" or the reverse, capitalizing an all-lower-case
# word, doubling a letter, deleting a letter, and splitting words longer
# than ten characters after a known prefix.
#
# @param word [String] the misspelled word (non-empty)
# @param max_suggestions [Integer] accepted for interface compatibility;
#   the caller truncates the result
# @return [Array<Hash>] +{ word:, distance:, score: }+ entries ordered by
#   distance, ties in generation order
def generate_suggestions(word, max_suggestions)
  candidates = []

  # Character substitutions. The typed spelling is compared as a substring
  # so that multi-letter spellings can match, and case-insensitively so a
  # capital letter is still recognized.
  (0...word.length).each do |index|
    GERMAN_SUBSTITUTIONS.each do |target, typed_forms|
      typed_forms.each do |typed|
        segment = word[index, typed.length]
        next unless segment.downcase == typed

        variant = word.dup
        variant[index, typed.length] = match_case(target, segment)
        candidates << variant if known_word?(variant)
      end
    end
  end

  # ß vs ss, all occurrences at once
  swapped =
    if word.include?('ss')
      word.gsub('ss', 'ß')
    elsif word.include?('ß')
      word.gsub('ß', 'ss')
    end
  candidates << swapped if swapped && known_word?(swapped)

  # Capitalization (German nouns are capitalized)
  if word == word.downcase
    capitalized = word.capitalize
    candidates << capitalized if known_word?(capitalized)
  end

  # Doubled letters (the first letter is never doubled)
  (1...word.length).each do |index|
    doubled = word.dup.insert(index, word[index])
    candidates << doubled if known_word?(doubled)
  end

  # Deleted letters
  (0...word.length).each do |index|
    shortened = word.dup
    shortened.slice!(index)
    next if shortened.empty?

    candidates << shortened if known_word?(shortened)
  end

  # Compound word splitting: both halves must be dictionary words
  if word.length > 10
    COMPOUND_PREFIXES.each do |prefix|
      next unless word.start_with?(prefix)

      rest = word[prefix.length..]
      candidates << "#{prefix} #{rest}" if known_word?(prefix) && known_word?(rest)
    end
  end

  ranked = candidates.uniq.map do |suggestion|
    { word: suggestion, distance: calculate_distance(word, suggestion), score: calculate_score(word, suggestion, 0) }
  end
  # The generation index breaks ties so equal distances keep a fixed order.
  ranked.each_with_index.sort_by { |entry, order| [entry[:distance], order] }.map(&:first)
end

# True when the lookup returns at least one form for the candidate.
def known_word?(candidate)
  !@lookuper.good_forms(candidate).first.nil?
end

# Upper-cases the replacement when the typed text starts with a capital,
# unless upper-casing would change its length (as it does for "ß").
def match_case(replacement, typed)
  upper = replacement.upcase
  typed[0].match?(/\p{Upper}/) && upper.length == replacement.length ? upper : replacement
end
175
+ end
176
+
177
# Tokenizer for German text; adds nothing of its own to
# Language::Tokenizer::GermanTokenizer.
class Tokenizer < Language::Tokenizer::GermanTokenizer; end
180
+
181
+ # German POS tagger.
182
+ #
183
+ # Derives POS tags from Hunspell flags using German-specific mappings.
184
+ class POSTagger < Components::PosTagger
185
# Flag-to-POS lookup table. Each group below lists a POS tag followed by the
# flags that map to it; the keys include Penn-Treebank-style tags (NN, VBD,
# JJ ...) as well as German abbreviations (Sub, Adj, Prä ...). Groups are
# kept in the order the keys should appear in the resulting hash.
FLAG_TO_POS = [
  # Nouns (German nouns are capitalized)
  ['NOUN', %w[N NN NNS]],
  ['NOUN_PROPER', %w[NNP]],
  ['NOUN', %w[Sub]],
  # Verbs
  ['VERB', %w[V VB VBD VBG VBN VBP VBZ Vfin Vinf Vpp]],
  # Adjectives
  ['ADJ', %w[A JJ JJR JJS Adj]],
  # Adverbs
  ['ADV', %w[R RB RBR RBS Adv]],
  # Determiners
  ['DET', %w[D DT PDT Art]],
  # Pronouns
  ['PRON', %w[P PP PRP]],
  ['PRON_POSS', ['PRP$']],
  ['PRON', %w[WP]],
  ['PRON_POSS', ['WP$']],
  ['PRON', %w[Pro]],
  # Prepositions
  ['PREP', %w[I IN Prä]],
  # Conjunctions
  ['CONJ', %w[C CC Kon]],
  # Particles
  ['PART', %w[U RP Pt]],
  # Interjections
  ['INTJ', %w[INTJ UH Int]],
  # Numbers
  ['NUM', %w[CD Num]],
  # Foreign words
  ['X', %w[FW]],
  # Punctuation
  ['PUNCT', ['PUNCT', '.', ',', '!', '?', ';', ':']]
].each_with_object({}) do |(pos, flags), table|
  flags.each { |flag| table[flag] = pos }
end.freeze
228
+
229
+ attr_reader :aff_path, :dic_path, :script
230
+
231
# Stores the constructor arguments, builds the lookup object and starts
# with an empty per-word result cache.
#
# @param aff_path [Object] first argument handed to Readers::LookupBuilder
# @param dic_path [Object] second argument handed to Readers::LookupBuilder
# @param script [Symbol] forwarded as +script:+ (defaults to :latin)
# @param encoding [String] forwarded as +encoding:+ (defaults to 'UTF-8')
# @param flag_mapping [Hash] flag => POS tag table (defaults to FLAG_TO_POS)
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8', flag_mapping: FLAG_TO_POS)
  @aff_path, @dic_path = aff_path, dic_path
  @script = script
  @encoding = encoding
  @flag_mapping = flag_mapping
  @lookup_cache = {}
  builder = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script)
  @lookuper = builder.build
end
240
+
241
# Adds +:pos_tag+ and +:lemma+ to every token hash.
#
# Tokens whose +:token+ text is nil or empty get nil for both; otherwise the
# lemma falls back to the token text when the lookup provides none.
#
# @param tokens [Array<Hash>, nil] hashes carrying a +:token+ entry
# @return [Array<Hash>] new hashes; empty for nil or empty input
def tag(tokens)
  return [] if tokens.nil? || tokens.empty?

  tokens.map do |token|
    surface = token[:token]
    next token.merge(pos_tag: nil, lemma: nil) if surface.nil? || surface.empty?

    info = lookup_with_pos(surface)
    token.merge(pos_tag: info[:pos_tag], lemma: info[:lemma] || surface)
  end
end
253
+
254
# Returns the flag => POS tag table currently stored in @flag_mapping.
+ def flag_mapping
255
+ @flag_mapping
256
+ end
257
+
258
# Replaces the flag => POS tag table.
#
# Cached lookup results were derived from the previous table, so the cache
# is emptied to keep later #tag calls consistent with the new mapping.
#
# @param mapping [Hash] flag => POS tag
def flag_mapping=(mapping)
  @flag_mapping = mapping
  @lookup_cache.clear
end
261
+
262
# Empties the per-word result cache held in @lookup_cache.
+ def clear_cache
263
+ @lookup_cache.clear
264
+ end
265
+
266
+ private
267
+
268
# Resolves POS tag and lemma for a word, caching the result per spelling.
#
# When the word is not found as written and equals its own capitalized
# form (and is longer than one character), the lower-case spelling is
# tried as well.
#
# @param word [String, nil]
# @return [Hash] +{ pos_tag:, lemma: }+, both nil when nothing is found
def lookup_with_pos(word)
  return { pos_tag: nil, lemma: nil } if word.nil? || word.empty?

  @lookup_cache.fetch(word) do
    form = @lookuper.good_forms(word).first
    form ||= @lookuper.good_forms(word.downcase).first if word == word.capitalize && word.length > 1
    @lookup_cache[word] = { pos_tag: derive_pos_tag(form), lemma: form&.stem }
  end
end
285
+
286
# Picks the POS tag for a lookup result: the first flag that #flag_to_pos
# can translate wins, otherwise the tag is guessed from the affix.
#
# @param result [Object, nil] lookup form responding to +flags+
# @return [String, nil]
def derive_pos_tag(result)
  return nil unless result

  mapped = nil
  (result.flags&.to_a || []).find { |flag| mapped = flag_to_pos(flag) }
  mapped || guess_pos_from_affix(result)
end
296
+
297
# Translates one flag through @flag_mapping: an entry for the whole flag is
# used when present, otherwise the entry for its first character.
#
# @param flag [String]
# @return [String, nil]
def flag_to_pos(flag)
  @flag_mapping.fetch(flag) { @flag_mapping[flag[0]] }
end
302
+
303
# Falls back to the result's suffix for a POS guess; nil when it has none.
#
# @param result [Object] lookup form responding to +suffix+
# @return [String, nil]
def guess_pos_from_affix(result)
  ending = result.suffix
  ending ? guess_pos_from_suffix(ending) : nil
end
308
+
309
# Guesses a POS tag from a German suffix; the patterns are tried in the
# order VERB, ADV, NOUN, ADJ and the first match wins.
#
# The patterns use \A and \z so the whole value has to be the suffix;
# ^ and $ would accept a multi-line value if any single line matched.
#
# NOTE(review): "lich" and "mäßig" are listed under both ADV and ADJ; ADV is
# tested first, so the ADJ entries for them never apply — confirm which tag
# is intended.
#
# @param suffix [String]
# @return [String, nil] nil when no pattern matches
def guess_pos_from_suffix(suffix)
  return 'VERB' if suffix.match?(/\A(en|eln|ern|ten|tet|t|is|ieren)\z/)
  return 'ADV' if suffix.match?(/\A(lich|weise|lings|maß|mäßig)\z/)
  return 'NOUN' if suffix.match?(/\A(ung|heit|keit|schaft|tion|ismus|tum|ling|ner|eur)\z/)
  return 'ADJ' if suffix.match?(/\A(isch|ig|lich|bar|sam|haft|los|mäßig)\z/)

  nil
end
317
+ end
318
+
319
# German grammar rules module.
module GrammarRules
  # Base class for German grammar rules.
  class Rule
    attr_reader :id, :name, :description

    # @param id [String] identifier reported as +:rule_id+ in errors
    # @param name [String]
    # @param description [String]
    def initialize(id, name, description)
      @id = id
      @name = name
      @description = description
    end

    # Subclasses return an array of error hashes for the token list.
    #
    # @raise [NotImplementedError] when the subclass does not override it
    def check(tokens)
      raise NotImplementedError, "#{self.class} must implement #check"
    end

    # Whether the rule applies at the given position; always true here.
    def applies?(tokens, index)
      true
    end
  end

  # Rule: German noun capitalization.
  #
  # A token is reported when it is at least three letters long, is not
  # already in capitalized form, and either ends in one of NOUN_SUFFIXES or
  # is written entirely in lower case directly after an article. Each token
  # is reported at most once; the suffix heuristic takes precedence.
  #
  # NOTE(review): NOUN_SUFFIXES also contains endings such as "ig", "lich",
  # "sam" and "los", which look like adjective endings; confirm the list.
  class NounCapitalizationRule < Rule
    # Word endings treated as a sign that the word is a noun
    NOUN_SUFFIXES = %w[ung heit keit schaft tion ismus tum ling ner eur
                       able ibil ig igkeit lich sam los losung].freeze

    # Articles after which a lower-case word is reported
    ARTICLES = %w[der die das ein eine einem einen einer eines].freeze

    # Whole-string match on German letters, either case
    LETTERS_ONLY = /\A[a-zäöüß]+\z/i.freeze

    def initialize
      super('DE_NOUN_CAPITALIZATION', 'Noun Capitalization', 'German nouns must be capitalized.')
    end

    # @param tokens [Array<Hash>] hashes with +:token+ and +:position+
    # @return [Array<Hash>] one error hash per offending token
    def check(tokens)
      tokens.each_with_index.each_with_object([]) do |(token, idx), errors|
        word = token[:token]
        next unless candidate?(word)

        error = suffix_error(token, word) || article_error(tokens, idx, token, word)
        errors << error if error
      end
    end

    private

    # Filters out tokens the rule never reports.
    def candidate?(word)
      return false if word.nil? || word.empty?
      return false if word == word.capitalize # Already capitalized
      return false if word.length < 3 # Too short

      word.match?(LETTERS_ONLY)
    end

    # Error for a word carrying a noun suffix, or nil.
    def suffix_error(token, word)
      return nil unless word.end_with?(*NOUN_SUFFIXES)

      build_error(token, word, "German nouns must be capitalized: '#{word}'", word)
    end

    # Error for an all-lower-case word directly after an article, or nil.
    def article_error(tokens, idx, token, word)
      return nil unless idx.positive? && word == word.downcase

      article = tokens[idx - 1][:token]&.downcase
      return nil unless ARTICLES.include?(article)

      build_error(token, word, "German nouns must be capitalized after articles: '#{word}'", "#{article} #{word}")
    end

    # Assembles the error hash shared by both heuristics.
    def build_error(token, word, message, context)
      {
        rule_id: @id,
        position: token[:position],
        message: message,
        suggestion: word.capitalize,
        context: context,
        suggestions: [word.capitalize]
      }
    end
  end

  # Rule: Compound word spacing (German compounds are written together).
  #
  # Reports every pair of adjacent tokens that both consist only of
  # lower-case German letters, suggesting the two joined together. The rule
  # has no dictionary, so the result is a hint rather than a confirmed error.
  class CompoundSpacingRule < Rule
    # Whole-string match on lower-case German letters
    LOWERCASE_WORD = /\A[a-zäöüß]+\z/.freeze

    def initialize
      super('DE_COMPOUND_SPACING', 'Compound Spacing', 'German compound words should not have spaces.')
    end

    # @param tokens [Array<Hash>] hashes with +:token+ and +:position+
    # @return [Array<Hash>] one error hash per adjacent lower-case pair,
    #   positioned at the first token of the pair
    def check(tokens)
      tokens.each_cons(2).each_with_object([]) do |(current, following), errors|
        word1 = current[:token]
        word2 = following[:token]
        next if word1.nil? || word2.nil?
        next unless word1.match?(LOWERCASE_WORD) && word2.match?(LOWERCASE_WORD)

        compound = word1 + word2
        errors << {
          rule_id: @id,
          position: current[:position],
          message: "Possible compound word: '#{word1} #{word2}' should be '#{compound}'",
          suggestion: compound,
          context: "#{word1} #{word2}",
          suggestions: [compound]
        }
      end
    end
  end

  # Rule registry for German.
  class RuleRegistry
    class << self
      # @return [Array<Rule>] fresh instances of every built-in rule
      def default_rules
        [NounCapitalizationRule.new, CompoundSpacingRule.new]
      end

      # @param id [String] rule identifier
      # @return [Rule, nil] a fresh instance of the matching rule
      def get_rule(id)
        default_rules.find { |rule| rule.id == id }
      end
    end
  end
end
438
+
439
# Registration: the bare language code followed by every regional variant.
%w[de de-DE de-AT de-CH de-BE de-IT de-LI de-LU].each { |language_code| register language_code }
448
+
449
# Hunspell affix/dictionary file pair per language code, for the regions
# that have their own files.
#
# NOTE(review): the paths are relative and point into spec/ fixtures —
# confirm these files are available where the library is installed.
HUNSPELL_DICTIONARIES = %w[DE AT CH].to_h do |region|
  base = "spec/integrational/fixtures/de_#{region}"
  ["de-#{region}", { aff: "#{base}.aff", dic: "#{base}.dic" }]
end.freeze
463
+
464
# Display name for each region code, used by #description.
VARIANT_NAMES = [
  %w[DE German],
  %w[AT Austrian],
  %w[CH Swiss],
  %w[BE Belgian],
  ['IT', 'South Tyrolean'],
  %w[LI Liechtenstein],
  %w[LU Luxembourgish]
].to_h.freeze
473
+
474
# Creates the language; when no variant is given it is taken from the
# region part of +code+, and the Hunspell file pair is resolved for +code+.
#
# @param code [String] language code such as "de" or "de-AT"
# @param name [String] display name
# @param variant [String, nil] region code; derived from +code+ when falsy
def initialize(code: "de", name: "German", variant: nil)
  super(code: code, name: name, variant: variant || extract_region_code(code))
  @hunspell_paths = resolve_hunspell_paths(code)
end
479
+
480
# Human-readable name: the plain name without a variant, otherwise the name
# followed by the variant's display name (or the raw variant code when it
# is not in VARIANT_NAMES) in parentheses.
#
# @return [String]
def description
  if variant
    "#{name} (#{VARIANT_NAMES.fetch(variant, variant)})"
  else
    name
  end
end
485
+
486
# Returns the tokenizer, creating it on first use and reusing it afterwards.
#
# @return [Tokenizer]
def tokenizer
  return @tokenizer if @tokenizer

  @tokenizer = Tokenizer.new
end
489
+
490
# Returns the normalizer, creating it on first use and reusing it afterwards.
#
# @return [Language::Normalizer::Base]
def normalizer
  return @normalizer if @normalizer

  @normalizer = Language::Normalizer::Base.new
end
493
+
494
# Returns the Dictionary::UnixWords class itself (not an instance).
+ def dictionary_class
495
+ Dictionary::UnixWords
496
+ end
497
+
498
# Word-list paths tried by default for the current language code.
#
# NOTE(review): every code not named below, including the bare "de", gets
# the generic /usr/share/dict/words list — confirm that is intended.
#
# @return [Array<String>] a single-element list
def default_dictionary_paths
  if %w[de-DE de-AT de-BE].include?(code)
    ["/usr/share/dict/german"]
  elsif code == "de-CH"
    ["/usr/share/dict/swiss-german"]
  else
    ["/usr/share/dict/words"]
  end
end
508
+
509
# Returns the script symbol for this language; always :latin.
+ def script_type
510
+ :latin
511
+ end
512
+
513
# Builds a new SpellChecker from the resolved Hunspell file pair.
#
# @return [SpellChecker] a fresh instance on every call
def create_spell_checker
  paths = @hunspell_paths
  SpellChecker.new(aff_path: paths[:aff], dic_path: paths[:dic], script: :latin)
end
520
+
521
# Builds a new Tokenizer on every call (unlike #tokenizer, which memoizes).
+ def create_tokenizer
522
+ Tokenizer.new
523
+ end
524
+
525
# Builds a new POSTagger from the resolved Hunspell file pair, using the
# tagger's built-in flag table.
#
# @return [POSTagger] a fresh instance on every call
def create_pos_tagger
  aff, dic = @hunspell_paths.values_at(:aff, :dic)
  POSTagger.new(aff_path: aff, dic_path: dic, script: :latin, flag_mapping: POSTagger::FLAG_TO_POS)
end
533
+
534
+ private
535
+
536
# Extracts the upper-cased part after the first hyphen of a language code.
#
# @param code [String] e.g. "de-at"
# @return [String, nil] e.g. "AT"; nil when the code contains no hyphen
def extract_region_code(code)
  _language, separator, region = code.partition("-")
  separator.empty? ? nil : region.upcase
end
540
+
541
# Looks up the Hunspell file pair for a language code, falling back to the
# 'de-DE' entry for codes without their own files.
#
# @param code [String]
# @return [Hash] entry of HUNSPELL_DICTIONARIES
def resolve_hunspell_paths(code)
  HUNSPELL_DICTIONARIES.fetch(code) { HUNSPELL_DICTIONARIES['de-DE'] }
end
544
+ end
545
+ end
546
+ end