kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../../readers/lookup_builder'
|
|
4
|
+
require_relative '../../components/spell_checker'
|
|
5
|
+
require_relative '../../components/pos_tagger'
|
|
6
|
+
require_relative '../../language/normalizer/base'
|
|
7
|
+
|
|
8
|
+
module Kotoshu
|
|
9
|
+
module Languages
|
|
10
|
+
# German language implementation.
|
|
11
|
+
#
|
|
12
|
+
# Supports multiple dialects: de-DE, de-AT, de-CH, de-BE, de-IT, de-LI, de-LU
|
|
13
|
+
#
|
|
14
|
+
# Full Hunspell integration with spell checking, POS tagging, and grammar rules
|
|
15
|
+
# specifically handling German compound words and capitalization.
|
|
16
|
+
class German < Language::Base
|
|
17
|
+
# German spell checker with Hunspell integration.
|
|
18
|
+
#
|
|
19
|
+
# Uses the Lookup algorithm with Hunspell-format dictionaries
|
|
20
|
+
# and handles German-specific features (umlauts, ß, compound words).
|
|
21
|
+
class SpellChecker < Components::SpellChecker
|
|
22
|
+
attr_reader :aff_path, :dic_path, :script
|
|
23
|
+
|
|
24
|
+
# German-specific character substitutions for suggestions
|
|
25
|
+
# Substitution table for suggestion generation: each key is the text written
# into a candidate, each value lists the spellings it is tried in place of.
GERMAN_SUBSTITUTIONS = {
  # Umlaut / eszett written for a plain or transliterated spelling
  'ä' => ['a', 'ae'],
  'ö' => ['o', 'oe'],
  'ü' => ['u', 'ue'],
  'ß' => ['ss', 'sz'],
  # Plain letter written for an umlaut / eszett
  'a' => ['ä'],
  'o' => ['ö'],
  'u' => ['ü'],
  's' => ['ß']
}.freeze
|
|
37
|
+
|
|
38
|
+
# Records the dictionary settings and builds the lookup object from them.
#
# @param aff_path path handed to Readers::LookupBuilder as the affix file
# @param dic_path path handed to Readers::LookupBuilder as the word list
# @param script [Symbol] script identifier forwarded to the builder
# @param encoding [String] encoding name forwarded to the builder
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8')
  @aff_path, @dic_path = aff_path, dic_path
  @script = script
  @encoding = encoding
  builder = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script)
  @lookuper = builder.build
end
|
|
45
|
+
|
|
46
|
+
# Looks the word up as written and, failing that, in lowercase.
#
# @param word [String, nil]
# @return [Hash] :found (Boolean), :stem (the form's stem, or the spelling
#   that matched when the form has none) and :flags (Array)
def check(word)
  miss = { found: false, stem: nil, flags: [] }
  return miss if word.nil? || word.empty?

  # The lowercase spelling is only a distinct candidate when the word
  # contains uppercase characters.
  [word, word.downcase].uniq.each do |candidate|
    form = @lookuper.good_forms(candidate).first
    next unless form

    return { found: true, stem: form.stem || candidate, flags: form.flags&.to_a || [] }
  end

  miss
end
|
|
62
|
+
|
|
63
|
+
# Returns ranked suggestions for a word that #check does not accept.
#
# @param word [String, nil]
# @param max_suggestions [Integer] upper bound on the number of results
# @return [Array<Hash>] suggestion hashes from #generate_suggestions; empty
#   for nil/empty input and for words that are already accepted
def suggest(word, max_suggestions: 10)
  return [] if word.nil? || word.empty?
  # Use the same acceptance test as #check (including its lowercase
  # fallback) so a word reported as correct never receives suggestions.
  return [] if correct?(word)

  generate_suggestions(word, max_suggestions).take(max_suggestions)
end
|
|
69
|
+
|
|
70
|
+
# Whether #check reports the word as found.
#
# @param word [String, nil]
# @return [Boolean]
def correct?(word)
  outcome = check(word)
  outcome[:found]
end

# The lookup object built in #initialize.
def lookuper
  @lookuper
end
|
|
77
|
+
|
|
78
|
+
private
|
|
79
|
+
|
|
80
|
+
# Levenshtein distance between two strings (insert, delete and substitute
# each cost 1), computed row by row.
#
# @param a [String]
# @param b [String]
# @return [Integer]
def calculate_distance(a, b)
  return a.length if b.empty?
  return b.length if a.empty?

  # previous[col] is the distance between the processed prefix of a and b[0, col].
  previous = (0..b.length).to_a
  a.each_char.with_index(1) do |left, row|
    current = [row]
    b.each_char.with_index(1) do |right, col|
      substitution = previous[col - 1] + (left == right ? 0 : 1)
      current << [previous[col] + 1, current[col - 1] + 1, substitution].min
    end
    previous = current
  end
  previous.last
end
|
|
93
|
+
|
|
94
|
+
# Scores a suggestion between 0.0 and 1.0: edit-distance similarity relative
# to the longer string, minus 0.05 per rank position, floored at 0.0.
#
# @param original [String]
# @param suggestion [String]
# @param rank [Integer] position of the suggestion in its list
# @return [Float]
def calculate_score(original, suggestion, rank)
  longest = [original.length, suggestion.length].max
  # Two empty strings are identical; dividing by their length would be 0/0.
  similarity =
    if longest.zero?
      1.0
    else
      1.0 - (calculate_distance(original, suggestion).to_f / longest)
    end
  [similarity - (rank * 0.05), 0.0].max
end
|
|
101
|
+
|
|
102
|
+
# Prefixes tried when a long word is split into "prefix remainder".
COMPOUND_PREFIXES = %w[
  Arbeits Baum Bau Bauern Berg Buch Dach Dollar Dorf Ein Frauen Feuer Finanz Flug Franz
  Frei Haupt Haus Hoch Jahr Jung Kinder Klein Konsum Land Lehr Leben Leute Mann Markt Mein Milli
  Morgen Mutter Natur Papier Polizei Post Problem Recht Rhein Rot Sache Schule Schiff Schritt
  See Sozial Stadt Stein Steuer Strom Tag Teil Tier Tor Tour Typ Uhr Umwelt Unter Volk
  Wasser Weg Welt Wein Zeit
].freeze

# Builds candidate corrections for a word, keeps those the lookup accepts and
# returns them ordered by edit distance.
#
# Candidates come from: GERMAN_SUBSTITUTIONS applied at every position,
# swapping ss/ß across the whole word, capitalizing an all-lowercase word,
# doubling each character after the first, deleting each character, and
# (for words longer than 10 characters) splitting off a known prefix.
#
# @param word [String]
# @param max_suggestions [Integer, nil] cap on the number of results; nil
#   returns all of them
# @return [Array<Hash>] hashes with :word, :distance and :score
def generate_suggestions(word, max_suggestions)
  candidates = substitution_candidates(word)

  # ß vs ss
  if word.include?('ss')
    candidates << word.gsub('ss', 'ß')
  elsif word.include?('ß')
    candidates << word.gsub('ß', 'ss')
  end

  # Capitalization (German nouns are capitalized)
  candidates << word.capitalize if word == word.downcase

  # Doubled letters: repeat every character except the first.
  (1...word.length).each { |i| candidates << word.dup.insert(i, word[i]) }

  # Deleted letters
  word.length.times do |i|
    shortened = word.dup
    shortened.slice!(i)
    candidates << shortened
  end

  # De-duplicate before hitting the dictionary so each spelling is looked up once.
  accepted = candidates.uniq.select do |candidate|
    !candidate.empty? && candidate != word && @lookuper.good_forms(candidate).first
  end
  accepted = (accepted + compound_split_candidates(word)).uniq

  ranked = accepted.each_with_index.map do |candidate, index|
    entry = {
      word: candidate,
      distance: calculate_distance(word, candidate),
      score: calculate_score(word, candidate, 0)
    }
    [entry, index]
  end
  # The generation index breaks ties so equal-distance results keep a fixed order.
  ranked = ranked.sort_by { |entry, index| [entry[:distance], index] }.map(&:first)
  max_suggestions ? ranked.first(max_suggestions) : ranked
end

# Applies every GERMAN_SUBSTITUTIONS entry at every position of the word,
# matching case-insensitively and handling multi-character spellings.
def substitution_candidates(word)
  results = []
  GERMAN_SUBSTITUTIONS.each do |replacement, spellings|
    spellings.each do |spelling|
      (0..word.length - spelling.length).each do |start|
        segment = word[start, spelling.length]
        next unless segment.downcase == spelling

        head = word[0, start]
        tail = word[(start + spelling.length)..]
        replacement_forms(segment, spelling, replacement).each do |form|
          results << "#{head}#{form}#{tail}"
        end
      end
    end
  end
  results
end

# Spellings of the replacement to try for a matched segment: the table's
# lowercase form, preceded by a capitalized form when the segment was not
# lowercase and capitalizing keeps the replacement the same length.
def replacement_forms(segment, spelling, replacement)
  return [replacement] if segment == spelling

  capitalized = replacement.capitalize
  capitalized.length == replacement.length ? [capitalized, replacement] : [replacement]
end

# "prefix remainder" splits of a long word where both parts pass the lookup.
def compound_split_candidates(word)
  return [] unless word.length > 10

  COMPOUND_PREFIXES.each_with_object([]) do |prefix, splits|
    next unless word.start_with?(prefix)

    remainder = word[prefix.length..]
    next unless @lookuper.good_forms(prefix).first && @lookuper.good_forms(remainder).first

    splits << "#{prefix} #{remainder}"
  end
end
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
# German tokenizer with special character handling.
|
|
178
|
+
class Tokenizer < Language::Tokenizer::GermanTokenizer
  # No overrides: all behaviour comes from Language::Tokenizer::GermanTokenizer.
end
|
|
180
|
+
|
|
181
|
+
# German POS tagger.
|
|
182
|
+
#
|
|
183
|
+
# Derives POS tags from Hunspell flags using German-specific mappings.
|
|
184
|
+
class POSTagger < Components::PosTagger
|
|
185
|
+
# German POS flag mappings based on Hunspell German dictionaries
|
|
186
|
+
# Default flag => POS tag mapping. Written grouped by tag and inverted into a
# flat flag-keyed hash.
FLAG_TO_POS = {
  'NOUN' => %w[N NN NNS Sub],
  'NOUN_PROPER' => %w[NNP],
  'VERB' => %w[V VB VBD VBG VBN VBP VBZ Vfin Vinf Vpp],
  'ADJ' => %w[A JJ JJR JJS Adj],
  'ADV' => %w[R RB RBR RBS Adv],
  'DET' => %w[D DT PDT Art],
  'PRON' => %w[P PP PRP WP Pro],
  'PRON_POSS' => %w[PRP$ WP$],
  'PREP' => %w[I IN Prä],
  'CONJ' => %w[C CC Kon],
  'PART' => %w[U RP Pt],
  'INTJ' => %w[INTJ UH Int],
  'NUM' => %w[CD Num],
  # Foreign words
  'X' => %w[FW],
  'PUNCT' => %w[PUNCT . , ! ? ; :]
}.each_with_object({}) { |(pos, flags), table| flags.each { |flag| table[flag] = pos } }.freeze
|
|
228
|
+
|
|
229
|
+
attr_reader :aff_path, :dic_path, :script
|
|
230
|
+
|
|
231
|
+
# Records the dictionary settings, builds the lookup object and starts with
# an empty per-word result cache.
#
# @param aff_path path handed to Readers::LookupBuilder as the affix file
# @param dic_path path handed to Readers::LookupBuilder as the word list
# @param script [Symbol] script identifier forwarded to the builder
# @param encoding [String] encoding name forwarded to the builder
# @param flag_mapping [Hash] flag => POS tag table, FLAG_TO_POS by default
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8', flag_mapping: FLAG_TO_POS)
  @aff_path, @dic_path = aff_path, dic_path
  @script = script
  @encoding = encoding
  @flag_mapping = flag_mapping
  @lookup_cache = {}
  builder = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script)
  @lookuper = builder.build
end
|
|
240
|
+
|
|
241
|
+
# Adds :pos_tag and :lemma to every token hash.
#
# @param tokens [Array<Hash>, nil] hashes carrying the surface form in :token
# @return [Array<Hash>] copies of the tokens with :pos_tag and :lemma merged
#   in; :lemma falls back to the surface form when the lookup has none, and
#   both are nil for tokens without text
def tag(tokens)
  return [] if tokens.nil? || tokens.empty?

  tokens.map do |token|
    surface = token[:token]
    next token.merge(pos_tag: nil, lemma: nil) if surface.nil? || surface.empty?

    info = lookup_with_pos(surface)
    token.merge(pos_tag: info[:pos_tag], lemma: info[:lemma] || surface)
  end
end
|
|
253
|
+
|
|
254
|
+
# The flag => POS tag table currently in use.
def flag_mapping
  @flag_mapping
end

# Replaces the flag => POS tag table. Results already stored in the lookup
# cache are left untouched.
def flag_mapping=(mapping)
  @flag_mapping = mapping
end

# Empties the per-word lookup cache.
def clear_cache
  @lookup_cache.clear
end
|
|
265
|
+
|
|
266
|
+
private
|
|
267
|
+
|
|
268
|
+
# Resolves a word to its POS tag and lemma, caching the result per word.
#
# @param word [String, nil]
# @return [Hash] :pos_tag and :lemma; both nil when the word is nil, empty
#   or unknown to the lookup
def lookup_with_pos(word)
  return { pos_tag: nil, lemma: nil } if word.nil? || word.empty?

  @lookup_cache.fetch(word) do
    form = @lookuper.good_forms(word).first
    # Retry in lowercase only for words of two or more characters written
    # with just the first letter capitalized.
    form ||= @lookuper.good_forms(word.downcase).first if word.length > 1 && word == word.capitalize
    @lookup_cache[word] = { pos_tag: derive_pos_tag(form), lemma: form&.stem }
  end
end
|
|
285
|
+
|
|
286
|
+
# Picks a POS tag for a lookup result: the first flag that maps to a tag
# wins, otherwise the tag is guessed from the result's suffix.
#
# @param result [#flags, #suffix, nil] a form returned by the lookup
# @return [String, nil]
def derive_pos_tag(result)
  return nil unless result

  mapped = nil
  (result.flags&.to_a || []).find { |flag| mapped = flag_to_pos(flag) }
  mapped || guess_pos_from_affix(result)
end
|
|
296
|
+
|
|
297
|
+
# Maps a flag to a POS tag: an exact entry in the mapping wins, otherwise the
# flag's first character is looked up.
#
# @param flag [String]
# @return [String, nil]
def flag_to_pos(flag)
  @flag_mapping.fetch(flag) { @flag_mapping[flag[0]] }
end
|
|
302
|
+
|
|
303
|
+
# Guesses a POS tag from the result's suffix; nil when it has no suffix.
#
# @param result [#suffix]
# @return [String, nil]
def guess_pos_from_affix(result)
  ending = result.suffix
  ending ? guess_pos_from_suffix(ending) : nil
end
|
|
308
|
+
|
|
309
|
+
# Guesses a POS tag from a suffix string. The whole string must equal one of
# the listed endings; patterns are tried in the order VERB, ADV, NOUN, ADJ.
#
# @param suffix [String]
# @return [String, nil] 'VERB', 'ADV', 'NOUN', 'ADJ', or nil when nothing matches
def guess_pos_from_suffix(suffix)
  # \A and \z anchor the whole string; ^ and $ would accept a match on any
  # single line of a multi-line value.
  return 'VERB' if suffix.match?(/\A(en|eln|ern|ten|tet|t|is|ieren)\z/)
  # NOTE(review): 'lich' and 'mäßig' are listed here and in the ADJ pattern;
  # because ADV is tested first they are never reported as ADJ — confirm intent.
  return 'ADV' if suffix.match?(/\A(lich|weise|lings|maß|mäßig)\z/)
  return 'NOUN' if suffix.match?(/\A(ung|heit|keit|schaft|tion|ismus|tum|ling|ner|eur)\z/)
  return 'ADJ' if suffix.match?(/\A(isch|ig|lich|bar|sam|haft|los|mäßig)\z/)

  nil
end
|
|
317
|
+
end
|
|
318
|
+
|
|
319
|
+
# German grammar rules module.
|
|
320
|
+
module GrammarRules
|
|
321
|
+
# Base class for German grammar rules.
|
|
322
|
+
class Rule
  attr_reader :id, :name, :description

  # @param id [String] identifier reported with every error of the rule
  # @param name [String] short display name
  # @param description [String] one-sentence explanation of the rule
  def initialize(id, name, description)
    @id, @name, @description = id, name, description
  end

  # Subclasses return their findings for the given tokens; the base
  # implementation always raises.
  #
  # @raise [NotImplementedError]
  def check(tokens)
    raise NotImplementedError, "#{self.class} must implement #check"
  end

  # Whether the rule is relevant at the given token index; true unless a
  # subclass overrides it.
  def applies?(tokens, index)
    true
  end
end
|
|
339
|
+
|
|
340
|
+
# Rule: German noun capitalization.
|
|
341
|
+
class NounCapitalizationRule < Rule
|
|
342
|
+
# Common German noun suffixes
|
|
343
|
+
# Word endings that make #check treat a lowercase word as a noun.
# NOTE(review): 'ig', 'lich', 'sam' and 'los' also appear in the POS tagger's
# ADJ suffix pattern — confirm they belong in a noun list.
NOUN_SUFFIXES = [
  'ung', 'heit', 'keit', 'schaft', 'tion', 'ismus', 'tum', 'ling', 'ner', 'eur',
  'able', 'ibil', 'ig', 'igkeit', 'lich', 'sam', 'los', 'losung'
].freeze
|
|
345
|
+
|
|
346
|
+
# Registers the rule's fixed id, name and description with the base class.
def initialize
  super(
    'DE_NOUN_CAPITALIZATION',
    'Noun Capitalization',
    'German nouns must be capitalized.'
  )
end
|
|
349
|
+
|
|
350
|
+
# Flags words that look like nouns but are not capitalized.
#
# A word of three or more letters is reported when it ends in one of
# NOUN_SUFFIXES, or when it is all lowercase and directly follows an article.
# Each token is reported at most once; the suffix heuristic takes precedence.
#
# @param tokens [Array<Hash>] token hashes with :token and :position
# @return [Array<Hash>] error hashes with :rule_id, :position, :message,
#   :suggestion, :context and :suggestions
def check(tokens)
  articles = %w[der die das ein eine einem einen einer eines]
  build_error = lambda do |token, word, message, context|
    {
      rule_id: @id,
      position: token[:position],
      message: message,
      suggestion: word.capitalize,
      context: context,
      suggestions: [word.capitalize]
    }
  end

  errors = []
  tokens.each_with_index do |token, idx|
    word = token[:token]
    next if word.nil? || word.empty?
    next if word == word.capitalize # Already capitalized
    next if word.length < 3 # Too short
    next unless word.match?(/\A[a-zäöüß]+\z/i) # Only letters, across the whole string

    previous = idx.positive? ? tokens[idx - 1][:token]&.downcase : nil

    if word.end_with?(*NOUN_SUFFIXES)
      errors << build_error.call(token, word, "German nouns must be capitalized: '#{word}'", word)
    elsif word == word.downcase && articles.include?(previous)
      # Position heuristic: a lowercase word right after an article.
      message = "German nouns must be capitalized after articles: '#{word}'"
      errors << build_error.call(token, word, message, "#{previous} #{word}")
    end
  end
  errors
end
|
|
390
|
+
end
|
|
391
|
+
|
|
392
|
+
# Rule: Compound word spacing (German compounds are written together).
|
|
393
|
+
class CompoundSpacingRule < Rule
|
|
394
|
+
# Registers the rule's fixed id, name and description with the base class.
def initialize
  super(
    'DE_COMPOUND_SPACING',
    'Compound Spacing',
    'German compound words should not have spaces.'
  )
end
|
|
397
|
+
|
|
398
|
+
# Reports every pair of adjacent all-lowercase tokens as a possible compound
# and suggests the two words joined together. No dictionary is consulted.
#
# @param tokens [Array<Hash>] token hashes with :token and :position
# @return [Array<Hash>] one error hash per reported pair, positioned at the
#   first token of the pair
def check(tokens)
  lowercase_word = /^[a-zäöüß]+$/

  tokens.each_cons(2).each_with_object([]) do |(left, right), errors|
    first = left[:token]
    second = right[:token]
    next if first.nil? || second.nil?
    next unless first.match?(lowercase_word) && second.match?(lowercase_word)

    joined = first + second
    errors << {
      rule_id: @id,
      position: left[:position],
      message: "Possible compound word: '#{first} #{second}' should be '#{joined}'",
      suggestion: joined,
      context: "#{first} #{second}",
      suggestions: [joined]
    }
  end
end
|
|
423
|
+
end
|
|
424
|
+
|
|
425
|
+
# Rule registry for German.
|
|
426
|
+
class RuleRegistry
  # Fresh instances of the built-in German rules, in their default order.
  #
  # @return [Array<Rule>]
  def self.default_rules
    [NounCapitalizationRule, CompoundSpacingRule].map(&:new)
  end

  # Finds a built-in rule by id.
  #
  # @param id [String]
  # @return [Rule, nil] a new instance of the matching rule, or nil
  def self.get_rule(id)
    default_rules.find { |rule| rule.id == id }
  end
end
|
|
437
|
+
end
|
|
438
|
+
|
|
439
|
+
# Registration
|
|
440
|
+
# Register the bare language code and every supported dialect code.
%w[de de-DE de-AT de-CH de-BE de-IT de-LI de-LU].each do |language_code|
  register language_code
end
|
|
448
|
+
|
|
449
|
+
# Affix (:aff) and word-list (:dic) file per dialect code.
# NOTE(review): these are relative paths under spec/, i.e. test fixtures —
# confirm they resolve when the library runs from an installed gem.
HUNSPELL_DICTIONARIES = {
  'de-DE' => { aff: 'spec/integrational/fixtures/de_DE.aff', dic: 'spec/integrational/fixtures/de_DE.dic' },
  'de-AT' => { aff: 'spec/integrational/fixtures/de_AT.aff', dic: 'spec/integrational/fixtures/de_AT.dic' },
  'de-CH' => { aff: 'spec/integrational/fixtures/de_CH.aff', dic: 'spec/integrational/fixtures/de_CH.dic' }
}.freeze
|
|
463
|
+
|
|
464
|
+
# Display names for region codes, keyed by the upper-case region part of the
# language code.
VARIANT_NAMES = {
  'DE' => 'German', 'AT' => 'Austrian', 'CH' => 'Swiss', 'BE' => 'Belgian',
  'IT' => 'South Tyrolean', 'LI' => 'Liechtenstein', 'LU' => 'Luxembourgish'
}.freeze
|
|
473
|
+
|
|
474
|
+
# Sets up the language and selects its Hunspell file pair.
#
# @param code [String] language code such as "de" or "de-AT"
# @param name [String] display name
# @param variant [String, nil] region code; taken from the part of +code+
#   after the hyphen when not given
def initialize(code: "de", name: "German", variant: nil)
  region = variant || extract_region_code(code)
  super(code: code, name: name, variant: region)
  @hunspell_paths = resolve_hunspell_paths(code)
end
|
|
479
|
+
|
|
480
|
+
# Display name, followed by the variant in parentheses when one is set.
# Region codes missing from VARIANT_NAMES are shown as-is.
#
# @return [String]
def description
  return name unless variant

  label = VARIANT_NAMES.fetch(variant, variant)
  "#{name} (#{label})"
end
|
|
485
|
+
|
|
486
|
+
# Memoized tokenizer instance shared by callers of this language object.
def tokenizer
  @tokenizer ||= Tokenizer.new
end

# Memoized normalizer; German uses the base normalizer class.
def normalizer
  @normalizer ||= Language::Normalizer::Base.new
end

# Dictionary backend class used for this language.
def dictionary_class
  Dictionary::UnixWords
end
|
|
497
|
+
|
|
498
|
+
# Word-list file to use for the language code: a German list for de-DE,
# de-AT and de-BE, a Swiss German list for de-CH, and the generic words file
# for every other code.
#
# @return [Array<String>] a single-element list of paths
def default_dictionary_paths
  case code
  when "de-DE", "de-AT", "de-BE" then ["/usr/share/dict/german"]
  when "de-CH" then ["/usr/share/dict/swiss-german"]
  else ["/usr/share/dict/words"]
  end
end

# Script identifier for German.
#
# @return [Symbol]
def script_type
  :latin
end
|
|
512
|
+
|
|
513
|
+
# Builds a new spell checker on this language's Hunspell file pair.
def create_spell_checker
  aff, dic = @hunspell_paths.values_at(:aff, :dic)
  SpellChecker.new(aff_path: aff, dic_path: dic, script: :latin)
end

# Builds a new, non-memoized tokenizer.
def create_tokenizer
  Tokenizer.new
end

# Builds a new POS tagger on this language's Hunspell file pair, using the
# tagger's default flag mapping.
def create_pos_tagger
  aff, dic = @hunspell_paths.values_at(:aff, :dic)
  POSTagger.new(aff_path: aff, dic_path: dic, script: :latin, flag_mapping: POSTagger::FLAG_TO_POS)
end
|
|
533
|
+
|
|
534
|
+
private
|
|
535
|
+
|
|
536
|
+
# Upper-cased part of a language code after its first hyphen.
#
# @param code [String] e.g. "de" or "de-AT"
# @return [String, nil] the region code, or nil when the code has no hyphen
#   or nothing follows it
def extract_region_code(code)
  _language, region = code.split("-", 2)
  # An empty region (code "de-") would be truthy and render as
  # "German ()" in #description, so treat it as absent.
  return nil if region.nil? || region.empty?

  region.upcase
end
|
|
540
|
+
|
|
541
|
+
# Hunspell file pair for a language code; codes without their own entry in
# HUNSPELL_DICTIONARIES use the 'de-DE' pair.
#
# @param code [String]
# @return [Hash] :aff and :dic paths
def resolve_hunspell_paths(code)
  HUNSPELL_DICTIONARIES.fetch(code) { HUNSPELL_DICTIONARIES['de-DE'] }
end
|
|
544
|
+
end
|
|
545
|
+
end
|
|
546
|
+
end
|