kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../../readers/lookup_builder'
|
|
4
|
+
require_relative '../../components/spell_checker'
|
|
5
|
+
require_relative '../../components/pos_tagger'
|
|
6
|
+
require_relative '../../language/normalizer/base'
|
|
7
|
+
|
|
8
|
+
module Kotoshu
|
|
9
|
+
module Languages
|
|
10
|
+
# French language implementation.
|
|
11
|
+
#
|
|
12
|
+
# Supports multiple dialects: fr-FR, fr-CA, fr-BE, fr-CH, fr-LU, fr-MC
|
|
13
|
+
#
|
|
14
|
+
# Full Hunspell integration with spell checking, POS tagging, and grammar rules.
|
|
15
|
+
class French < Language::Base
|
|
16
|
+
# French spell checker with Hunspell integration.
|
|
17
|
+
#
|
|
18
|
+
# Uses the Lookup algorithm with Hunspell-format dictionaries
|
|
19
|
+
# and French-specific character handling (accents, ligatures).
|
|
20
|
+
class SpellChecker < Components::SpellChecker
|
|
21
|
+
attr_reader :aff_path, :dic_path, :script
|
|
22
|
+
|
|
23
|
+
# French-specific character substitutions for suggestions
|
|
24
|
+
# Substitution table consulted when generating suggestions.
#
# Accented letters and ligatures map to their plain spelling. The last three
# entries go the other way: from a plain letter to accented forms that are
# often dropped when typing (for example "garcon" typed for "garçon").
FRENCH_SUBSTITUTIONS = {
  # Accented vowels -> base vowel
  'à' => ['a'],
  'â' => ['a'],
  'ä' => ['a'],
  'é' => ['e'],
  'è' => ['e'],
  'ê' => ['e'],
  'ë' => ['e'],
  'î' => ['i'],
  'ï' => ['i'],
  'ô' => ['o'],
  'ö' => ['o'],
  'ù' => ['u'],
  'û' => ['u'],
  'ü' => ['u'],
  # Cedilla and ligatures -> plain spelling
  'ç' => ['c'],
  'œ' => ['oe'],
  'æ' => ['ae'],
  # Plain letter -> accented forms commonly left out
  'c' => ['ç'],
  'e' => ['é', 'è', 'ê'],
  'a' => ['à']
}.freeze
|
|
47
|
+
|
|
48
|
+
# Builds a spell checker backed by a lookup object created from the given
# Hunspell affix and dictionary files.
#
# @param aff_path [String] path of the .aff file
# @param dic_path [String] path of the .dic file
# @param script [Symbol] script identifier forwarded to the lookup builder
# @param encoding [String] encoding forwarded to the lookup builder
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8')
  @aff_path, @dic_path = aff_path, dic_path
  @script, @encoding = script, encoding

  builder = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script)
  @lookuper = builder.build
end
|
|
55
|
+
|
|
56
|
+
# Looks a word up in the dictionary.
#
# @param word [String, nil] word to check
# @return [Hash] +{ found:, stem:, flags: }+; for nil, empty or unknown words
#   +found+ is false, +stem+ is nil and +flags+ is empty
def check(word)
  form = (word.nil? || word.empty?) ? nil : @lookuper.good_forms(word).first
  return { found: false, stem: nil, flags: [] } unless form

  # Fall back to the word itself when the matched form carries no stem.
  { found: true, stem: form.stem || word, flags: form.flags&.to_a || [] }
end
|
|
65
|
+
|
|
66
|
+
# Proposes corrections for a word the dictionary does not accept.
#
# @param word [String, nil] word to correct
# @param max_suggestions [Integer] upper bound on the number of results
# @return [Array<Hash>] suggestion hashes; empty for nil/empty input and for
#   words the dictionary already accepts
def suggest(word, max_suggestions: 10)
  nothing_to_do = word.nil? || word.empty? || @lookuper.good_forms(word).first
  return [] if nothing_to_do

  candidates = generate_suggestions(word, max_suggestions)
  candidates.take(max_suggestions)
end
|
|
72
|
+
|
|
73
|
+
# Tells whether the dictionary accepts the word.
#
# @param word [String, nil] word to check
# @return [Boolean] the +:found+ entry of {#check}'s result
def correct?(word)
  outcome = check(word)
  outcome[:found]
end
|
|
76
|
+
|
|
77
|
+
# Exposes the lookup object built in the constructor.
#
# @return [Object] the object stored in +@lookuper+
def lookuper
  @lookuper
end
|
|
80
|
+
|
|
81
|
+
private
|
|
82
|
+
|
|
83
|
+
# Levenshtein edit distance (insert, delete, substitute; each costs 1).
#
# @param a [String]
# @param b [String]
# @return [Integer] number of single-character edits turning +a+ into +b+
def calculate_distance(a, b)
  return a.length if b.empty?
  return b.length if a.empty?

  target = b.chars
  # previous[j] is the distance between the processed prefix of +a+ and b[0, j].
  previous = (0..target.length).to_a

  a.each_char.with_index(1) do |source_char, row|
    current = [row]
    target.each_with_index do |target_char, col|
      substitution = previous[col] + (source_char == target_char ? 0 : 1)
      current << [previous[col + 1] + 1, current[col] + 1, substitution].min
    end
    previous = current
  end

  previous.last
end
|
|
96
|
+
|
|
97
|
+
# Scores a suggestion between 0.0 and 1.0 (higher is better).
#
# The score is the normalised similarity (1 - distance / longer length)
# minus a penalty of 0.05 per rank position, floored at 0.0.
#
# @param original [String] the word as typed
# @param suggestion [String] the proposed replacement
# @param rank [Integer] position of the suggestion; 0 means no penalty
# @return [Float]
def calculate_score(original, suggestion, rank)
  longest = [original.length, suggestion.length].max
  # Two empty strings are identical; guard the 0/0 division, which would
  # otherwise produce NaN instead of a usable score.
  similarity = longest.zero? ? 1.0 : 1.0 - (calculate_distance(original, suggestion).to_f / longest)
  [similarity - (rank * 0.05), 0.0].max
end
|
|
104
|
+
|
|
105
|
+
def generate_suggestions(word, max_suggestions)
|
|
106
|
+
variations = []
|
|
107
|
+
|
|
108
|
+
# Missing accents
|
|
109
|
+
word.downcase.chars.each_with_index do |char, i|
|
|
110
|
+
FRENCH_SUBSTITUTIONS.each do |accented, unaccented_variants|
|
|
111
|
+
unaccented_variants.each do |variant|
|
|
112
|
+
if char == variant
|
|
113
|
+
unaccented_word = word.dup
|
|
114
|
+
unaccented_word[i] = accented
|
|
115
|
+
variations << unaccented_word if @lookuper.good_forms(unaccented_word).first
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# Doubled letters
|
|
122
|
+
word.chars.each_with_index do |char, i|
|
|
123
|
+
next if i == 0
|
|
124
|
+
doubled = word.dup
|
|
125
|
+
doubled.insert(i, char)
|
|
126
|
+
variations << doubled if @lookuper.good_forms(doubled).first
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
# Deleted letters
|
|
130
|
+
(0...word.length).each do |i|
|
|
131
|
+
deleted = word.dup
|
|
132
|
+
deleted.slice!(i)
|
|
133
|
+
next if deleted.empty?
|
|
134
|
+
variations << deleted if @lookuper.good_forms(deleted).first
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
# Common substitutions
|
|
138
|
+
word.chars.each_with_index do |char, i|
|
|
139
|
+
next unless FRENCH_SUBSTITUTIONS.key?(char.downcase)
|
|
140
|
+
FRENCH_SUBSTITUTIONS[char.downcase].each do |sub|
|
|
141
|
+
substituted = word.dup
|
|
142
|
+
substituted[i] = sub
|
|
143
|
+
variations << substituted if @lookuper.good_forms(substituted).first
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
variations.uniq!
|
|
148
|
+
variations.map do |suggestion|
|
|
149
|
+
{ word: suggestion, distance: calculate_distance(word, suggestion), score: calculate_score(word, suggestion, 0) }
|
|
150
|
+
end.sort_by { |s| s[:distance] }
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
# French tokenizer with contraction handling.
|
|
155
|
+
# French tokenizer; adds nothing of its own to
# Language::Tokenizer::FrenchTokenizer.
class Tokenizer < Language::Tokenizer::FrenchTokenizer; end
|
|
157
|
+
|
|
158
|
+
# French POS tagger.
|
|
159
|
+
#
|
|
160
|
+
# Derives POS tags from Hunspell flags using French-specific mappings.
|
|
161
|
+
class POSTagger < Components::PosTagger
|
|
162
|
+
# French POS flag mappings based on Hunspell French dictionaries
|
|
163
|
+
# Default mapping from dictionary flags to part-of-speech tags.
#
# NOTE(review): most multi-letter keys look like Penn Treebank tags rather
# than flags from a French .aff file — confirm against the dictionaries used.
FLAG_TO_POS = {
  # Nouns
  'N' => 'NOUN',
  'NN' => 'NOUN',
  'NNS' => 'NOUN',
  'NNP' => 'NOUN_PROPER',
  # Verbs
  'V' => 'VERB',
  'VB' => 'VERB',
  'VBD' => 'VERB',
  'VBG' => 'VERB',
  'VBN' => 'VERB',
  'VBP' => 'VERB',
  'VBZ' => 'VERB',
  # Adjectives
  'A' => 'ADJ',
  'JJ' => 'ADJ',
  'JJR' => 'ADJ',
  'JJS' => 'ADJ',
  # Adverbs
  'R' => 'ADV',
  'RB' => 'ADV',
  'RBR' => 'ADV',
  'RBS' => 'ADV',
  # Determiners
  'D' => 'DET',
  'DT' => 'DET',
  'PDT' => 'DET',
  # Pronouns
  'P' => 'PRON',
  'PP' => 'PRON',
  'PRP' => 'PRON',
  'PRP$' => 'PRON_POSS',
  'WP' => 'PRON',
  'WP$' => 'PRON_POSS',
  # Prepositions
  'I' => 'PREP',
  'IN' => 'PREP',
  # Conjunctions
  'C' => 'CONJ',
  'CC' => 'CONJ',
  # Particles
  'U' => 'PART',
  'RP' => 'PART',
  # Interjections
  'INTJ' => 'INTJ',
  'UH' => 'INTJ',
  # Numbers
  'CD' => 'NUM',
  # Foreign words
  'FW' => 'X',
  # Punctuation
  'PUNCT' => 'PUNCT',
  '.' => 'PUNCT',
  ',' => 'PUNCT',
  '!' => 'PUNCT',
  '?' => 'PUNCT',
  ';' => 'PUNCT',
  ':' => 'PUNCT'
}.freeze
|
|
194
|
+
|
|
195
|
+
attr_reader :aff_path, :dic_path, :script
|
|
196
|
+
|
|
197
|
+
# Builds a POS tagger backed by a lookup object created from the given
# Hunspell files, with an empty per-word result cache.
#
# @param aff_path [String] path of the .aff file
# @param dic_path [String] path of the .dic file
# @param script [Symbol] script identifier forwarded to the lookup builder
# @param encoding [String] encoding forwarded to the lookup builder
# @param flag_mapping [Hash] flag => POS tag table used when deriving tags
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8', flag_mapping: FLAG_TO_POS)
  @aff_path, @dic_path = aff_path, dic_path
  @script, @encoding = script, encoding
  @flag_mapping = flag_mapping

  builder = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script)
  @lookuper = builder.build
  @lookup_cache = {}
end
|
|
206
|
+
|
|
207
|
+
# Annotates token hashes with a POS tag and a lemma.
#
# @param tokens [Array<Hash>, nil] hashes whose +:token+ entry is the word
# @return [Array<Hash>] copies of the tokens with +:pos_tag+ and +:lemma+
#   merged in; the lemma falls back to the word itself, and tokens with a nil
#   or empty word get nil for both
def tag(tokens)
  return [] if tokens.nil? || tokens.empty?

  tokens.map do |token|
    surface = token[:token]
    next token.merge(pos_tag: nil, lemma: nil) if surface.nil? || surface.empty?

    info = lookup_with_pos(surface)
    token.merge(pos_tag: info[:pos_tag], lemma: info[:lemma] || surface)
  end
end
|
|
219
|
+
|
|
220
|
+
# @return [Hash] the flag => POS tag table currently in use
def flag_mapping
  @flag_mapping
end

# Replaces the flag => POS tag table.
#
# Cached lookup results hold tags derived from the previous table, so the
# cache is emptied here; otherwise words tagged before the change would keep
# their old tags.
#
# @param mapping [Hash] new flag => POS tag table
def flag_mapping=(mapping)
  @lookup_cache&.clear
  @flag_mapping = mapping
end
|
|
227
|
+
|
|
228
|
+
# Empties the per-word lookup cache.
#
# @return [Hash] the (now empty) cache hash
def clear_cache
  @lookup_cache.clear
end
|
|
231
|
+
|
|
232
|
+
private
|
|
233
|
+
|
|
234
|
+
# Resolves the POS tag and lemma of a word, caching the result per word.
#
# @param word [String, nil]
# @return [Hash] +{ pos_tag:, lemma: }+; both nil for nil/empty input (not
#   cached) and for words the lookup does not know (cached)
def lookup_with_pos(word)
  return { pos_tag: nil, lemma: nil } if word.nil? || word.empty?

  @lookup_cache.fetch(word) do
    form = @lookuper.good_forms(word).first
    @lookup_cache[word] = { pos_tag: derive_pos_tag(form), lemma: form&.stem }
  end
end
|
|
243
|
+
|
|
244
|
+
# Picks a POS tag for a lookup result.
#
# The first flag that maps to a tag wins; when no flag maps (or there are no
# flags) the tag is guessed from the result's suffix.
#
# @param result [Object, nil] lookup result responding to +flags+ and +suffix+
# @return [String, nil]
def derive_pos_tag(result)
  return nil unless result

  mapped = nil
  (result.flags&.to_a || []).find { |flag| mapped = flag_to_pos(flag) }
  mapped || guess_pos_from_affix(result)
end
|
|
254
|
+
|
|
255
|
+
# Maps one flag to a POS tag.
#
# An exact match in the mapping wins; otherwise the flag's first character is
# looked up instead.
#
# @param flag [String]
# @return [String, nil]
def flag_to_pos(flag)
  @flag_mapping.fetch(flag) { @flag_mapping[flag[0]] }
end
|
|
260
|
+
|
|
261
|
+
# Guesses a POS tag from the suffix carried by a lookup result.
#
# @param result [Object] lookup result responding to +suffix+
# @return [String, nil] nil when the result has no suffix or it is not recognised
def guess_pos_from_affix(result)
  ending = result.suffix
  ending ? guess_pos_from_suffix(ending) : nil
end
|
|
266
|
+
|
|
267
|
+
# Guesses a POS tag from a French suffix.
#
# Checks run in order: verb endings, then anything ending in "ment" (tagged
# ADV), then noun endings, then adjective endings. Because the "ment" check
# comes first, "ment" is never tagged NOUN here, so it is not listed among
# the noun endings.
#
# NOTE(review): "-ment" also forms nouns in French (e.g. "mouvement");
# the ADV-first order is kept as it was — confirm it is the intended choice.
#
# @param suffix [String]
# @return [String, nil] 'VERB', 'ADV', 'NOUN', 'ADJ', or nil when unrecognised
def guess_pos_from_suffix(suffix)
  # \A and \z anchor the whole string; ^ and $ would also match a single line
  # inside a multi-line value.
  return 'VERB' if suffix.match?(/\A(?:er|ir|re|is|it|issent|issons|issez)\z/)
  return 'ADV' if suffix.end_with?('ment')
  # "ée" is accepted both precomposed (U+00E9) and as "e" + combining acute.
  return 'NOUN' if suffix.match?(/\A(?:tion|sion|age|ure|\u00E9e|e\u0301e)\z/)
  return 'ADJ' if suffix.match?(/\A(?:if|ive|eux|euse|able|ible)\z/)

  nil
end
|
|
275
|
+
end
|
|
276
|
+
|
|
277
|
+
# French grammar rules module.
|
|
278
|
+
# French grammar rules.
#
# Each rule takes an array of token hashes and returns an array of error
# hashes with the keys :rule_id, :position, :message, :suggestion, :context
# and :suggestions. The token keys read here are :token, :pos_tag and
# :position.
module GrammarRules
  # Abstract base class for French grammar rules; subclasses implement #check.
  class Rule
    attr_reader :id, :name, :description

    # @param id [String] stable rule identifier
    # @param name [String] short display name
    # @param description [String] one-sentence explanation of the rule
    def initialize(id, name, description)
      @id = id
      @name = name
      @description = description
    end

    # @param tokens [Array<Hash>]
    # @return [Array<Hash>] errors found
    # @raise [NotImplementedError] always in the base class
    def check(tokens)
      raise NotImplementedError, "#{self.class} must implement #check"
    end

    # Hook for restricting where a rule applies; the base class always agrees.
    #
    # @return [Boolean]
    def applies?(tokens, index)
      true
    end
  end

  # Rule: article agreement with gender/number.
  #
  # Heuristic only: a masculine singular article tagged 'DET' followed by a
  # token ending in "e" is reported as a possible gender mismatch. No
  # dictionary gender information is consulted.
  class ArticleAgreementRule < Rule
    MASCULINE_SINGULAR = %w[le un].freeze
    FEMININE_SINGULAR = %w[la une].freeze
    PLURAL = %w[les des].freeze

    def initialize
      super('FR_ARTICLE_AGREEMENT', 'Article Agreement', 'Articles must agree with noun gender and number.')
    end

    # @param tokens [Array<Hash>, nil]
    # @return [Array<Hash>] one error per suspicious article/noun pair
    def check(tokens)
      return [] if tokens.nil? || tokens.empty?

      tokens.each_cons(2).each_with_object([]) do |(article_token, noun_token), errors|
        article = article_token[:token]&.downcase
        # Only masculine singular articles can trigger the heuristic below.
        next unless MASCULINE_SINGULAR.include?(article) && article_token[:pos_tag] == 'DET'

        noun = noun_token[:token]
        next unless noun&.end_with?('e')

        errors << {
          rule_id: @id,
          position: article_token[:position],
          message: "Article agreement: check if '#{noun}' is feminine",
          suggestion: nil,
          context: "#{article} #{noun}",
          suggestions: feminine_suggestions(article)
        }
      end
    end

    private

    # Feminine articles with the counterpart of +article+ first
    # ("le" -> "la", "un" -> "une"), followed by the other one.
    def feminine_suggestions(article)
      preferred = FEMININE_SINGULAR[MASCULINE_SINGULAR.index(article)]
      [preferred, *(FEMININE_SINGULAR - [preferred])]
    end
  end

  # Rule: two-part negation in French (ne ... pas and its variants).
  #
  # Reports a "ne"/"n'" that is not followed, within the next LOOKAHEAD
  # tokens, by a second negation element.
  class FrenchNegationRule < Rule
    NEGATION_PARTICLES = %w[ne n'].freeze
    # Second elements that complete a negation; besides "pas" this covers the
    # other standard correlatives so that e.g. "ne ... que" is not reported.
    SECOND_PARTICLES = %w[pas plus jamais rien personne guère point aucun aucune nul nulle ni que].freeze
    # Number of tokens after the particle that are searched.
    LOOKAHEAD = 4

    def initialize
      super('FR_NEGATION', 'French Negation', 'French uses double negation (ne...pas).')
    end

    # @param tokens [Array<Hash>, nil]
    # @return [Array<Hash>] one error per unmatched negation particle
    def check(tokens)
      return [] if tokens.nil? || tokens.empty?

      tokens.each_with_index.with_object([]) do |(token, idx), errors|
        word = token[:token]&.downcase
        next unless NEGATION_PARTICLES.include?(word)

        window = tokens[idx + 1, LOOKAHEAD] || []
        next if window.any? { |following| SECOND_PARTICLES.include?(following[:token]&.downcase) }

        errors << {
          rule_id: @id,
          position: token[:position],
          message: "Incomplete negation: French requires double negation (ne...pas)",
          suggestion: 'Add pas or another negation particle',
          context: word,
          suggestions: ['ne...pas']
        }
      end
    end
  end

  # Rule registry for French.
  class RuleRegistry
    class << self
      # @return [Array<Rule>] a fresh instance of every built-in rule
      def default_rules
        [ArticleAgreementRule.new, FrenchNegationRule.new]
      end

      # @param id [String] rule identifier
      # @return [Rule, nil] a fresh instance of the matching rule, if any
      def get_rule(id)
        default_rules.find { |rule| rule.id == id }
      end
    end
  end
end
|
|
391
|
+
|
|
392
|
+
# Registration
|
|
393
|
+
# Register this class under the bare language code and every supported
# regional code, in the same order as before.
%w[fr fr-FR fr-CA fr-BE fr-CH fr-LU fr-MC].each { |language_code| register language_code }
|
|
400
|
+
|
|
401
|
+
# Hunspell affix/dictionary file pair for each regional code that has one.
#
# NOTE(review): these are relative paths under spec/, so they resolve against
# the process working directory; confirm the fixtures are available wherever
# this class is used outside the test suite.
HUNSPELL_DICTIONARIES = {
  'fr-FR' => { aff: 'spec/integrational/fixtures/fr_FR.aff', dic: 'spec/integrational/fixtures/fr_FR.dic' },
  'fr-CA' => { aff: 'spec/integrational/fixtures/fr_CA.aff', dic: 'spec/integrational/fixtures/fr_CA.dic' }
}.freeze
|
|
411
|
+
|
|
412
|
+
# Display names for the region part of a language code, used by #description.
VARIANT_NAMES = {
  'FR' => 'France', 'CA' => 'Canadian', 'BE' => 'Belgian',
  'CH' => 'Swiss', 'LU' => 'Luxembourgish', 'MC' => 'Monégasque'
}.freeze
|
|
420
|
+
|
|
421
|
+
# @param code [String] language code such as "fr" or "fr-CA"
# @param name [String] display name of the language
# @param variant [String, nil] region code; derived from +code+ when omitted
def initialize(code: "fr", name: "French", variant: nil)
  super(code: code, name: name, variant: variant || extract_region_code(code))
  # Codes without their own entry fall back to the fr-FR files.
  @hunspell_paths = resolve_hunspell_paths(code)
end
|
|
426
|
+
|
|
427
|
+
# Human-readable label: the language name, followed by the region in
# parentheses when a variant is set. Regions missing from VARIANT_NAMES are
# shown as their raw code.
#
# @return [String]
def description
  region = variant
  return name unless region

  "#{name} (#{VARIANT_NAMES[region] || region})"
end
|
|
432
|
+
|
|
433
|
+
# @return [Tokenizer] tokenizer instance, created on first use and then reused
def tokenizer
  return @tokenizer if @tokenizer

  @tokenizer = Tokenizer.new
end
|
|
436
|
+
|
|
437
|
+
# @return [Language::Normalizer::Base] normalizer instance, created on first
#   use and then reused
def normalizer
  return @normalizer if @normalizer

  @normalizer = Language::Normalizer::Base.new
end
|
|
440
|
+
|
|
441
|
+
# @return [Class] the dictionary backend class for this language
#   (Dictionary::UnixWords)
def dictionary_class
  Dictionary::UnixWords
end
|
|
444
|
+
|
|
445
|
+
# Word-list locations tried by default for this language code.
#
# @return [Array<String>] a single path: the French list for "fr-FR", the
#   Canadian French list for "fr-CA", and the generic system list otherwise
def default_dictionary_paths
  regional = {
    "fr-FR" => "/usr/share/dict/french",
    "fr-CA" => "/usr/share/dict/french-CA"
  }
  [regional.fetch(code, "/usr/share/dict/words")]
end
|
|
455
|
+
|
|
456
|
+
# @return [Symbol] the writing system of this language, always +:latin+
def script_type
  :latin
end
|
|
459
|
+
|
|
460
|
+
# Builds a new spell checker from the Hunspell files resolved for this
# language code.
#
# @return [SpellChecker]
def create_spell_checker
  aff, dic = @hunspell_paths.values_at(:aff, :dic)
  SpellChecker.new(aff_path: aff, dic_path: dic, script: :latin)
end
|
|
467
|
+
|
|
468
|
+
# @return [Tokenizer] a new tokenizer on every call (unlike #tokenizer, which
#   reuses one instance)
def create_tokenizer
  Tokenizer.new
end
|
|
471
|
+
|
|
472
|
+
# Builds a new POS tagger from the Hunspell files resolved for this language
# code, using the default flag mapping.
#
# @return [POSTagger]
def create_pos_tagger
  options = {
    aff_path: @hunspell_paths[:aff],
    dic_path: @hunspell_paths[:dic],
    script: :latin,
    flag_mapping: POSTagger::FLAG_TO_POS
  }
  POSTagger.new(**options)
end
|
|
480
|
+
|
|
481
|
+
private
|
|
482
|
+
|
|
483
|
+
# Extracts the upper-cased region part of a language code.
#
# @param code [String] language code such as "fr" or "fr-ca"
# @return [String, nil] everything after the first hyphen, upper-cased; nil
#   when there is no hyphen or nothing follows it (a trailing hyphen used to
#   produce an empty, truthy region)
def extract_region_code(code)
  _language, region = code.split("-", 2)
  return nil if region.nil? || region.empty?

  region.upcase
end
|
|
487
|
+
|
|
488
|
+
# Picks the Hunspell file pair for a language code.
#
# @param code [String] language code, matched exactly against the table keys
# @return [Hash] +{ aff:, dic: }+; the fr-FR entry when +code+ has none
def resolve_hunspell_paths(code)
  HUNSPELL_DICTIONARIES.fetch(code) { HUNSPELL_DICTIONARIES['fr-FR'] }
end
|
|
491
|
+
end
|
|
492
|
+
end
|
|
493
|
+
end
|