kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'aff_data'
|
|
4
|
+
require_relative 'file_reader'
|
|
5
|
+
|
|
6
|
+
module Kotoshu
|
|
7
|
+
module Readers
|
|
8
|
+
# AFF file reader for Hunspell affix files.
|
|
9
|
+
#
|
|
10
|
+
# This class reads .aff files and creates an Aff data structure.
|
|
11
|
+
#
|
|
12
|
+
# @example Reading an aff file
|
|
13
|
+
# reader = AffReader.new('en_US.aff')
|
|
14
|
+
# aff = reader.read
|
|
15
|
+
class AffReader
|
|
16
|
+
# Directives that are single boolean flags
|
|
17
|
+
BOOLEAN_DIRECTIVES = %w[
  COMPLEXPREFIXES
  FULLSTRIP
  NOSPLITSUGS
  CHECKSHARPS
  CHECKCOMPOUNDCASE
  CHECKCOMPOUNDDUP
  CHECKCOMPOUNDREP
  CHECKCOMPOUNDTRIPLE
  SIMPLIFIEDTRIPLE
  ONLYMAXDIFF
  COMPOUNDMORESUFFIXES
].freeze

# Directives whose value is the first token on the line, kept as a string
STRING_DIRECTIVES = %w[
  SET
  FLAG
  KEY
  TRY
  WORDCHARS
  LANG
].freeze

# Directives whose first token is converted with to_i
INTEGER_DIRECTIVES = %w[
  MAXDIFF
  MAXNGRAMSUGS
  MAXCPDSUGS
  COMPOUNDMIN
  COMPOUNDWORDMAX
].freeze

# Directives whose first token names exactly one flag
FLAG_DIRECTIVES = %w[
  NOSUGGEST
  KEEPCASE
  CIRCUMFIX
  NEEDAFFIX
  FORBIDDENWORD
  WARN
  COMPOUNDFLAG
  COMPOUNDBEGIN
  COMPOUNDMIDDLE
  COMPOUNDEND
  ONLYINCOMPOUND
  COMPOUNDPERMITFLAG
  COMPOUNDFORBIDFLAG
  FORCEUCASE
  SUBSTANDARD
  SYLLABLENUM
  COMPOUNDROOT
].freeze

# Legacy directive names mapped to the name they are stored under
SYNONYMS = { 'PSEUDOROOT' => 'NEEDAFFIX', 'COMPOUNDLAST' => 'COMPOUNDEND' }.freeze
|
|
42
|
+
|
|
43
|
+
attr_reader :path, :encoding, :flag_format
|
|
44
|
+
|
|
45
|
+
# Create a new AFF reader.
|
|
46
|
+
#
|
|
47
|
+
# @param path [String] Path to the .aff file
|
|
48
|
+
# @param encoding [String] File encoding (default: 'UTF-8');
|
|
49
|
+
# overridden by the file's SET directive when present
|
|
50
|
+
def initialize(path, encoding: 'UTF-8')
  @path = path
  @flag_format = 'short'
  @flag_synonyms = {}

  # An encoding announced by the file itself takes precedence; the keyword
  # argument is only the fallback when nothing could be detected.
  sniffed = detect_encoding(path)
  @encoding = sniffed || encoding
end
|
|
56
|
+
|
|
57
|
+
# Read the aff file and return the aff data structure.
|
|
58
|
+
#
|
|
59
|
+
# @param source [FileReader, nil] Optional file reader to use instead of creating a new one
|
|
60
|
+
# @return [Hash] The aff data structure
|
|
61
|
+
def read(source = nil)
  # Walks the reader line by line and collects every directive that
  # read_directive recognises into a Hash keyed by directive name.
  # 'SFX' and 'PFX' hold a nested Hash (flag => list of affixes); every other
  # key holds the single parsed value, with later occurrences overwriting
  # earlier ones.
  reader = source || FileReader.new(@path, @encoding)

  # Start each pass from the defaults so that calling read twice on the same
  # instance is not influenced by state left behind by the previous pass.
  @flag_format = 'short'
  @flag_synonyms = {}

  data = { 'SFX' => {}, 'PFX' => {}, 'FLAG' => 'short' }

  reader.each do |_line_no, line|
    # nil (line is not a usable directive) destructures to two nils.
    directive, value = read_directive(reader, line)
    next unless directive

    case directive
    when 'FLAG'
      # Switch formats right away so flag fields on later lines are split
      # according to the new format.
      @flag_format = value
      data[directive] = value
    when 'SFX', 'PFX'
      # value is the list of affixes that share one flag. A header that
      # announces zero rules (or whose rule lines were all skipped) yields an
      # empty list; there is no affix to take the key from, so it is skipped
      # instead of raising NoMethodError on nil.
      data[directive][value.first.flag] = value unless value.empty?
    when 'AF'
      data[directive] = value
      # Aliases take effect only for lines after the AF table.
      @flag_synonyms = value
    else
      data[directive] = value
    end

    # The reader's encoding is deliberately left untouched while iterating;
    # the encoding passed to FileReader above is the one used for the whole
    # file.
  end

  data
end
|
|
105
|
+
|
|
106
|
+
private
|
|
107
|
+
|
|
108
|
+
# Read a directive from a line.
|
|
109
|
+
#
|
|
110
|
+
# @param reader [FileReader] The file reader
|
|
111
|
+
# @param line [String] The line to parse
|
|
112
|
+
# @return [Array, nil] [directive, value] or nil
|
|
113
|
+
def read_directive(reader, line)
  # First token is the candidate directive name, the rest are its arguments.
  keyword, *arguments = line.split(/\s+/)
  return nil if keyword.nil?

  # Only tokens made purely of capital letters are treated as directives;
  # this also rejects comment lines and lines with leading whitespace
  # (whose first token is the empty string).
  return nil unless /^[A-Z]+$/.match?(keyword)

  # Outdated names are stored under their current name.
  canonical = SYNONYMS.fetch(keyword, keyword)

  parsed = read_value(reader, canonical, arguments)
  parsed.nil? ? nil : [canonical, parsed]
end
|
|
131
|
+
|
|
132
|
+
# Read the value for a directive.
|
|
133
|
+
#
|
|
134
|
+
# @param reader [FileReader] The file reader
|
|
135
|
+
# @param directive [String] The directive name
|
|
136
|
+
# @param values [Array<String>] Values from the line
|
|
137
|
+
# @return [Object] The parsed value
|
|
138
|
+
def read_value(reader, directive, values)
  # Dispatches on the directive name. Single-token directives are answered
  # from the header line alone; table directives consume the following rows
  # from the reader. Returns nil for directives that are not supported.
  value = values.first

  return value if STRING_DIRECTIVES.include?(directive)
  return value&.to_i if INTEGER_DIRECTIVES.include?(directive)
  return parse_flag(value) if FLAG_DIRECTIVES.include?(directive)
  return true if BOOLEAN_DIRECTIVES.include?(directive)

  case directive
  when 'IGNORE'
    Ignore.new(value || '')
  when 'BREAK'
    read_table(reader, value).map { |parts| BreakPattern.new(parts.first) }
  when 'COMPOUNDRULE'
    read_table(reader, value).map { |parts| CompoundRule.new(parts.first) }
  when 'ICONV', 'OCONV'
    ConvTable.new(read_table(reader, value).map { |parts| [parts[0], parts[1] || ''] })
  when 'REP'
    read_table(reader, value).map { |parts| RepPattern.new(parts[0], parts[1] || '') }
  when 'MAP'
    read_table(reader, value).map { |parts| split_map_entry(parts.first || '') }
  when 'SFX', 'PFX'
    read_affix_group(reader, directive, values)
  when 'CHECKCOMPOUNDPATTERN'
    read_table(reader, value).map { |parts| CompoundPattern.new(parts[0], parts[1] || '', parts[2]) }
  when 'AF'
    # Alias rows are flag lists, so they are split with the active FLAG
    # format (previously always per character, which breaks "long" and
    # "num" dictionaries such as "AF 101,102").
    read_numbered_table(reader, value) { |parts| parse_flags(parts.first).to_set }
  when 'AM'
    read_numbered_table(reader, value, &:to_set)
  when 'COMPOUNDSYLLABLE'
    [value&.to_i, values[1]]
  when 'PHONE'
    PhonetTable.new(read_table(reader, value).map { |parts| [parts[0], parts[1] || '_'] })
  end
end

# Read the rows of a table whose header carries the row count.
#
# @param reader [FileReader] The file reader
# @param count_field [String, nil] Row count token from the header line
# @return [Array<Array<String>>] Rows as returned by read_array
def read_table(reader, count_field)
  read_array(reader, count_field&.to_i || 0)
end

# Read a table and key each retained row by its 1-based position (as a String).
#
# @param reader [FileReader] The file reader
# @param count_field [String, nil] Row count token from the header line
# @yieldparam parts [Array<String>] Fields of one row
# @return [Hash{String => Object}] Position => block result
def read_numbered_table(reader, count_field)
  numbered = {}
  read_table(reader, count_field).each_with_index do |parts, index|
    numbered[(index + 1).to_s] = yield(parts)
  end
  numbered
end

# Split one MAP entry into its members: "ß(ss)" => ["ß", "ss"].
#
# @param chars [String] The raw MAP entry
# @return [Array<String>] Single characters and unwrapped parenthesised groups
def split_map_entry(chars)
  chars.scan(/(\([^()]+\)|[^()])/).flatten.map do |group|
    group.start_with?('(') && group.end_with?(')') ? group[1..-2] : group
  end
end

# Read all rules of one SFX/PFX group.
#
# @param reader [FileReader] The file reader
# @param directive [String] 'SFX' or 'PFX'
# @param values [Array<String>] Header fields: flag, cross-product marker, rule count
# @return [Array<Affix>] One Affix per retained rule row
def read_affix_group(reader, directive, values)
  flag = values[0]
  crossproduct = values[1] == 'Y'
  count = values[2]&.to_i || 0
  type = directive == 'PFX' ? :prefix : :suffix

  read_array(reader, count).map do |parts|
    # read_array drops the directive name, so parts[0] repeats the flag and
    # the rule itself is parts[1]=strip, parts[2]=add[/flags], parts[3]=condition.
    strip = parts[1] == '0' ? '' : (parts[1] || '')
    add = parts[2] || ''
    condition = parts[3] || '.'

    # "able/CD" carries continuation flags after the last slash.
    add_str, flags_str = add.include?('/') ? add.rpartition('/').values_at(0, 2) : [add, '']
    flags = flags_str.empty? ? Set.new : parse_flags(flags_str).to_set

    Affix.new(
      type:,
      flag:,
      crossproduct:,
      strip:,
      add: add_str == '0' ? '' : add_str, # "0" stands for "nothing"
      condition:,
      flags:
    )
  end
end
|
|
292
|
+
|
|
293
|
+
# Read an array of values from the reader.
|
|
294
|
+
#
|
|
295
|
+
# @param reader [FileReader] The file reader
|
|
296
|
+
# @param count [Integer] Number of lines to read
|
|
297
|
+
# @return [Array<Array<String>>] Array of parsed lines
|
|
298
|
+
def read_array(reader, count)
  # Pulls up to `count` lines from the reader and returns, for each line that
  # has at least two whitespace-separated fields, the fields after the first
  # one (the repeated directive name). Lines with fewer fields are consumed
  # but contribute no row.
  rows = []

  count.times do
    _line_no, line = reader.next
    # A table that announces more rows than the file contains must not crash
    # the whole load; keep what was read so far.
    # NOTE(review): assumes the reader signals exhaustion with nil or
    # StopIteration - confirm against FileReader#next.
    break if line.nil?

    # Argument-less split ignores leading whitespace, so an indented row does
    # not produce an empty first field that would shift every column.
    fields = line.split
    rows << fields.drop(1) if fields.length > 1
  end

  rows
rescue StopIteration
  rows
end
|
|
308
|
+
|
|
309
|
+
# Parse a single flag.
|
|
310
|
+
#
|
|
311
|
+
# @param string [String] Flag string
|
|
312
|
+
# @return [String] Parsed flag
|
|
313
|
+
# Delegates to parse_flags and keeps only the first flag (nil when the
# string yields no flags at all).
def parse_flag(string) = parse_flags(string).first
|
|
316
|
+
|
|
317
|
+
# Parse multiple flags.
|
|
318
|
+
#
|
|
319
|
+
# @param string [String] Flag string
|
|
320
|
+
# @return [Array<String>] Parsed flags
|
|
321
|
+
def parse_flags(string)
  return [] if string.nil? || string.empty?

  # A string that is itself a key of the alias table stands for the whole
  # aliased flag set and is not split any further.
  aliased = @flag_synonyms&.key?(string)
  return @flag_synonyms[string].to_a if aliased

  case @flag_format
  when 'short', 'UTF-8' then string.chars   # one character per flag
  when 'long' then string.scan(/../)        # two characters per flag
  when 'num' then string.scan(/\d+/)        # runs of digits
  else raise ArgumentError, "Unknown flag format: #{@flag_format}"
  end
end
|
|
342
|
+
|
|
343
|
+
# Detect the file's encoding from its SET directive.
|
|
344
|
+
# Pre-scans the first ~4KB of the file in binary mode so we can
|
|
345
|
+
# reopen with the correct encoding before the FileReader consumes it.
|
|
346
|
+
#
|
|
347
|
+
# @param path [String] Path to the .aff file
|
|
348
|
+
# @return [String, nil] Normalized encoding name (e.g., "ISO-8859-1", "UTF-8") or nil
|
|
349
|
+
def detect_encoding(path)
  return nil unless File.file?(path)

  # Only the head of the file is inspected, read as raw bytes so that no
  # encoding has to be assumed yet.
  snippet = File.open(path, "rb") { |f| f.read(4096) }
  # IO#read(length) returns nil at end of file, i.e. for an empty file.
  return nil if snippet.nil?

  # A UTF-8 byte-order mark in front of a first-line SET would keep the
  # line-start anchor below from matching.
  snippet = snippet.delete_prefix("\xEF\xBB\xBF".b)

  match = snippet.match(/^SET\s+(\S+)/)
  return nil unless match

  normalize_encoding_name(match[1])
end
|
|
358
|
+
|
|
359
|
+
# Normalize Hunspell encoding names to Ruby encoding names.
|
|
360
|
+
#
|
|
361
|
+
# @param name [String] Hunspell encoding identifier
|
|
362
|
+
# @return [String] Ruby encoding name
|
|
363
|
+
def normalize_encoding_name(name)
  # Maps the spellings found after SET in .aff files onto names Ruby's
  # Encoding.find understands. Anything not recognised is returned unchanged.
  return name if name.upcase == "UTF-8"

  compact = name.upcase.delete("-")

  # Hyphen-less spelling of UTF-8.
  return "UTF-8" if compact == "UTF8"

  # "ISO8859-1" / "iso-8859-15" => "ISO-8859-1" / "ISO-8859-15"
  return "ISO-8859-#{compact.sub("ISO8859", "")}" if compact.start_with?("ISO8859")

  # "microsoft-cp1251" => "CP1251"; Ruby knows the code page under its CP alias.
  return "CP#{compact.delete_prefix("MICROSOFTCP")}" if compact.match?(/\AMICROSOFTCP\d+\z/)

  name
end
|
|
373
|
+
end
|
|
374
|
+
end
|
|
375
|
+
end
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Readers
|
|
5
|
+
# Base class for checking affix conditions.
|
|
6
|
+
#
|
|
7
|
+
# Hunspell affix rules specify conditions that the stem must match
|
|
8
|
+
# before an affix can be applied. Different scripts may have different
|
|
9
|
+
# interpretations of these conditions.
|
|
10
|
+
#
|
|
11
|
+
# @example Latin script condition checking
|
|
12
|
+
# checker = LatinScriptConditionChecker.compile('[^y]')
|
|
13
|
+
# checker.matches?('tree') # => true (doesn't end with 'y')
|
|
14
|
+
# checker.matches?('fly') # => false (ends with 'y')
|
|
15
|
+
#
|
|
16
|
+
# @abstract Subclasses must implement the matches? method
|
|
17
|
+
class ConditionChecker
  # Compile a condition string into a checker.
  #
  # @param condition [String] The condition string from .aff file
  # @param script [Symbol] The script type (:latin, :arabic, :hebrew, etc.)
  # @return [ConditionChecker] A LatinScriptConditionChecker for :latin,
  #   otherwise a PassthroughConditionChecker (the condition is not applied)
  def self.compile(condition, script: :latin)
    case script
    when :latin
      LatinScriptConditionChecker.compile(condition)
    else
      PassthroughConditionChecker.new
    end
  end

  # Check if the given stem matches this condition.
  #
  # @param stem [String] The stem to check
  # @return [Boolean] True if the stem matches
  # @raise [NotImplementedError] Always, unless overridden by a subclass
  def matches?(stem)
    raise NotImplementedError, "#{self.class} must implement #matches?"
  end
end

# Passthrough condition checker that always returns true.
#
# Returned by ConditionChecker.compile for every script other than :latin.
class PassthroughConditionChecker < ConditionChecker
  # @param _stem [String] Ignored
  # @return [Boolean] Always true
  def matches?(_stem)
    true
  end
end

# Condition checker for Latin-script dictionaries.
#
# A condition is matched against the END of the stem. Supported syntax:
# - '.' alone matches any stem
# - 'y' or 'abc' (no brackets, no dots) matches stems ending with that string
# - '[abc]' matches stems ending with 'a', 'b', or 'c'
# - '[^y]' matches stems whose last character is not 'y'
# - '[0-9]' matches stems ending with a digit (a dash inside brackets is a range)
# - '[aeiou]y' / '[^aeiou]y' match vowel / non-vowel followed by 'y'
# - brackets and dots may appear anywhere: 'e[rs]', '.y', 'a[bc]d'
#
# Every character outside brackets other than '.' is taken literally, so
# conditions containing '+', '(', '?' etc. cannot be misread as regex syntax.
#
# NOTE(review): prefix (PFX) conditions conventionally apply to the start of
# the word; this class only ever anchors at the end - confirm how callers use it.
# NOTE(review): hunspell(5) reportedly gives '-' no special meaning inside
# brackets; range behaviour is kept here for backward compatibility - confirm.
#
# This is NOT suitable for RTL scripts or CJK languages.
class LatinScriptConditionChecker < ConditionChecker
  # A condition that consists of exactly one bracket expression.
  WHOLE_BRACKET = /\A\[([^\]]+)\]\z/
  # Splits a condition into bracket expressions and single characters.
  TOKEN = /\[[^\]]+\]|./m

  # pattern is the compiled Regexp for :regex checkers and nil otherwise.
  attr_reader :pattern, :condition, :type

  # Compile a condition string.
  #
  # @param condition [String, nil] The condition string (e.g., '[^y]', 'y', '.', 'e[rs]')
  # @return [LatinScriptConditionChecker] A checker instance
  def self.compile(condition)
    return new(condition: nil, type: :any) if condition.nil? || condition == '.'

    if (whole = WHOLE_BRACKET.match(condition))
      content = whole[1]
      if content.length > 1
        new(condition: condition, type: :regex)
      elsif content == '^'
        # "[^]" excludes nothing, so every stem passes.
        new(condition: '', type: :not_ends_with)
      else
        # Single-member set: [x]
        new(condition: content, type: :ends_with_any)
      end
    elsif condition.match?(/[\[.]/)
      # A bracket or wildcard somewhere in the condition ('e[rs]', '[y]s',
      # '.y') needs pattern matching; a plain suffix test would compare
      # against the literal text and never match.
      new(condition: condition, type: :regex)
    else
      # Bare character or string - matches stems ENDING with this string
      new(condition: condition, type: :ends_with)
    end
  end

  # @param condition [String, nil] Condition text (meaning depends on type)
  # @param type [Symbol] :any, :ends_with, :ends_with_any, :not_ends_with,
  #   :regex or :equals
  def initialize(condition:, type:)
    @condition = condition
    @type = type
    @regex_pattern = compile_regex if type == :regex
    @pattern = @regex_pattern
  end

  # Compile a regex pattern for conditions that need pattern matching.
  #
  # Bracket expressions are kept as character classes, '.' stays a wildcard
  # and every other character is escaped. The pattern is anchored with \z so
  # that it must reach the very end of the stem (unlike $, which also matches
  # before a trailing newline).
  #
  # @return [Regexp, nil] Compiled regex or nil when there is no condition
  def compile_regex
    return nil unless @condition

    source = @condition.scan(TOKEN).map { |token| translate_token(token) }.join
    Regexp.new("#{source}\\z")
  end

  # Check if the stem matches the condition.
  #
  # @param stem [String] The stem to check
  # @return [Boolean] True if the stem matches
  def matches?(stem)
    case @type
    when :any then true
    when :ends_with then stem.end_with?(@condition)
    when :ends_with_any then @condition.chars.any? { |char| stem.end_with?(char) }
    when :not_ends_with then @condition.chars.none? { |char| stem.end_with?(char) }
    when :regex then @regex_pattern.match?(stem)
    when :equals then stem == @condition
    else false
    end
  end

  private

  # Turn one token of a condition into regex source.
  #
  # @param token [String] A bracket expression, '.', or a single literal character
  # @return [String] Regex source for that token
  def translate_token(token)
    return '.' if token == '.'
    # TOKEN only yields multi-character tokens for bracket expressions.
    return Regexp.escape(token) unless token.length > 1

    negated = token[1] == '^'
    members = token[(negated ? 2 : 1)..-2]
    # "[^]" - nothing is excluded, so any single character is accepted.
    return '.' if members.empty?

    # Neutralise the characters that would change the meaning of a Ruby
    # character class (escape, nested class, intersection).
    safe = members.gsub(/[\\\[&]/) { |char| "\\#{char}" }
    "[#{'^' if negated}#{safe}]"
  end
end
|
|
141
|
+
end
|
|
142
|
+
end
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'file_reader'
|
|
4
|
+
|
|
5
|
+
module Kotoshu
|
|
6
|
+
module Readers
|
|
7
|
+
# Word entry from the dictionary file.
|
|
8
|
+
#
|
|
9
|
+
# @attr stem [String] The word stem
|
|
10
|
+
# @attr flags [Set<String>] Morphological flags
|
|
11
|
+
Word = Struct.new(:stem, :flags, keyword_init: true) do
  # Create a word from a dictionary line.
  #
  # The line is split at the first slash that is not escaped with a
  # backslash: the text before it is the stem (with "\/" turned back into
  # "/"), the text after it - up to the first whitespace - is the flag field.
  # Anything following the flag field (e.g. morphological data such as
  # "po:noun") is ignored so it cannot leak into the flags.
  #
  # @param line [String] The dictionary line
  # @param context [Hash] The reading context; :flag_format and
  #   :flag_synonyms are used for flag parsing
  # @return [Word] The parsed word (empty stem and no flags for an empty line)
  def self.from_line(line, context = {})
    parts = line.split(%r{(?<!\\)/})
    stem = parts[0].to_s.strip.gsub('\\/', '/')
    flags_str = parts[1]&.split(/\s+/, 2)&.first

    flags = if flags_str && context[:flag_format]
              parse_flags(flags_str, context[:flag_format], context[:flag_synonyms])
            elsif flags_str
              # No format known: one character per flag.
              flags_str.chars.to_set
            else
              Set.new
            end

    new(stem:, flags:)
  end

  # Parse flags from string.
  #
  # When an alias table is present, an all-digit string is an index into it
  # (unknown indexes yield no flags). Without aliases the string is always
  # split according to the flag format, so numeric flags such as "12" or
  # "101,102" are no longer discarded.
  #
  # @param string [String] Flag string
  # @param flag_format [String] Flag format ('short', 'long', 'num', 'UTF-8');
  #   any other value is treated like 'short'
  # @param flag_synonyms [Hash, nil] Flag synonyms map (alias number => flags)
  # @return [Set<String>] Parsed flags
  def self.parse_flags(string, flag_format, flag_synonyms = {})
    return Set.new if string.nil? || string.empty?

    if flag_synonyms && !flag_synonyms.empty? && string.match?(/\A\d+\z/)
      return (flag_synonyms[string] || Set.new).to_set
    end

    case flag_format
    when 'long' then string.scan(/../).to_set
    when 'num' then string.scan(/\d+/).to_set
    else string.chars.to_set
    end
  end
end
|
|
61
|
+
|
|
62
|
+
# DIC file reader for Hunspell dictionary files.
|
|
63
|
+
#
|
|
64
|
+
# This class reads .dic files and creates a list of Word entries.
|
|
65
|
+
#
|
|
66
|
+
# @example Reading a dic file
|
|
67
|
+
# reader = DicReader.new('en_US.dic', flag_format: 'short')
|
|
68
|
+
# words = reader.read
|
|
69
|
+
class DicReader
  # First content line of a regular dictionary: the word count, optionally
  # followed by more text. Up to three leading non-digit characters are
  # tolerated.
  # NOTE(review): the tolerance is meant for a byte-order mark the reader may
  # leave in place - confirm against FileReader.
  COUNT_LINE = /\A\D{0,3}\d+(\s|\z)/

  attr_reader :path, :encoding, :flag_format, :flag_synonyms

  # Create a new DIC reader.
  #
  # @param path [String] Path to the .dic file
  # @param encoding [String] File encoding (default: 'UTF-8')
  # @param flag_format [String] Flag format ('short', 'long', 'num', 'UTF-8')
  # @param flag_synonyms [Hash] Flag synonyms map
  def initialize(path, encoding: 'UTF-8', flag_format: 'short', flag_synonyms: {})
    @path = path
    @encoding = encoding
    @flag_format = flag_format
    @flag_synonyms = flag_synonyms
  end

  # Read the dic file and return a list of Word entries.
  #
  # Empty lines are skipped. The first non-empty line is dropped only when it
  # looks like a word count; files that start directly with a word (for
  # example personal word lists) keep their first entry. The announced count
  # is not verified against the number of entries read.
  #
  # @return [Array<Word>] List of word entries
  def read
    reader = FileReader.new(@path, @encoding)
    context = { flag_format: @flag_format, flag_synonyms: @flag_synonyms }

    words = []
    awaiting_header = true

    reader.each do |_line_no, line|
      next if line.empty?

      if awaiting_header
        awaiting_header = false
        next if line.match?(COUNT_LINE)
      end

      words << Word.from_line(line, context)
    end

    words
  end
end
|
|
117
|
+
end
|
|
118
|
+
end
|