kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Readers
|
|
5
|
+
# One Hunspell affix rule: a single prefix or suffix entry.
#
# @attr type [Symbol] :prefix or :suffix
# @attr flag [String] The flag identifying this rule
# @attr crossproduct [Boolean] Whether this is a cross-product rule
# @attr strip [String] Characters to strip from the word
# @attr add [String] Characters to add to the word
# @attr condition [String] Condition for applying this rule
# @attr flags [Set<String>] Additional flags
#
# @example Creating a suffix affix
#   Affix.new(
#     type: :suffix,
#     flag: 'H',
#     crossproduct: false,
#     strip: 'y',
#     add: 'ieth',
#     condition: 'y',
#     flags: Set.new
#   )
class Affix
  attr_reader :type, :flag, :crossproduct, :strip, :add, :condition, :flags

  # Build an affix rule from its parsed fields.
  #
  # @param type [Symbol] :prefix or :suffix
  # @param flag [String] Flag character
  # @param crossproduct [Boolean] Whether cross-product
  # @param strip [String] Characters to strip
  # @param add [String] Characters to add
  # @param condition [String] Condition regex
  # @param flags [Set<String>] Additional flags (a fresh empty set when omitted)
  def initialize(type:, flag:, crossproduct:, strip:, add:, condition:, flags: Set.new)
    @type, @flag, @crossproduct = type, flag, crossproduct
    @strip, @add, @condition = strip, add, condition
    @flags = flags
  end

  # @return [Boolean] True when the rule type is :prefix
  def prefix?
    type == :prefix
  end

  # @return [Boolean] True when the rule type is :suffix
  def suffix?
    type == :suffix
  end

  # Human-readable summary of the rule.
  #
  # Any type other than :prefix is labelled "Suffix".
  #
  # @return [String] String representation
  def to_s
    kind = prefix? ? 'Prefix' : 'Suffix'
    cross_marker = crossproduct ? '×' : ''
    extra_flags = flags.to_a.join(',')
    "#{kind}(#{add}: #{flag}#{cross_marker}/#{extra_flags}, on #{condition})"
  end

  # Same text as {#to_s}.
  #
  # @return [String] Inspect string
  def inspect
    to_s
  end
end
|
|
77
|
+
|
|
78
|
+
# Break pattern for word splitting (Hunspell BREAK directive).
#
# @attr pattern [String] The break pattern exactly as given
# @attr matcher [Regexp] Compiled matcher; capture group 1 is the break text
class BreakPattern
  attr_reader :pattern, :matcher

  # Create a new break pattern.
  #
  # Only a leading `^` and a trailing `$` are treated as anchors (start / end
  # of the word). Every other character, including `^` or `$` in the middle
  # of the pattern, is matched literally.
  #
  # @param pattern [String] The pattern string
  def initialize(pattern)
    @pattern = pattern
    @matcher = Regexp.new(build_source(pattern))
  end

  private

  # Translate the break pattern into regexp source.
  #
  # Anchored patterns become `(\A…)` / `(…\z)`; \A and \z are used instead of
  # ^ and $ because the latter are per-line anchors in Ruby. Unanchored
  # patterns become `.(…).`, i.e. the break text must have at least one
  # character on each side.
  #
  # @param pattern [String] The pattern string
  # @return [String] Regexp source with the break text in capture group 1
  def build_source(pattern)
    body = pattern
    at_start = body.start_with?('^')
    body = body[1..] if at_start
    at_end = body.end_with?('$')
    body = body[0...-1] if at_end

    literal = Regexp.escape(body)
    return ".(#{literal})." unless at_start || at_end

    head = at_start ? '\A' : ''
    tail = at_end ? '\z' : ''
    "(#{head}#{literal}#{tail})"
  end
end
|
|
99
|
+
|
|
100
|
+
# Set of characters to drop from words before lookup/suggest.
#
# @attr chars [String] Characters to ignore
# @attr translation_table [Hash{String => Integer}] Each ignored character
#   mapped to its (last) index in +chars+; only key membership is used
class Ignore
  attr_reader :chars, :translation_table

  # Create a new ignore set.
  #
  # @param chars [String] Characters to ignore
  def initialize(chars)
    @chars = chars
    lookup = {}
    # A repeated character keeps its first insertion slot but stores the
    # index of its last occurrence.
    chars.each_char.with_index { |char, position| lookup[char] = position }
    @translation_table = lookup
  end

  # Remove ignored characters from string.
  #
  # @param str [String] Input string
  # @return [String] New string with every ignored character removed
  def remove(str)
    kept = str.each_char.reject { |char| translation_table.key?(char) }
    kept.join
  end
end
|
|
124
|
+
|
|
125
|
+
# Replacement pattern for suggestions (Hunspell REP directive).
#
# @attr pattern [String] The pattern to match, exactly as given
# @attr replacement [String] The replacement string
# @attr matcher [Regexp] Compiled matcher for the pattern
class RepPattern
  attr_reader :pattern, :replacement, :matcher

  # Create a new replacement pattern.
  #
  # The pattern is literal text, not a regular expression: characters such
  # as `.`, `(` or `*` match themselves. Only a leading `^` and a trailing
  # `$` are special and anchor the match to the start / end of the word,
  # mirroring how BreakPattern treats its input.
  #
  # @param pattern [String] The pattern string
  # @param replacement [String] The replacement string
  def initialize(pattern, replacement)
    @pattern = pattern
    @replacement = replacement
    @matcher = Regexp.new(literal_source(pattern))
  end

  private

  # Build regexp source that matches the pattern text literally.
  #
  # @param pattern [String] The pattern string
  # @return [String] Escaped regexp source, with \A / \z for the anchors
  def literal_source(pattern)
    text = pattern
    head = ''
    tail = ''

    if text.start_with?('^')
      head = '\A'
      text = text[1..]
    end

    if text.end_with?('$')
      tail = '\z'
      text = text[0...-1]
    end

    "#{head}#{Regexp.escape(text)}#{tail}"
  end
end
|
|
143
|
+
|
|
144
|
+
# Conversion table for ICONV/OCONV.
#
# @attr pairs [Array<Array<String>>] Array of [pattern, replacement] pairs
class ConvTable
  attr_reader :pairs

  # Create a new conversion table.
  #
  # In a pattern, a leading `_` means "only at the start of the word" and a
  # trailing `_` means "only at the end of the word"; in a replacement, `_`
  # stands for a space. Rows whose pattern is empty once the underscores are
  # removed are dropped, because they would never consume any input.
  #
  # @param pairs [Array<Array<String>>] Array of [pattern, replacement] pairs
  def initialize(pairs)
    @pairs = pairs
    rows = pairs.map { |pat1, pat2| compile_row(pat1, pat2) }
                .reject { |search, _, _| search.empty? }
    # Longest search text first, so the first matching row is the longest
    # match; rows of equal length keep their original order.
    @table = rows.each_with_index
                 .sort_by { |(search, _, _), index| [-search.length, index] }
                 .map(&:first)
  end

  # Apply conversions to word.
  #
  # Scans left to right. At each position the longest pattern that matches
  # exactly there is replaced; otherwise the character is copied unchanged.
  #
  # @param word [String] Input word
  # @return [String] Converted word
  def call(word)
    pos = 0
    result = +''

    while pos < word.length
      row = @table.find { |_, pattern, _| pattern.match?(word, pos) }

      if row
        search, _, replacement = row
        result << replacement
        pos += search.length
      else
        result << word[pos]
        pos += 1
      end
    end

    result
  end

  private

  # Compile one table row.
  #
  # The regexp starts with \G so that `match?(word, pos)` only succeeds for a
  # match beginning exactly at +pos+ (without it Ruby searches forward from
  # +pos+). The search text is escaped because it is literal, not a regexp.
  #
  # @param pat1 [String] Pattern, possibly with `_` boundary markers
  # @param pat2 [String] Replacement, with `_` standing for a space
  # @return [Array(String, Regexp, String)] [search text, matcher, replacement]
  def compile_row(pat1, pat2)
    search = pat1.delete('_')

    source = +'\G'
    source << '\A' if pat1.start_with?('_')
    source << Regexp.escape(search)
    source << '\z' if pat1.end_with?('_')

    [search, Regexp.new(source), pat2.tr('_', ' ')]
  end
end
|
|
195
|
+
|
|
196
|
+
# Compound rule pattern (Hunspell COMPOUNDRULE directive).
#
# @attr text [String] The rule text
# @attr flags [Set<String>] Flags mentioned in this rule
# @attr re [Regexp] Compiled (unanchored) regex built from the rule
class CompoundRule
  attr_reader :text, :flags, :re

  # Create a new compound rule.
  #
  # Two spellings are understood: one character per flag ("A*B?CD"), or
  # parenthesised multi-character flags ("(aa)(bb)*(cc)"). `*` and `?` are
  # quantifiers; every other character of a flag is matched literally.
  #
  # @param text [String] The rule text (e.g., "A*B?CD")
  def initialize(text)
    @text = text

    if text.include?('(')
      @flags = text.scan(/\((.+?)\)/).flatten.to_set
      parts = text.scan(/\(([^*?]+?)\)([*?]?)/).map do |flag, quantifier|
        "(#{Regexp.escape(flag)})#{quantifier}"
      end
    else
      @flags = text.gsub(/[*?]/, '').chars.to_set
      # Each part is one flag character plus an optional quantifier. The flag
      # is escaped so characters such as ")" (seen in sv dictionaries) or "."
      # are not interpreted as regexp syntax.
      parts = text.scan(/[^*?][*?]?/).map { |part| Regexp.escape(part[0]) + part[1..] }
    end

    source = parts.join
    @re = Regexp.new(source)
    # Anchored twin of @re, used by #fullmatch.
    @full_re = Regexp.new("\\A(?:#{source})\\z")
  end

  # Check if flag sets fully match this rule.
  #
  # Each element of +flag_sets+ holds the flags of one compound part. The rule
  # matches when some choice of one rule-relevant flag per part, concatenated
  # in order, matches the whole rule from start to end.
  #
  # @param flag_sets [Array<Set<String>>] Array of flag sets, one per part
  # @return [Boolean] True if matches
  def fullmatch(flag_sets)
    candidates = flag_sets.map { |part_flags| @flags.intersection(part_flags).to_a }
    # No parts at all: only a rule that can match nothing (e.g. "A*") fits.
    return @full_re.match?('') if candidates.empty?

    first, *rest = candidates
    first.product(*rest).any? { |combination| @full_re.match?(combination.join) }
  end
end
|
|
235
|
+
|
|
236
|
+
# Compound pattern for checking compound words.
#
# Each side is written "stem" or "stem/flag"; a stem of "0" means an empty
# stem and additionally sets the side's *_no_affix marker.
#
# @attr left [String] Left side pattern, as given
# @attr right [String] Right side pattern, as given
# @attr replacement [String, nil] Optional replacement
# @attr left_stem [String] Text the left part's stem must end with
# @attr left_flag [String] Flag required on the left part ("" when none)
# @attr right_stem [String] Text the right part's stem must start with
# @attr right_flag [String] Flag required on the right part ("" when none)
# @attr left_no_affix [Boolean] True when the left stem was written as "0"
# @attr right_no_affix [Boolean] True when the right stem was written as "0"
class CompoundPattern
  attr_reader :left, :right, :replacement, :left_stem, :left_flag, :right_stem, :right_flag,
              :left_no_affix, :right_no_affix

  # Create a new compound pattern.
  #
  # @param left [String] Left side pattern
  # @param right [String] Right side pattern
  # @param replacement [String, nil] Optional replacement
  def initialize(left, right, replacement = nil)
    @left = left
    @right = right
    @replacement = replacement

    @left_stem, @left_flag, @left_no_affix = parse_side(left)
    @right_stem, @right_flag, @right_no_affix = parse_side(right)
  end

  # Check if this pattern matches the given left and right parts.
  #
  # @param left_part [AffixForm] Left part with stem, flags, is_base?
  # @param right_part [AffixForm] Right part with stem, flags, is_base?
  # @return [Boolean] True if matches
  def match?(left_part, right_part)
    return false unless left_part.stem.end_with?(@left_stem)
    return false unless right_part.stem.start_with?(@right_stem)
    return false if @left_no_affix && left_part.is_base?
    return false if @right_no_affix && right_part.is_base?
    return false unless flag_satisfied?(@left_flag, left_part)
    return false unless flag_satisfied?(@right_flag, right_part)

    true
  end

  private

  # Split one side into its stem, flag and "0" marker.
  #
  # @param side [String] "stem" or "stem/flag"
  # @return [Array(String, String, Boolean)] [stem, flag, no_affix]
  def parse_side(side)
    stem, _, flag = side.partition('/')
    zero = stem == '0'
    [zero ? '' : stem, flag, zero]
  end

  # A side without a flag places no flag requirement on the part.
  #
  # The explicit empty check matters: partition returns "" for a missing
  # flag, and "" is truthy in Ruby.
  #
  # @param flag [String, nil] Required flag, or ""/nil for none
  # @param part [AffixForm] Part whose flags are inspected
  # @return [Boolean] True when the requirement is met
  def flag_satisfied?(flag, part)
    flag.nil? || flag.empty? || part.flags.include?(flag)
  end
end
|
|
282
|
+
|
|
283
|
+
# Phonetic table for PHONE directive.
#
# @attr table [Array<Array<String>>] Array of [pattern, replacement] pairs
# @attr rules [Hash{String => Array<Rule>}] Parsed rules, grouped by the first
#   character of their pattern, in table order
class PhonetTable
  attr_reader :table, :rules

  # Pattern for matching phonetic rules:
  # letters, optional "(alternatives)", "-" lookahead markers, "^"/"$"/"<"
  # flags and a one-digit priority.
  # [[:alpha:]] is used so extended Latin characters like É, À are accepted.
  # Anchored with \A/\z so the whole rule string must match.
  RULE_PATTERN = /\A(?<letters>[[:alpha:]]+)(\((?<optional>[[:alpha:]]+)\))?(?<lookahead>[-]+)?(?<flags>[\^$<]*)(?<priority>\d)?\z/.freeze

  # Rule class for phonetic transformations.
  #
  # @attr search [Regexp] Search pattern
  # @attr replacement [String] Replacement string
  # @attr start [Boolean] Match at start
  # @attr end [Boolean] Match at end
  # @attr priority [Integer] Rule priority
  # @attr followup [Boolean] Follow-up rule
  Rule = Struct.new(:search, :replacement, :start, :end, :priority, :followup, keyword_init: true) do
    # Check if rule matches at position.
    #
    # The match must begin exactly at +pos+. A rule flagged +start+ can only
    # match at position 0; a rule flagged +end+ must also reach the end of
    # the word.
    #
    # @param word [String] Word to check
    # @param pos [Integer] Position in word
    # @return [Boolean] True if matches
    def match?(word, pos)
      # Struct members are read through accessors; they are not instance
      # variables. `end` is a keyword, hence self[:end].
      return false if start && pos.positive?

      found = search.match(word, pos)
      return false unless found && found.begin(0) == pos
      return found.end(0) == word.length if self[:end]

      true
    end
  end

  # Create a new phonetic table.
  #
  # @param table [Array<Array<String>>] Array of [pattern, replacement] pairs
  # @raise [ArgumentError] If a pattern is not a well-formed rule
  def initialize(table)
    @table = table
    @rules = Hash.new { |hash, key| hash[key] = [] }

    table.each do |search, replacement|
      @rules[search[0]] << parse_rule(search, replacement)
    end
  end

  # Parse a phonetic rule.
  #
  # @param search [String] Search pattern
  # @param replacement [String] Replacement string
  # @return [Rule] Parsed rule
  # @raise [ArgumentError] If +search+ does not match RULE_PATTERN
  def parse_rule(search, replacement)
    match = RULE_PATTERN.match(search)
    raise ArgumentError, "Not a proper rule: #{search.inspect}" unless match

    # One token per letter, plus one character-class token for "(…)".
    tokens = match['letters'].chars
    tokens << "[#{match['optional']}]" if match['optional']

    lookahead = match['lookahead']
    if lookahead
      # Each "-" turns one trailing token into lookahead (matched but not
      # consumed). Clamp so more dashes than tokens cannot index past the
      # start of the list.
      peek_count = [lookahead.length, tokens.length].min
      split_at = tokens.length - peek_count
      regex = "#{tokens[0...split_at].join}(?=#{tokens[split_at..].join})"
    else
      regex = tokens.join
    end

    Rule.new(
      search: Regexp.new(regex),
      replacement: replacement,
      start: match['flags']&.include?('^'),
      end: match['flags']&.include?('$'),
      priority: match['priority']&.to_i || 5,
      followup: !lookahead.nil?
    )
  end
end
|
|
355
|
+
end
|
|
356
|
+
end
|