kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,356 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Readers
5
# Value object for a single Hunspell affix rule (either a prefix or a suffix).
#
# @attr type [Symbol] :prefix or :suffix
# @attr flag [String] The flag character identifying this rule
# @attr crossproduct [Boolean] Whether this is a cross-product rule
# @attr strip [String] Characters to strip from the word
# @attr add [String] Characters to add to the word
# @attr condition [String] Condition for applying this rule
# @attr flags [Set<String>] Additional flags
#
# @example Creating a suffix affix
#   Affix.new(
#     type: :suffix,
#     flag: 'H',
#     crossproduct: false,
#     strip: 'y',
#     add: 'ieth',
#     condition: 'y',
#     flags: Set.new
#   )
class Affix
  attr_reader :type, :flag, :crossproduct, :strip, :add, :condition, :flags

  # Build an affix from its parsed fields.
  #
  # @param type [Symbol] :prefix or :suffix
  # @param flag [String] Flag character
  # @param crossproduct [Boolean] Whether cross-product
  # @param strip [String] Characters to strip
  # @param add [String] Characters to add
  # @param condition [String] Condition regex
  # @param flags [Set<String>] Additional flags
  def initialize(type:, flag:, crossproduct:, strip:, add:, condition:, flags: Set.new)
    @type, @flag, @crossproduct = type, flag, crossproduct
    @strip, @add = strip, add
    @condition, @flags = condition, flags
  end

  # @return [Boolean] True when the rule type is :prefix
  def prefix?
    :prefix == @type
  end

  # @return [Boolean] True when the rule type is :suffix
  def suffix?
    :suffix == @type
  end

  # Human-readable one-line summary of the rule.
  #
  # Any type other than :prefix is labelled "Suffix".
  #
  # @return [String] String representation
  def to_s
    label = prefix? ? 'Prefix' : 'Suffix'
    cross_marker = @crossproduct ? '×' : ''
    extra_flags = @flags.to_a.join(',')

    "#{label}(#{@add}: #{@flag}#{cross_marker}/#{extra_flags}, on #{condition})"
  end

  # Same text as {#to_s}.
  #
  # @return [String] Inspect string
  def inspect
    to_s
  end
end
77
+
78
# Break pattern for word splitting.
#
# A leading "^" anchors the pattern to the start of the word and a trailing
# "$" anchors it to the end of the word. Every other character, including a
# "^" or "$" in the middle of the pattern, is matched literally.
#
# @attr pattern [String] The break pattern as given
# @attr matcher [Regexp] Compiled matcher; capture group 1 is the break text
class BreakPattern
  attr_reader :pattern, :matcher

  # Create a new break pattern.
  #
  # @param pattern [String] The pattern string
  def initialize(pattern)
    @pattern = pattern
    @matcher = compile(pattern)
  end

  private

  # Translate the pattern into a Regexp.
  #
  # Only a caret in first position and a dollar in last position are anchors;
  # un-escaping them anywhere else would produce a regex that cannot match.
  # \A and \z are used instead of ^ and $ so that the anchors refer to the
  # whole string rather than to any line inside it.
  #
  # @param pattern [String] The pattern string
  # @return [Regexp] Compiled matcher
  def compile(pattern)
    at_start = pattern.start_with?('^')
    literal = at_start ? pattern.delete_prefix('^') : pattern
    at_end = literal.end_with?('$')
    literal = literal.delete_suffix('$') if at_end

    core = "#{'\\A' if at_start}#{Regexp.escape(literal)}#{'\\z' if at_end}"

    # An unanchored break must have at least one character on each side.
    at_start || at_end ? Regexp.new("(#{core})") : Regexp.new(".(#{core}).")
  end
end
99
+
100
# Set of characters to drop from words during lookup/suggest.
#
# @attr chars [String] Characters to ignore
# @attr translation_table [Hash{String => Integer}] Each ignored character
#   mapped to its (last) index in +chars+; only key membership is used by
#   {#remove}
class Ignore
  attr_reader :chars, :translation_table

  # Create a new ignore set.
  #
  # @param chars [String] Characters to ignore
  def initialize(chars)
    @chars = chars
    @translation_table = {}
    # A repeated character keeps its first slot in the hash but ends up with
    # the index of its last occurrence.
    chars.each_char.with_index do |char, index|
      @translation_table[char] = index
    end
  end

  # Return a copy of +str+ without any of the ignored characters.
  #
  # @param str [String] Input string
  # @return [String] String with ignored chars removed
  def remove(str)
    kept = str.each_char.reject { |char| @translation_table.key?(char) }
    kept.join
  end
end
124
+
125
# A pattern/replacement pair used when generating suggestions.
#
# @attr pattern [String] The pattern to match
# @attr replacement [String] The replacement string
# @attr matcher [Regexp] +pattern+ compiled as a regular expression
class RepPattern
  attr_reader :pattern, :replacement, :matcher

  # Create a new replacement pattern.
  #
  # The pattern is handed to Regexp.new unchanged, so it is interpreted with
  # regular-expression syntax (not escaped).
  #
  # @param pattern [String] The pattern string
  # @param replacement [String] The replacement string
  def initialize(pattern, replacement)
    compiled = Regexp.new(pattern)

    @pattern, @replacement = pattern, replacement
    @matcher = compiled
  end
end
143
+
144
# Conversion table for ICONV/OCONV.
#
# Each pair is [pattern, replacement]. In a pattern, a leading "_" means
# "only at the start of the word" and a trailing "_" means "only at the end
# of the word"; all underscores are removed from the text that is actually
# searched for. In a replacement, "_" stands for a space.
#
# @attr pairs [Array<Array<String>>] Array of [pattern, replacement] pairs
class ConvTable
  attr_reader :pairs

  # One compiled table row: the literal text to find, what to emit instead,
  # and whether the text must sit at the start and/or end of the word.
  Row = Struct.new(:search, :replacement, :at_start, :at_end)
  private_constant :Row

  # Create a new conversion table.
  #
  # @param pairs [Array<Array<String>>] Array of [pattern, replacement] pairs
  def initialize(pairs)
    @pairs = pairs

    rows = pairs.map { |pat1, pat2| compile_row(pat1, pat2) }
    # A row with nothing to search for would never advance the scan position.
    rows = rows.reject { |row| row.search.empty? }
    # Longest search text first; rows of equal length keep their input order.
    @table = rows.sort_by.with_index { |row, index| [-row.search.length, index] }
  end

  # Apply conversions to word.
  #
  # The word is scanned left to right. At each position the longest row whose
  # search text occurs exactly at that position is applied and the scan skips
  # past the consumed text; otherwise the character is copied unchanged.
  #
  # @param word [String] Input word
  # @return [String] Converted word
  def call(word)
    pos = 0
    result = +''

    while pos < word.length
      row = @table.find { |candidate| applies_at?(candidate, word, pos) }

      if row
        result << row.replacement
        pos += row.search.length
      else
        result << word[pos]
        pos += 1
      end
    end

    result
  end

  private

  # Turn one [pattern, replacement] pair into a Row.
  #
  # @param pat1 [String] Pattern, possibly with "_" position markers
  # @param pat2 [String] Replacement, with "_" standing for a space
  # @return [Row] Compiled row
  def compile_row(pat1, pat2)
    Row.new(pat1.delete('_'), pat2.tr('_', ' '), pat1.start_with?('_'), pat1.end_with?('_'))
  end

  # Check whether a row's search text occurs exactly at +pos+.
  #
  # The comparison is a literal substring test, so characters that are
  # special in regular expressions are matched as themselves.
  #
  # @param row [Row] Candidate row
  # @param word [String] Word being converted
  # @param pos [Integer] Current scan position
  # @return [Boolean] True if the row applies at this position
  def applies_at?(row, word, pos)
    length = row.search.length
    return false if row.at_start && pos.positive?
    return false if row.at_end && pos + length != word.length

    word[pos, length] == row.search
  end
end
195
+
196
# Compound rule pattern.
#
# A rule is a sequence of flags, each optionally followed by "*" (zero or
# more) or "?" (zero or one). Flags are single characters, or parenthesised
# groups such as "(aa)(bb)*(cc)" when the text contains "(".
#
# @attr text [String] The rule text
# @attr flags [Set<String>] Flags in this rule
# @attr re [Regexp] Compiled (unanchored) regex for the flag sequence;
#   {#fullmatch} uses an anchored version of the same expression
class CompoundRule
  attr_reader :text, :flags, :re

  # Create a new compound rule.
  #
  # Flags are regex-escaped before compilation so that flag characters which
  # are special in regular expressions (")", ".", "+", ...) are treated as
  # plain flags.
  #
  # @param text [String] The rule text (e.g., "A*B?CD")
  def initialize(text)
    @text = text

    if text.include?('(')
      @flags = text.scan(/\((.+?)\)/).flatten.to_set
      parts = text.scan(/\(([^*?]+?)\)([*?]?)/).map do |flag, quantifier|
        "(#{Regexp.escape(flag)})#{quantifier}"
      end
    else
      @flags = text.gsub(/[*?]/, '').chars.to_set
      parts = text.scan(/([^*?])([*?]?)/).map do |flag, quantifier|
        "#{Regexp.escape(flag)}#{quantifier}"
      end
    end

    source = parts.join
    @re = Regexp.new(source)
    # Anchored twin of @re: the whole flag sequence must be consumed.
    @full_re = Regexp.new("\\A(?:#{source})\\z")
  end

  # Check if flag sets fully match this rule.
  #
  # Each element of +flag_sets+ holds the flags of one compound part. Flags
  # not mentioned in the rule are ignored. The rule matches when some choice
  # of one relevant flag per part forms a sequence that the rule accepts in
  # its entirety. A part with no relevant flag makes the match fail; an empty
  # +flag_sets+ matches only rules that accept the empty sequence.
  #
  # @param flag_sets [Array<Set<String>>] Array of flag sets
  # @return [Boolean] True if matches
  def fullmatch(flag_sets)
    relevant_flags = flag_sets.map { |part_flags| @flags.intersection(part_flags).to_a }
    return @full_re.match?('') if relevant_flags.empty?

    first, *rest = relevant_flags
    # Try all combinations of relevant flags
    first.product(*rest).any? { |combination| @full_re.match?(combination.join) }
  end
end
235
+
236
# Compound pattern for checking compound words.
#
# Each side has the form "stem" or "stem/flag". A stem of "0" means "empty
# stem" and additionally marks that side as "no affix".
#
# @attr left [String] Left side pattern
# @attr right [String] Right side pattern
# @attr replacement [String, nil] Optional replacement
# @attr left_stem [String] Text the left part's stem must end with
# @attr left_flag [String] Flag required on the left part ("" when none)
# @attr left_no_affix [Boolean] True when the left side was given as "0"
# @attr right_stem [String] Text the right part's stem must start with
# @attr right_flag [String] Flag required on the right part ("" when none)
# @attr right_no_affix [Boolean] True when the right side was given as "0"
class CompoundPattern
  attr_reader :left, :right, :replacement, :left_stem, :left_flag, :right_stem, :right_flag,
              :left_no_affix, :right_no_affix

  # Create a new compound pattern.
  #
  # @param left [String] Left side pattern
  # @param right [String] Right side pattern
  # @param replacement [String, nil] Optional replacement
  def initialize(left, right, replacement = nil)
    @left = left
    @right = right
    @replacement = replacement

    @left_stem, @left_flag, @left_no_affix = parse_side(left)
    @right_stem, @right_flag, @right_no_affix = parse_side(right)
  end

  # Check if this pattern matches the given left and right parts.
  #
  # A side without a flag places no flag requirement on its part.
  #
  # @param left_part [AffixForm] Left part with stem, flags, is_base?
  # @param right_part [AffixForm] Right part with stem, flags, is_base?
  # @return [Boolean] True if matches
  def match?(left_part, right_part)
    return false unless left_part.stem.end_with?(@left_stem)
    return false unless right_part.stem.start_with?(@right_stem)
    return false if @left_no_affix && left_part.is_base?
    return false if @right_no_affix && right_part.is_base?
    return false unless flag_satisfied?(@left_flag, left_part)
    return false unless flag_satisfied?(@right_flag, right_part)

    true
  end

  private

  # Split one side into its stem, flag and no-affix marker.
  #
  # @param side [String] "stem" or "stem/flag"
  # @return [Array(String, String, Boolean)] stem, flag ("" if absent), no_affix
  def parse_side(side)
    stem, _, flag = side.partition('/')
    zero_stem = stem == '0'

    [zero_stem ? '' : stem, flag, zero_stem]
  end

  # Check the flag requirement of one side against a part.
  #
  # String#partition yields "" (which is truthy) when no "/" is present, so
  # the "no flag" case has to be tested with empty? rather than truthiness.
  #
  # @param flag [String, nil] Required flag, or ""/nil for no requirement
  # @param part [AffixForm] Part whose flags are checked
  # @return [Boolean] True if the requirement is met
  def flag_satisfied?(flag, part)
    flag.nil? || flag.empty? || part.flags.include?(flag)
  end
end
282
+
283
# Phonetic table for PHONE directive.
#
# @attr table [Array<Array<String>>] Array of [pattern, replacement] pairs
# @attr rules [Hash{String => Array<Rule>}] Parsed rules, grouped by the
#   first character of their search pattern
class PhonetTable
  attr_reader :table, :rules

  # Pattern for matching phonetic rules.
  # Uses [[:alpha:]] so that non-ASCII letters like É, À, etc. are accepted.
  # Anchored with \A/\z so the whole search string has to be a valid rule.
  RULE_PATTERN = /\A(?<letters>[[:alpha:]]+)(\((?<optional>[[:alpha:]]+)\))?(?<lookahead>[-]+)?(?<flags>[\^$<]*)(?<priority>\d)?\z/.freeze

  # Rule class for phonetic transformations.
  #
  # @attr search [Regexp] Search pattern
  # @attr replacement [String] Replacement string
  # @attr start [Boolean] Match at start
  # @attr end [Boolean] Match at end
  # @attr priority [Integer] Rule priority
  # @attr followup [Boolean] Follow-up rule
  Rule = Struct.new(:search, :replacement, :start, :end, :priority, :followup, keyword_init: true) do
    # Check if rule matches at position.
    #
    # The search pattern has to match starting exactly at +pos+. A rule
    # flagged +start+ only matches at position 0; a rule flagged +end+ only
    # matches when the matched text runs to the end of the word.
    #
    # @param word [String] Word to check
    # @param pos [Integer] Position in word
    # @return [Boolean] True if matches
    def match?(word, pos)
      # Struct members are read through their accessors; they are not
      # instance variables.
      return false if start && pos.positive?

      # Regexp#match scans forward from pos, so the hit's offset is checked
      # explicitly. The leftmost hit is returned, hence a hit at pos is found
      # whenever one exists.
      found = search.match(word, pos)
      return false unless found && found.begin(0) == pos
      return found.end(0) == word.length if self.end

      true
    end
  end

  # Create a new phonetic table.
  #
  # @param table [Array<Array<String>>] Array of [pattern, replacement] pairs
  # @raise [ArgumentError] If a search pattern is not a valid rule
  def initialize(table)
    @table = table
    @rules = Hash.new { |hash, key| hash[key] = [] }

    table.each do |search, replacement|
      @rules[search[0]] << parse_rule(search, replacement)
    end
  end

  # Parse a phonetic rule.
  #
  # The search string consists of letters, an optional parenthesised group
  # of alternative letters, an optional run of "-" (that many trailing
  # elements become a lookahead), optional flags ("^" start, "$" end, "<"),
  # and an optional one-digit priority (default 5).
  #
  # @param search [String] Search pattern
  # @param replacement [String] Replacement string
  # @return [Rule] Parsed rule
  # @raise [ArgumentError] If +search+ is malformed or asks for a longer
  #   lookahead than the pattern has elements
  def parse_rule(search, replacement)
    match = RULE_PATTERN.match(search)
    raise ArgumentError, "Not a proper rule: #{search.inspect}" unless match

    # One element per letter, plus one character class for the optional group.
    text = match['letters'].chars
    text << "[#{match['optional']}]" if match['optional']

    if match['lookahead']
      lookahead_len = match['lookahead'].length
      raise ArgumentError, "Not a proper rule: #{search.inspect}" if lookahead_len > text.length

      regex = text[0...-lookahead_len].join + "(?=#{text[-lookahead_len..].join})"
    else
      regex = text.join
    end

    Rule.new(
      search: Regexp.new(regex),
      replacement:,
      start: match['flags']&.include?('^'),
      end: match['flags']&.include?('$'),
      priority: match['priority']&.to_i || 5,
      followup: !match['lookahead'].nil?
    )
  end
end
355
+ end
356
+ end