kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,375 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'aff_data'
4
+ require_relative 'file_reader'
5
+
6
+ module Kotoshu
7
+ module Readers
8
+ # AFF file reader for Hunspell affix files.
9
+ #
10
+ # This class reads .aff files and creates an Aff data structure.
11
+ #
12
+ # @example Reading an aff file
13
+ # reader = AffReader.new('en_US.aff')
14
+ # aff = reader.read
15
class AffReader
  # Directives that are single boolean flags
  BOOLEAN_DIRECTIVES = %w[
    COMPLEXPREFIXES FULLSTRIP NOSPLITSUGS CHECKSHARPS
    CHECKCOMPOUNDCASE CHECKCOMPOUNDDUP CHECKCOMPOUNDREP CHECKCOMPOUNDTRIPLE
    SIMPLIFIEDTRIPLE ONLYMAXDIFF COMPOUNDMORESUFFIXES
  ].freeze

  # Directives that are single string values
  STRING_DIRECTIVES = %w[SET FLAG KEY TRY WORDCHARS LANG].freeze

  # Directives that are single integer values
  INTEGER_DIRECTIVES = %w[MAXDIFF MAXNGRAMSUGS MAXCPDSUGS COMPOUNDMIN COMPOUNDWORDMAX].freeze

  # Directives that are single flag values
  FLAG_DIRECTIVES = %w[
    NOSUGGEST KEEPCASE CIRCUMFIX NEEDAFFIX FORBIDDENWORD WARN
    COMPOUNDFLAG COMPOUNDBEGIN COMPOUNDMIDDLE COMPOUNDEND
    ONLYINCOMPOUND COMPOUNDPERMITFLAG COMPOUNDFORBIDFLAG FORCEUCASE
    SUBSTANDARD SYLLABLENUM COMPOUNDROOT
  ].freeze

  # Outdated directive names and their synonyms
  SYNONYMS = {
    'PSEUDOROOT' => 'NEEDAFFIX',
    'COMPOUNDLAST' => 'COMPOUNDEND'
  }.freeze

  # Directives whose header line introduces a group of affix rules
  AFFIX_DIRECTIVES = %w[SFX PFX].freeze

  # Byte-order mark some editors put in front of the first directive
  UTF8_BOM = "\xEF\xBB\xBF".b.freeze

  # How many leading bytes are scanned for the SET directive
  ENCODING_PROBE_BYTES = 4096

  attr_reader :path, :encoding, :flag_format

  # Create a new AFF reader.
  #
  # @param path [String] Path to the .aff file
  # @param encoding [String] File encoding (default: 'UTF-8');
  #   overridden by the file's SET directive when present
  def initialize(path, encoding: 'UTF-8')
    @path = path
    @encoding = detect_encoding(path) || encoding
    @flag_format = 'short'
    @flag_synonyms = {}
  end

  # Read the aff file and return the aff data structure.
  #
  # The result maps directive names to their parsed values. 'SFX' and
  # 'PFX' map to hashes keyed by affix flag; 'FLAG' defaults to 'short'.
  #
  # @param source [FileReader, nil] Optional file reader to use instead of creating a new one
  # @return [Hash] The aff data structure
  def read(source = nil)
    reader = source || FileReader.new(@path, @encoding)

    # Start from a clean parser state so repeated calls give the same result.
    @flag_format = 'short'
    @flag_synonyms = {}

    data = { 'SFX' => {}, 'PFX' => {}, 'FLAG' => 'short' }

    reader.each do |_line_no, line|
      directive, value = read_directive(reader, line)
      next unless directive

      # FLAG changes how every later flag string is decoded; AF installs
      # the alias table consulted by parse_flags from here on.
      case directive
      when 'FLAG' then @flag_format = value
      when 'AF' then @flag_synonyms = value
      end

      if AFFIX_DIRECTIVES.include?(directive)
        # A header with no usable rule lines yields an empty group; there
        # is nothing to key it by, so it is skipped instead of crashing.
        data[directive][value.first.flag] = value unless value.empty?
      else
        data[directive] = value
      end

      # Note: the encoding is not switched mid-iteration; it is chosen
      # up front by detect_encoding when the reader is constructed.
    end

    data
  end

  private

  # Read a directive from a line.
  #
  # @param reader [FileReader] The file reader
  # @param line [String] The line to parse
  # @return [Array, nil] [directive, value] or nil
  def read_directive(reader, line)
    name, *arguments = line.split(/\s+/)

    # Only all-caps first tokens are treated as directives
    return nil unless name&.match?(/\A[A-Z]+\z/)

    name = SYNONYMS.fetch(name, name)
    value = read_value(reader, name, arguments)

    value.nil? ? nil : [name, value]
  end

  # Read the value for a directive.
  #
  # Table directives (REP, MAP, SFX, ...) consume their following lines
  # from +reader+.
  #
  # @param reader [FileReader] The file reader
  # @param directive [String] The directive name
  # @param values [Array<String>] Values from the line
  # @return [Object, nil] The parsed value, or nil for an unknown directive
  def read_value(reader, directive, values)
    value = values.first

    case directive
    when *STRING_DIRECTIVES
      value
    when *INTEGER_DIRECTIVES
      value&.to_i
    when *FLAG_DIRECTIVES
      parse_flag(value)
    when *BOOLEAN_DIRECTIVES
      true
    when 'IGNORE'
      Ignore.new(value || '')
    when 'BREAK'
      read_table(reader, value).map { |parts| BreakPattern.new(parts.first) }
    when 'COMPOUNDRULE'
      read_table(reader, value).map { |parts| CompoundRule.new(parts.first) }
    when 'ICONV', 'OCONV'
      ConvTable.new(read_table(reader, value).map { |parts| [parts[0], parts[1] || ''] })
    when 'REP'
      read_table(reader, value).map { |parts| RepPattern.new(parts[0], parts[1] || '') }
    when 'MAP'
      read_table(reader, value).map { |parts| parse_map_entry(parts.first || '') }
    when *AFFIX_DIRECTIVES
      read_affixes(reader, directive, values)
    when 'CHECKCOMPOUNDPATTERN'
      read_table(reader, value).map do |parts|
        CompoundPattern.new(parts[0], parts[1] || '', parts[2])
      end
    when 'AF'
      # Alias vectors are written in the file's declared FLAG format, so
      # they are decoded with parse_flags rather than split per character.
      read_table(reader, value).each_with_index.to_h do |parts, index|
        [(index + 1).to_s, parse_flags(parts.first).to_set]
      end
    when 'AM'
      read_table(reader, value).each_with_index.to_h do |parts, index|
        [(index + 1).to_s, parts.to_set]
      end
    when 'COMPOUNDSYLLABLE'
      [value&.to_i, values[1]]
    when 'PHONE'
      PhonetTable.new(read_table(reader, value).map { |parts| [parts[0], parts[1] || '_'] })
    end
  end

  # Read the table announced by a "<DIRECTIVE> <count>" header line.
  #
  # @param reader [FileReader] The file reader
  # @param raw_count [String, nil] The count token from the header line
  # @return [Array<Array<String>>] Parsed table lines
  def read_table(reader, raw_count)
    read_array(reader, raw_count&.to_i || 0)
  end

  # Split a MAP entry into its related characters / character groups.
  #
  # "aàâä" becomes ["a", "à", "â", "ä"]; a parenthesized group such as
  # "(ss)" is kept as the single string "ss".
  #
  # @param chars [String] The MAP entry
  # @return [Array<String>] Characters and groups
  def parse_map_entry(chars)
    chars.scan(/(\([^()]+\)|[^()])/).flatten.map do |group|
      group.start_with?('(') && group.end_with?(')') ? group[1..-2] : group
    end
  end

  # Read the rule lines that follow an SFX/PFX header.
  #
  # Header format: FLAG cross_product count. Rule format after the
  # directive name: FLAG strip add[/flags] [condition].
  #
  # @param reader [FileReader] The file reader
  # @param directive [String] 'SFX' or 'PFX'
  # @param values [Array<String>] Header tokens after the directive name
  # @return [Array<Affix>] The affix rules of this group
  def read_affixes(reader, directive, values)
    flag = values[0]
    crossproduct = values[1] == 'Y'
    count = values[2]&.to_i || 0
    type = directive == 'PFX' ? :prefix : :suffix

    read_array(reader, count).map do |parts|
      # parts[0] repeats the flag; strip/add/condition follow it
      strip = parts[1] == '0' ? '' : (parts[1] || '')
      add, flags_str = split_add_field(parts[2] || '')
      condition = parts[3] || '.'
      flags = flags_str.empty? ? Set.new : parse_flags(flags_str).to_set

      Affix.new(
        type:,
        flag:,
        crossproduct:,
        strip:,
        add: add == '0' ? '' : add,
        condition:,
        flags:
      )
    end
  end

  # Split an add field such as "able/CD" into its text and flag string.
  #
  # @param field [String] The add field of an affix rule
  # @return [Array(String, String)] [text, flags string ('' when absent)]
  def split_add_field(field)
    return [field, ''] unless field.include?('/')

    text, _, flags_str = field.rpartition('/')
    [text, flags_str]
  end

  # Read an array of values from the reader.
  #
  # Stops early, returning what was read so far, when the reader runs
  # out of lines before +count+ is reached (truncated table).
  #
  # @param reader [FileReader] The file reader
  # @param count [Integer] Number of lines to read
  # @return [Array<Array<String>>] Array of parsed lines
  def read_array(reader, count)
    rows = []
    count.times do
      _line_no, line = next_entry(reader)
      break if line.nil?

      parts = line.split(/\s+/)
      # Skip the directive name at the beginning
      rows << parts[1..] if parts.length > 1
    end
    rows
  end

  # Fetch the next entry from the reader, treating exhaustion as nil.
  #
  # @param reader [#next] The file reader
  # @return [Array, nil] The reader's next entry, or nil at end of input
  def next_entry(reader)
    reader.next
  rescue StopIteration
    nil
  end

  # Parse a single flag.
  #
  # @param string [String] Flag string
  # @return [String, nil] Parsed flag (nil when the string holds none)
  def parse_flag(string)
    parse_flags(string).first
  end

  # Parse multiple flags.
  #
  # @param string [String] Flag string
  # @return [Array<String>] Parsed flags
  # @raise [ArgumentError] When the current flag format is unknown
  def parse_flags(string)
    return [] if string.nil? || string.empty?

    # A string that names an AF alias expands to the aliased flag set
    return @flag_synonyms[string].to_a if @flag_synonyms&.key?(string)

    case @flag_format
    when 'short', 'UTF-8'
      string.chars
    when 'long'
      string.scan(/../)
    when 'num'
      string.scan(/\d+/)
    else
      raise ArgumentError, "Unknown flag format: #{@flag_format}"
    end
  end

  # Detect the file's encoding from its SET directive.
  # Pre-scans the first ~4KB of the file in binary mode so the file can
  # be opened with the correct encoding before the FileReader consumes it.
  #
  # @param path [String] Path to the .aff file
  # @return [String, nil] Encoding name (e.g., "ISO-8859-1", "UTF-8") or nil
  def detect_encoding(path)
    return nil unless File.file?(path)

    snippet = File.open(path, 'rb') { |file| file.read(ENCODING_PROBE_BYTES) }
    # IO#read(length) returns nil for an empty file
    return nil if snippet.nil?

    # A BOM in front of "SET" would defeat the line anchor; [ \t]+ keeps
    # the match on the SET line itself.
    match = snippet.delete_prefix(UTF8_BOM).match(/^SET[ \t]+(\S+)/)
    match && normalize_encoding_name(match[1])
  end

  # Normalize Hunspell encoding names to Ruby encoding names.
  #
  # "ISO8859-1" style names become "ISO-8859-1"; everything else is
  # returned unchanged.
  #
  # @param name [String] Hunspell encoding identifier
  # @return [String] Ruby encoding name
  def normalize_encoding_name(name)
    return name if name.upcase == 'UTF-8'

    compact = name.upcase.delete('-')
    return name unless compact.start_with?('ISO8859')

    "ISO-8859-#{compact.sub('ISO8859', '')}"
  end
end
374
+ end
375
+ end
@@ -0,0 +1,142 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Readers
    # Base class for checking affix conditions.
    #
    # Hunspell affix rules specify conditions that the stem must match
    # before an affix can be applied. Different scripts may have different
    # interpretations of these conditions.
    #
    # @example Latin script condition checking
    #   checker = ConditionChecker.compile('[^y]')
    #   checker.matches?('walk')  # => true (doesn't end with 'y')
    #   checker.matches?('fly')   # => false (ends with 'y')
    #
    # @abstract Subclasses must implement the matches? method
    class ConditionChecker
      # Compile a condition string into a checker.
      #
      # @param condition [String] The condition string from .aff file
      # @param script [Symbol] The script type (:latin, :arabic, :hebrew, etc.)
      # @return [ConditionChecker] A checker instance
      def self.compile(condition, script: :latin)
        return LatinScriptConditionChecker.compile(condition) if script == :latin

        # For other scripts the condition is not applied
        PassthroughConditionChecker.new
      end

      # Check if the given stem matches this condition.
      #
      # @param stem [String] The stem to check
      # @return [Boolean] True if the stem matches
      # @raise [NotImplementedError] Always, unless overridden by a subclass
      def matches?(stem)
        raise NotImplementedError, "#{self.class} must implement #matches?"
      end
    end

    # Passthrough condition checker that always returns true.
    #
    # Used for scripts where Hunspell conditions don't apply or aren't supported.
    class PassthroughConditionChecker < ConditionChecker
      # @param stem [String] The stem to check (ignored)
      # @return [Boolean] Always true
      def matches?(stem)
        true
      end
    end

    # Condition checker for Latin-script dictionaries.
    #
    # Handles Hunspell condition syntax for Latin scripts:
    # - '.' matches any stem
    # - 'y' or 'abc' (single char or string) matches stems ending with that string
    # - '[abc]' matches stems ending with 'a', 'b', or 'c'
    # - '[^y]' matches stems NOT ending with 'y'
    # - '[0-9]' matches stems ending with a digit
    # - '[aeiou]y' matches stems ending with vowel + 'y' (multi-char pattern)
    # - '[^aeiou]y' matches stems ending with consonant + 'y' (multi-char pattern)
    # - 'e[rs]' and '.e' (set or wildcard after the first character) are
    #   matched as patterns too
    #
    # NOTE(review): every condition is anchored at the END of the stem;
    # prefix (PFX) rules would need start anchoring - confirm how callers
    # handle them.
    #
    # This is NOT suitable for RTL scripts or CJK languages.
    class LatinScriptConditionChecker < ConditionChecker
      attr_reader :pattern, :condition, :type

      # Compile a condition string.
      #
      # @param condition [String, nil] The condition string
      #   (e.g., '[^y]', '[abc]', 'y', '.', '[aeiou]y'); nil means "any"
      # @return [LatinScriptConditionChecker] A checker instance
      def self.compile(condition)
        return new(condition: nil, type: :any) if condition.nil? || condition == '.'

        # The whole condition is one set holding a single character: "[x]"
        lone_set = condition.match(/\A\[([^\]]+)\]\z/)
        if lone_set && lone_set[1].length == 1
          content = lone_set[1]
          # Degenerate "[^]" excludes nothing
          return new(condition: '', type: :not_ends_with) if content == '^'

          return new(condition: content, type: :ends_with_any)
        end

        # Any set or wildcard, wherever it sits, needs pattern matching;
        # only plain text can use a literal suffix comparison.
        if condition.match?(/[\[.]/)
          new(condition: condition, type: :regex)
        else
          new(condition: condition, type: :ends_with)
        end
      end

      # @param condition [String, nil] Condition text (meaning depends on +type+)
      # @param type [Symbol] One of :any, :ends_with, :ends_with_any,
      #   :not_ends_with, :regex, :equals
      def initialize(condition:, type:)
        @condition = condition
        @type = type
        @regex_pattern = compile_regex if type == :regex
        @pattern = @regex_pattern
      end

      # Compile a regex pattern for multi-character conditions.
      #
      # Bracket sets are kept verbatim and '.' stays a wildcard; every
      # other character is escaped so it is matched literally. If the
      # result is not a valid regex, the whole condition is matched as
      # literal text instead.
      #
      # @return [Regexp, nil] Compiled regex or nil
      def compile_regex
        return nil unless @condition

        # [^aeiou]y -> /[^aeiou]y\z/,  e[rs] -> /e[rs]\z/,  a+b. -> /a\+b.\z/
        source = @condition.scan(/\[[^\]]+\]|./m).map do |token|
          (token.length > 1 || token == '.') ? token : Regexp.escape(token)
        end.join
        Regexp.new("#{source}\\z")
      rescue RegexpError
        Regexp.new("#{Regexp.escape(@condition)}\\z")
      end

      # Check if the stem matches the condition.
      #
      # @param stem [String] The stem to check
      # @return [Boolean] True if the stem matches
      def matches?(stem)
        case @type
        when :any then true
        when :ends_with then stem.end_with?(@condition)
        when :ends_with_any then @condition.chars.any? { |char| stem.end_with?(char) }
        when :not_ends_with then @condition.chars.none? { |char| stem.end_with?(char) }
        when :regex then @regex_pattern.match?(stem)
        when :equals then stem == @condition
        else false
        end
      end
    end
  end
end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'file_reader'
4
+
5
+ module Kotoshu
6
+ module Readers
7
+ # Word entry from the dictionary file.
8
+ #
9
+ # @attr stem [String] The word stem
10
+ # @attr flags [Set<String>] Morphological flags
11
Word = Struct.new(:stem, :flags, keyword_init: true) do
  # Create a word from a dictionary line.
  #
  # Line format: stem[/flags] [morphological fields]. The stem ends at
  # the first unescaped '/'; a slash written as "\/" belongs to the stem.
  # Morphological fields (introduced by a tab, or by whitespace followed
  # by a two-character "xx:" tag) are dropped.
  #
  # @param line [String] The dictionary line
  # @param context [Hash] The reading context (for flag parsing);
  #   reads :flag_format and :flag_synonyms
  # @return [Word] The parsed word
  def self.from_line(line, context = {})
    # Cut morphological data so it is never mistaken for flags
    entry = line.split(/\t|[ \t]+(?=\S\S:)/, 2).first.to_s

    # Searching from index 1 keeps a leading '/' as part of the stem
    slash = entry.index(%r{(?<!\\)/}, 1)
    raw_stem = slash ? entry[0...slash] : entry
    # Flags never contain whitespace, so stop at the first blank
    flags_str = slash ? entry[(slash + 1)..][/\A\S*/] : nil

    flags =
      if flags_str.nil?
        Set.new
      elsif context[:flag_format]
        parse_flags(flags_str, context[:flag_format], context[:flag_synonyms])
      else
        flags_str.chars.to_set
      end

    new(stem: raw_stem.gsub('\/', '/').strip, flags:)
  end

  # Parse flags from string.
  #
  # A string that is a key of +flag_synonyms+ expands to the aliased
  # set; anything else is decoded according to +flag_format+. Unknown
  # formats are decoded one character per flag.
  #
  # @param string [String] Flag string
  # @param flag_format [String] Flag format ('short', 'long', 'num', 'UTF-8')
  # @param flag_synonyms [Hash, nil] Flag synonyms map
  # @return [Set<String>] Parsed flags
  def self.parse_flags(string, flag_format, flag_synonyms = {})
    return Set.new if string.nil? || string.empty?

    # Only a defined alias short-circuits; with no alias table a numeric
    # string such as "101" is an ordinary flag and must not be dropped.
    return flag_synonyms[string].to_set if flag_synonyms&.key?(string)

    case flag_format
    when 'long' then string.scan(/../).to_set
    when 'num' then string.scan(/\d+/).to_set
    else string.chars.to_set
    end
  end
end
61
+
62
+ # DIC file reader for Hunspell dictionary files.
63
+ #
64
+ # This class reads .dic files and creates a list of Word entries.
65
+ #
66
+ # @example Reading a dic file
67
+ # reader = DicReader.new('en_US.dic', flag_format: 'short')
68
+ # words = reader.read
69
class DicReader
  attr_reader :path, :encoding, :flag_format, :flag_synonyms

  # Create a new DIC reader.
  #
  # @param path [String] Path to the .dic file
  # @param encoding [String] File encoding (default: 'UTF-8')
  # @param flag_format [String] Flag format ('short', 'long', 'num', 'UTF-8')
  # @param flag_synonyms [Hash] Flag synonyms map
  def initialize(path, encoding: 'UTF-8', flag_format: 'short', flag_synonyms: {})
    @path = path
    @encoding = encoding
    @flag_format = flag_format
    @flag_synonyms = flag_synonyms
  end

  # Read the dic file and return a list of Word entries.
  #
  # The first line is the word-count header and is skipped. The count
  # is not verified against the number of entries, as some dictionaries
  # have different formats. Blank and whitespace-only lines are ignored.
  #
  # @return [Array<Word>] List of word entries
  def read
    reader = FileReader.new(@path, @encoding)
    context = { flag_format: @flag_format, flag_synonyms: @flag_synonyms }

    words = []
    header_seen = false

    reader.each do |_line_no, line|
      unless header_seen
        # First line is the word count, not an entry
        header_seen = true
        next
      end

      # A line holding only whitespace carries no word
      next if line.strip.empty?

      words << Word.from_line(line, context)
    end

    words
  end
end
117
+ end
118
+ end