kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,347 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rubygems'
4
+
5
+ begin
6
+ require 'zip'
7
+ rescue LoadError
8
+ # rubyzip is optional — only needed for ZipReader (.oxt dictionaries).
9
+ # The plain FileReader and StreamReader work without it.
10
+ end
11
+
12
+ module Kotoshu
13
+ module Readers
14
# Base reader class for reading files line by line.
#
# What the reader does to each line:
# - strips leading/trailing whitespace (including "\r" from CRLF files)
# - removes a UTF-8 BOM (byte-order mark) from the start of line 1
# - skips lines that are empty after that processing
#
# Line numbers are physical and 1-based: skipped lines still count.
# The file is opened with +encoding+ as external encoding and transcoded
# to UTF-8 on read.
#
# The underlying File stays open until #close is called. Internal
# iteration (#each, #to_a) and external iteration (#peek, #next) all read
# from that single handle and therefore share its position; call #reset
# before making a second pass.
#
# @example Basic usage
#   reader = FileReader.new('file.aff', 'UTF-8')
#   reader.each do |line_no, line|
#     puts "#{line_no}: #{line}"
#   end
#   reader.close
class FileReader
  # @return [String] The file path
  attr_reader :path

  # @return [String] The encoding
  attr_reader :encoding

  # @return [Integer] Number of the most recently read physical line
  attr_reader :line_no

  # BOM (byte-order mark) for UTF-8
  UTF8_BOM = "\xEF\xBB\xBF".freeze

  # Create a new file reader and open the file.
  #
  # @param path [String] Path to the file
  # @param encoding [String] File encoding (default: 'UTF-8')
  def initialize(path, encoding = 'UTF-8')
    @path = path
    @encoding = encoding
    @line_no = 0
    @file = nil
    @iterator = nil
    reset_io
  end

  # Switch to a different encoding and start over from the first line.
  #
  # @param new_encoding [String] New encoding
  def reset_encoding(new_encoding)
    @encoding = new_encoding
    reset
  end

  # Iterate over lines.
  #
  # @yield [Integer, String] Line number and line content
  # @return [Enumerator] If no block given
  def each
    return enum_for(:each) unless block_given?

    @iterator.each { |line_no, line| yield(line_no, line) }
  end

  # Get all remaining lines as an array.
  #
  # @return [Array<Array<Integer, String>>] Array of [line_no, line] pairs
  def to_a
    @iterator.to_a
  end

  # Check if there are more lines.
  #
  # @return [Boolean] True if there are more lines
  def has_next?
    peek
    true
  rescue StopIteration
    false
  end

  # Peek at next line without consuming it.
  #
  # @return [Array<Integer, String>] Next line number and content
  # @raise [StopIteration] When no line is left
  def peek
    @iterator.peek
  end

  # Get next line.
  #
  # @return [Array<Integer, String>] Line number and content
  # @raise [StopIteration] When no line is left
  def next
    @iterator.next
  end

  # Reset the reader to the beginning.
  #
  # The current handle is closed before a new one is opened, so repeated
  # resets do not accumulate open file descriptors.
  def reset
    close
    @line_no = 0
    reset_io
  end

  # Close the file. Safe to call more than once.
  def close
    @file&.close
    @file = nil
  end

  private

  # (Re)open the source and build a fresh line iterator.
  # Subclasses override this to read from something other than a file.
  def reset_io
    @file = File.open(@path, "r:#{@encoding}:utf-8")
    @iterator = read_lines.lazy
  end

  # Read lines from the file.
  #
  # @yield [Array<Integer, String>] [line_no, line] for every kept line
  # @return [Enumerator] Enumerator of [line_no, line] pairs if no block given
  # @raise [IOError] If the reader has been closed
  def read_lines
    return enum_for(:read_lines) unless block_given?
    raise IOError, "Reader is closed: #{@path}" unless @file

    @file.each_line do |raw|
      @line_no += 1
      line = clean_line(raw)
      yield [@line_no, line] if line
    end
  end

  # Normalise one raw line; relies on @line_no already being set for it.
  #
  # @param raw [String] Line as read from the file
  # @return [String, nil] Cleaned line, or nil when it should be skipped
  def clean_line(raw)
    line = raw.strip
    # A BOM can only appear at the very start of the file.
    line = line.delete_prefix(UTF8_BOM).strip if @line_no == 1
    line unless line.empty?
  end
end
150
+
151
# Reader that takes its lines from an in-memory string instead of a file.
#
# Handy in tests, or whenever the content has already been loaded.
# Line cleanup follows the same rules as the file-based reader: lines are
# stripped, a UTF-8 BOM at the start of line 1 is dropped, and empty lines
# are skipped while still counting towards the line number.
class StringReader < FileReader
  # Create a new string reader.
  #
  # @param content [String] The content to read
  # @param encoding [String] Encoding (default: 'UTF-8')
  def initialize(content, encoding = 'UTF-8')
    @content = content
    # limit -1 keeps trailing empty fields so physical line counts are preserved
    @lines = content.split("\n", -1)
    @index = 0
    super(nil, encoding)
  end

  private

  # Rewind to the first line and build a fresh iterator (no file is opened).
  def reset_io
    @line_no = 0
    @index = 0
    @iterator = read_lines_iterator
  end

  # Build the enumerator that walks @lines from the current @index.
  #
  # @return [Enumerator] Enumerator of [line_no, line] pairs
  def read_lines_iterator
    Enumerator.new do |out|
      until @index >= @lines.length
        @line_no += 1
        text = @lines[@index].strip
        @index += 1

        # Only the very first line can carry a byte-order mark.
        if @line_no == 1 && text.start_with?(UTF8_BOM)
          text = text[UTF8_BOM.length..].to_s.strip
        end

        out << [@line_no, text] unless text.empty?
      end
    end
  end
end
198
+
199
# Zip reader for reading a text file stored inside a zip archive.
#
# Intended for dictionaries packaged as OpenOffice/LibreOffice
# extensions (.oxt). The whole entry is read into memory, decoded from
# +encoding+ to UTF-8, and then served line by line with the same rules
# as the file-based reader: lines are stripped, a UTF-8 BOM at the start
# of line 1 is removed, empty lines are skipped, and line numbers are
# physical (1-based).
#
# The zipfile object only needs to respond to +find_entry(path)+, and the
# entry to +get_input_stream+ returning an object with +read+.
#
# @example Reading from a zip archive
#   zip = Zip::File.open('dictionary.oxt')
#   reader = ZipReader.new(zip, 'en_US.aff', 'UTF-8')
#   reader.each do |line_no, line|
#     puts "#{line_no}: #{line}"
#   end
class ZipReader
  # @return [Zip::File] The zip file object
  attr_reader :zipfile

  # @return [String] The entry path within the zip
  attr_reader :entry_path

  # @return [String] The encoding the entry's bytes are stored in
  attr_reader :encoding

  # @return [Integer] Number of the most recently visited physical line
  attr_reader :line_no

  # BOM (byte-order mark) for UTF-8
  UTF8_BOM = "\xEF\xBB\xBF".freeze

  # Create a new zip reader and load the entry.
  #
  # @param zipfile [Zip::File] The zip file object
  # @param entry_path [String] Path to the entry within the zip
  # @param encoding [String] File encoding (default: 'UTF-8')
  # @raise [IOError] If the entry does not exist in the archive
  def initialize(zipfile, entry_path, encoding = 'UTF-8')
    @zipfile = zipfile
    @entry_path = entry_path
    @encoding = encoding
    @line_no = 0
    @entry = nil
    @iterator = nil
    reset_io
  end

  # Switch to a different encoding and re-read the zip entry.
  #
  # @param new_encoding [String] New encoding
  def reset_encoding(new_encoding)
    @encoding = new_encoding
    release_entry
    reset_io
  end

  # Iterate over lines. Every call starts again from the first line.
  #
  # @yield [Integer, String] Line number and line content
  # @return [Enumerator] If no block given
  def each
    return enum_for(:each) unless block_given?

    @iterator.each { |line_no, line| yield(line_no, line) }
  end

  # Get all lines as an array.
  #
  # @return [Array<Array<Integer, String>>] Array of [line_no, line] pairs
  def to_a
    @iterator.to_a
  end

  # Check if there are more lines.
  #
  # @return [Boolean] True if there are more lines
  def has_next?
    peek
    true
  rescue StopIteration
    false
  end

  # Peek at next line without consuming it.
  #
  # @return [Array<Integer, String>] Next line number and content
  # @raise [StopIteration] When no line is left
  def peek
    @iterator.peek
  end

  # Get next line.
  #
  # @return [Array<Integer, String>] Line number and content
  # @raise [StopIteration] When no line is left
  def next
    @iterator.next
  end

  # Reset the reader to the beginning (re-reads the entry).
  def reset
    @line_no = 0
    reset_io
  end

  # Release the zip entry. The archive itself is left open; it belongs
  # to the caller.
  def close
    release_entry
  end

  private

  # Drop the reference to the current entry. Entry objects are only
  # closed when they actually offer #close, so entry types without that
  # method do not raise NoMethodError here.
  def release_entry
    @entry.close if @entry.respond_to?(:close)
    @entry = nil
  end

  # Locate the entry, load its content and build a fresh line iterator.
  #
  # @raise [IOError] If the entry does not exist in the archive
  def reset_io
    @entry = @zipfile.find_entry(@entry_path)
    raise IOError, "Entry not found in zip: #{@entry_path}" unless @entry

    @lines = decode(read_entry).split("\n", -1)
    @line_no = 0
    @iterator = read_lines_from_zip.lazy
  end

  # Read the raw bytes of the entry, closing the stream afterwards.
  #
  # @return [String] Raw entry content
  def read_entry
    stream = @entry.get_input_stream
    stream.read
  ensure
    stream.close if stream.respond_to?(:close)
  end

  # Interpret +raw+ as text in @encoding and convert it to UTF-8.
  #
  # The bytes come out of the archive without a meaningful encoding tag,
  # so they must be tagged with the source encoding first; transcoding
  # the untagged bytes directly would replace every non-ASCII byte.
  # Undecodable bytes become the replacement character.
  #
  # @param raw [String, nil] Raw entry content
  # @return [String] UTF-8 text
  def decode(raw)
    tagged = raw.to_s.dup.force_encoding(@encoding)
    tagged.encode(Encoding::UTF_8, invalid: :replace, undef: :replace).scrub
  end

  # Read lines from the zip entry.
  #
  # Line numbers come from the position in @lines rather than from a
  # running counter, so every pass (internal or external) reports the
  # same numbers.
  #
  # @yield [Array<Integer, String>] [line_no, line] for every kept line
  # @return [Enumerator] Enumerator of [line_no, line] pairs if no block given
  def read_lines_from_zip
    return enum_for(:read_lines_from_zip) unless block_given?

    @lines.each.with_index(1) do |raw, number|
      @line_no = number
      line = raw.strip
      # A BOM can only appear at the very start of the content.
      line = line.delete_prefix(UTF8_BOM).strip if number == 1
      yield [number, line] unless line.empty?
    end
  end
end
346
+ end
347
+ end
@@ -0,0 +1,299 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../algorithms/lookup'
4
+ require_relative '../algorithms/capitalization'
5
+ require_relative 'aff_reader'
6
+ require_relative 'dic_reader'
7
+ require_relative 'condition_checker'
8
+
9
+ module Kotoshu
10
+ module Readers
11
# Builder for creating Lookup::Lookuper instances from Hunspell data.
#
# This class can either read from files or accept pre-read aff/dic data.
#
# @example Building a lookuper from files
#   builder = LookupBuilder.new('en_US.aff', 'en_US.dic')
#   lookuper = builder.build
#
# @example Building a lookuper from pre-read data
#   aff_reader = AffReader.new('en_US.aff')
#   aff_data = aff_reader.read
#   dic_reader = DicReader.new('en_US.dic')
#   words = dic_reader.read
#   builder = LookupBuilder.from_data(aff_data, words)
#   lookuper = builder.build
class LookupBuilder
  # Directives whose raw value is handed to the Lookuper unchanged, under
  # the same name as a Symbol key.
  PASSTHROUGH_DIRECTIVES = %w[
    COMPOUNDMIN COMPOUNDWORDMAX COMPOUNDBEGIN COMPOUNDMIDDLE COMPOUNDEND
    COMPOUNDFLAG COMPOUNDPERMITFLAG COMPOUNDFORBIDFLAG
    ONLYINCOMPOUND COMPLEXPREFIXES FORCEUCASE
    FORBIDDENWORD NOSUGGEST KEEPCASE NEEDAFFIX CIRCUMFIX WARN
    CHECKCOMPOUNDCASE CHECKCOMPOUNDDUP CHECKCOMPOUNDREP CHECKCOMPOUNDTRIPLE
    SIMPLIFIEDTRIPLE
    ICONV OCONV CHECKSHARPS
  ].freeze
  private_constant :PASSTHROUGH_DIRECTIVES

  attr_reader :aff_path, :dic_path, :encoding, :aff_data, :words, :script

  # Create a new LookupBuilder from file paths.
  #
  # @param aff_path [String] Path to the .aff file
  # @param dic_path [String] Path to the .dic file
  # @param encoding [String] File encoding (default: 'UTF-8')
  # @param script [Symbol] The script type for condition checking (default: :latin)
  def initialize(aff_path, dic_path, encoding: 'UTF-8', script: :latin)
    @aff_path = aff_path
    @dic_path = dic_path
    @encoding = encoding
    @script = script
    @aff_data = nil
    @words = nil
  end

  # Create a new LookupBuilder from pre-read data.
  #
  # @param aff_data [Hash] Raw aff data from AffReader
  # @param words [Array<Word>] Word entries from DicReader
  # @param script [Symbol] The script type for condition checking (default: :latin)
  # @return [LookupBuilder] A new builder instance
  def self.from_data(aff_data, words, script: :latin)
    builder = new(nil, nil, script: script)
    builder.instance_variable_set(:@aff_data, aff_data)
    builder.instance_variable_set(:@words, words)
    builder
  end

  # Build the Lookuper instance.
  #
  # Files are only read when the corresponding data was not supplied
  # through .from_data.
  #
  # @return [Algorithms::Lookup::Lookuper] The lookuper instance
  def build
    raw_aff = @aff_data || read_aff_data
    entries = @words || read_dic_data(raw_aff)

    aff = build_aff_structure(raw_aff)
    dic = build_dic_structure(entries)

    Algorithms::Lookup::Lookuper.new(aff, dic)
  end

  private

  # Read aff data from file.
  #
  # @return [Hash] Raw aff data
  def read_aff_data
    AffReader.new(@aff_path, encoding: @encoding).read
  end

  # Read dic data from file.
  #
  # @param aff_data [Hash] Aff data for flag format info
  # @return [Array<Word>] Word entries
  def read_dic_data(aff_data)
    DicReader.new(@dic_path,
                  encoding: @encoding,
                  flag_format: aff_data['FLAG'] || 'short',
                  flag_synonyms: aff_data['AF'] || {}).read
  end

  # Build the aff data structure for Lookuper.
  #
  # @param aff_data [Hash] Raw aff data from AffReader
  # @return [Hash] Aff structure for Lookuper
  def build_aff_structure(aff_data)
    # Capitalization handler - default to standard Casing.
    # Could be extended to use TurkicCasing or GermanCasing based on LANG.
    aff = { casing: Algorithms::Capitalization::Casing.new }

    # Suffixes are keyed by their LAST character (the first character of
    # the reversed suffix) because the lookup code reverses the word to
    # check suffixes; prefixes are keyed by their first character.
    aff[:suffixes_index] = build_affix_index(aff_data['SFX']) { |affix| affix.add[-1] }
    aff[:prefixes_index] = build_affix_index(aff_data['PFX']) { |affix| affix.add[0] }

    PASSTHROUGH_DIRECTIVES.each { |name| aff[name.to_sym] = aff_data[name] }

    # Directives that need conversion or a default.
    aff[:COMPOUNDRULE] = build_compound_rules(aff_data['COMPOUNDRULE'])
    aff[:CHECKCOMPOUNDPATTERN] = build_compound_patterns(aff_data['CHECKCOMPOUNDPATTERN'])
    aff[:IGNORE] = aff_data['IGNORE']&.chars || []
    aff[:BREAK] = build_break_patterns(aff_data['BREAK'])
    aff[:REP] = aff_data['REP'] || []
    aff[:MAP] = aff_data['MAP'] || []

    aff
  end

  # Group affix hashes by an index character.
  #
  # @param groups [Hash, nil] flag => list of affixes; nil is treated as empty
  # @yield [affix] Must return the index character for the affix
  #   (nil for an empty affix, which is filed under '')
  # @return [Hash{String => Array<Hash>}] Affix hashes keyed by index character
  def build_affix_index(groups)
    index = {}
    (groups || {}).each do |_flag, affixes|
      affixes.each do |affix|
        key = yield(affix) || ''
        (index[key] ||= []) << build_affix_hash(affix, script: @script || :latin)
      end
    end
    index
  end

  # Build the dic data structure for Lookuper.
  #
  # @param words [Array<Word>] List of word entries
  # @return [Hash] Dic structure for Lookuper with two callables:
  #   :homonyms (word -> entries) and
  #   :has_flag (word, flag, for_all: false -> Boolean)
  def build_dic_structure(words)
    # Plain hash without a default block: looking up an unknown word must
    # not insert it, otherwise the index grows with every miss.
    word_index = {}
    words.each do |word|
      (word_index[word.stem] ||= []) << { stem: word.stem, flags: word.flags.to_a }
    end

    homonyms = ->(word) { word_index.fetch(word) { [] } }

    # The flag is tested against each homonym's flag LIST (not against the
    # individual flags): for_all requires every homonym to carry it,
    # otherwise one homonym is enough.
    has_flag = lambda do |word, flag, for_all: false|
      flag_lists = homonyms.call(word).map { |entry| entry[:flags] }
      if for_all
        flag_lists.all? { |flags| flags.include?(flag) }
      else
        flag_lists.any? { |flags| flags.include?(flag) }
      end
    end

    { homonyms: homonyms, has_flag: has_flag }
  end

  # Build an affix hash for Lookuper.
  #
  # @param affix [Affix] The affix object
  # @param script [Symbol] The script type for condition checking
  # @return [Hash] Affix hash for Lookuper
  def build_affix_hash(affix, script: :latin)
    {
      flag: affix.flag,
      crossproduct: affix.crossproduct,
      strip: affix.strip,
      affix: affix.add,
      condition_checker: compile_condition_matcher(affix.condition, script: script),
      affix_data: build_affix_transform(affix.strip, affix.add, type: affix.type),
      flags: affix.flags.to_a
    }
  end

  # Compile a condition checker.
  #
  # @param condition [String] Condition string from .aff file
  # @param script [Symbol] The script type (:latin, :arabic, etc.)
  # @return [ConditionChecker, nil] Compiled checker or nil
  def compile_condition_matcher(condition, script: :latin)
    return nil if condition.nil? || condition.empty?

    ConditionChecker.compile(condition, script: script)
  end

  # Build affix transformation data.
  #
  # @param strip [String, nil] Characters to strip (nil counts as '')
  # @param add [String, nil] Characters to add (nil counts as '')
  # @param type [Symbol] :prefix or :suffix
  # @return [Hash, nil] Hash with affix data for transformation, or nil
  #   when the affix neither strips nor adds anything
  def build_affix_transform(strip, add, type:)
    # Normalise before testing so a nil value cannot raise on #empty?.
    strip = strip.to_s
    add = add.to_s
    return nil if strip.empty? && add.empty?

    { add: add, strip: strip, type: type }
  end

  # Build compound rules array.
  #
  # @param rules [Array<CompoundRule>] List of compound rules
  # @return [Array<Hash>] Array of compound rule hashes
  def build_compound_rules(rules)
    return [] if rules.nil? || rules.empty?

    rules.map do |rule|
      {
        text: rule.text,
        flags: rule.flags,
        full_match: ->(flag_sets) { rule.fullmatch(flag_sets) },
        partial_match: ->(flag_sets) { rule.flags.intersect?(flag_sets.flatten.to_set) }
      }
    end
  end

  # Build compound patterns array.
  #
  # @param patterns [Array<CompoundPattern>] List of compound patterns
  # @return [Array<Hash>] Array of compound pattern hashes
  def build_compound_patterns(patterns)
    return [] if patterns.nil? || patterns.empty?

    patterns.map do |pattern|
      { match: ->(left, right) { pattern.match?(left, right) } }
    end
  end

  # Build break patterns array.
  #
  # @param break_patterns [Array<BreakPattern>] List of break patterns
  # @return [Array<Hash>] Array of break pattern hashes
  def build_break_patterns(break_patterns)
    return [] if break_patterns.nil? || break_patterns.empty?

    break_patterns.map do |bp|
      { pattern: bp.pattern, matcher: bp.matcher }
    end
  end
end
298
+ end
299
+ end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'file_reader'
4
+ require_relative 'aff_data'
5
+ require_relative 'aff_reader'
6
+ require_relative 'dic_reader'
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ # Namespace for the Hunspell readers (dictionary and affix files); declared empty here, the readers themselves are loaded by the require that follows.
5
+ module Readers
6
+ end
7
+ end
8
+
9
+ require_relative 'readers/readers'
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
ResourceBundle = Struct.new(
  :language, :dictionary, :frequency, :model, :rules, :cached, :source_urls,
  keyword_init: true
) do
  # @return [Boolean] true when +cached+ is truthy, false otherwise
  #   (always a real boolean, never the raw field value)
  def cached?
    !!cached
  end

  # Defines has_frequency?, has_model? and has_rules?: each reports
  # whether the matching optional field holds anything other than nil.
  %i[frequency model rules].each do |resource|
    define_method(:"has_#{resource}?") { !self[resource].nil? }
  end
end
30
+ end