kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'rubygems'
|
|
4
|
+
|
|
5
|
+
begin
|
|
6
|
+
require 'zip'
|
|
7
|
+
rescue LoadError
|
|
8
|
+
# rubyzip is optional — only needed for ZipReader (.oxt dictionaries).
|
|
9
|
+
# The plain FileReader and StringReader work without it.
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
module Kotoshu
|
|
13
|
+
module Readers
|
|
14
|
+
# Base reader class for reading files line by line.
#
# This class provides:
# - Line-by-line reading with line numbers
# - BOM (byte-order mark) handling on the first line
# - Whitespace trimming of every line
# - Empty line filtering (skipped lines still advance the line counter)
#
# The file is opened in the constructor and stays open until {#close},
# {#reset} or {#reset_encoding} is called, so callers should `close` the
# reader when they are done with it.
#
# @example Basic usage
#   reader = FileReader.new('file.aff', 'UTF-8')
#   reader.each do |line_no, line|
#     puts "#{line_no}: #{line}"
#   end
#   reader.close
class FileReader
  # @return [String] The file path
  attr_reader :path

  # @return [String] The encoding
  attr_reader :encoding

  # @return [Integer] Current line number
  attr_reader :line_no

  # BOM (byte-order mark) for UTF-8. Written as a code point so the
  # constant is a one-character UTF-8 string.
  UTF8_BOM = "\uFEFF".freeze

  # Create a new file reader and open the file.
  #
  # @param path [String] Path to the file
  # @param encoding [String] File encoding (default: 'UTF-8')
  def initialize(path, encoding = 'UTF-8')
    @path = path
    @encoding = encoding
    @line_no = 0
    @file = nil
    @iterator = nil
    reset_io
  end

  # Switch to a different encoding and restart from the first line.
  #
  # @param new_encoding [String] New encoding
  def reset_encoding(new_encoding)
    @encoding = new_encoding
    reset
  end

  # Iterate over lines.
  #
  # @yield [Integer, String] Line number and line content
  # @return [Enumerator] If no block given
  def each
    return enum_for(:each) unless block_given?

    @iterator.each { |line_no, line| yield(line_no, line) }
  end

  # Get all lines as an array.
  #
  # @return [Array<Array<Integer, String>>] Array of [line_no, line] pairs
  def to_a
    @iterator.to_a
  end

  # Check if there are more lines.
  #
  # @return [Boolean] True if there are more lines
  def has_next?
    peek
    true
  rescue StopIteration
    false
  end

  # Peek at next line without consuming it.
  #
  # @return [Array<Integer, String>] Next line number and content
  # @raise [StopIteration] When no lines remain
  def peek
    @iterator.peek
  end

  # Get next line.
  #
  # @return [Array<Integer, String>] Line number and content
  # @raise [StopIteration] When no lines remain
  def next
    @iterator.next
  end

  # Reset the reader to the beginning.
  #
  # The current handle is closed before a new one is opened so that
  # repeated resets do not accumulate open file descriptors.
  def reset
    @line_no = 0
    close
    reset_io
  end

  # Close the file. Safe to call more than once.
  def close
    @file&.close
    @file = nil
  end

  private

  # Open the file (transcoding from @encoding to UTF-8) and build a fresh
  # lazy iterator over it.
  def reset_io
    @file = File.open(@path, "r:#{@encoding}:utf-8")
    @iterator = read_lines.lazy
  end

  # Read lines from the file.
  #
  # @yield [Array<Integer, String>] [line_no, line] pairs for non-empty lines
  # @return [Enumerator] Enumerator of [line_no, line] pairs if no block given
  # @raise [IOError] If the reader has been closed
  def read_lines
    return enum_for(:read_lines) unless block_given?
    raise IOError, "reader is closed: #{@path}" unless @file

    @file.each_line do |raw|
      @line_no += 1
      line = clean_line(raw)
      yield [@line_no, line] if line
    end
  end

  # Trim a raw line and drop a leading BOM when it is the first line.
  #
  # @param raw [String] Line as read from the file
  # @return [String, nil] Cleaned line, or nil when nothing is left
  def clean_line(raw)
    line = raw.strip
    # String#strip does not treat U+FEFF as whitespace, so remove it
    # explicitly and trim again.
    line = line.delete_prefix(UTF8_BOM).strip if @line_no == 1
    line.empty? ? nil : line
  end
end
|
|
150
|
+
|
|
151
|
+
# Reader over content that is already held in a String.
#
# Offers the same iteration interface as FileReader without touching the
# file system, which makes it handy in tests.
class StringReader < FileReader
  # Create a new string reader.
  #
  # @param content [String] The content to read
  # @param encoding [String] Encoding (default: 'UTF-8')
  def initialize(content, encoding = 'UTF-8')
    @content = content
    # Limit -1 keeps trailing empty segments so line numbers stay aligned.
    @lines = content.split("\n", -1)
    @index = 0
    super(nil, encoding)
  end

  private

  # Rewind to the first line and rebuild the iterator; no file is opened.
  def reset_io
    @line_no = 0
    @index = 0
    @iterator = read_lines_iterator
  end

  # Build an enumerator that walks @lines from the current @index.
  #
  # @return [Enumerator] Enumerator of [line_no, line] pairs
  def read_lines_iterator
    Enumerator.new do |yielder|
      until @index >= @lines.length
        @line_no += 1
        text = @lines[@index].strip
        @index += 1

        text = without_bom(text) unless text.empty?
        yielder << [@line_no, text] unless text.nil? || text.empty?
      end
    end
  end

  # Drop a leading UTF-8 BOM from the first line and trim what remains.
  #
  # @param text [String] Stripped, non-empty line
  # @return [String, nil] The line with any first-line BOM removed
  def without_bom(text)
    return text unless @line_no == 1 && text.start_with?(UTF8_BOM)

    remainder = text[UTF8_BOM.length..]
    remainder ? remainder.strip : remainder
  end
end
|
|
198
|
+
|
|
199
|
+
# Zip reader for reading files from zip archives.
#
# This class reads a single entry from within a zip archive, such as an
# OpenOffice/LibreOffice extension (.oxt). The entry is read fully into
# memory and decoded to UTF-8 when the reader is created or reset; lines
# are then served from that in-memory copy.
#
# @example Reading from a zip archive
#   zip = Zip::File.open('dictionary.oxt')
#   reader = ZipReader.new(zip, 'en_US.aff', 'UTF-8')
#   reader.each do |line_no, line|
#     puts "#{line_no}: #{line}"
#   end
class ZipReader
  # @return [Zip::File] The zip file object
  attr_reader :zipfile

  # @return [String] The entry path within the zip
  attr_reader :entry_path

  # @return [String] The encoding
  attr_reader :encoding

  # @return [Integer] Current line number
  attr_reader :line_no

  # BOM (byte-order mark) for UTF-8. Written as a code point so the
  # constant is a one-character UTF-8 string.
  UTF8_BOM = "\uFEFF".freeze

  # Create a new zip reader and load the entry.
  #
  # @param zipfile [Zip::File] The zip file object
  # @param entry_path [String] Path to the entry within the zip
  # @param encoding [String] Encoding of the entry's bytes (default: 'UTF-8')
  # @raise [IOError] If the entry does not exist in the archive
  def initialize(zipfile, entry_path, encoding = 'UTF-8')
    @zipfile = zipfile
    @entry_path = entry_path
    @encoding = encoding
    @line_no = 0
    @entry = nil
    @iterator = nil
    reset_io
  end

  # Switch to a different encoding and re-read the zip entry.
  #
  # @param new_encoding [String] New encoding
  def reset_encoding(new_encoding)
    @encoding = new_encoding
    reset
  end

  # Iterate over lines.
  #
  # @yield [Integer, String] Line number and line content
  # @return [Enumerator] If no block given
  def each
    return enum_for(:each) unless block_given?

    @iterator.each { |line_no, line| yield(line_no, line) }
  end

  # Get all lines as an array.
  #
  # @return [Array<Array<Integer, String>>] Array of [line_no, line] pairs
  def to_a
    @iterator.to_a
  end

  # Check if there are more lines.
  #
  # @return [Boolean] True if there are more lines
  def has_next?
    peek
    true
  rescue StopIteration
    false
  end

  # Peek at next line without consuming it.
  #
  # @return [Array<Integer, String>] Next line number and content
  # @raise [StopIteration] When no lines remain
  def peek
    @iterator.peek
  end

  # Get next line.
  #
  # @return [Array<Integer, String>] Line number and content
  # @raise [StopIteration] When no lines remain
  def next
    @iterator.next
  end

  # Reset the reader to the beginning (re-reads the entry).
  def reset
    @line_no = 0
    reset_io
  end

  # Release the reference to the zip entry.
  #
  # The entry's input stream is already closed by the time reset_io
  # returns, so there is no open handle to release here. The entry object
  # itself is not closed: rubyzip's Zip::Entry does not appear to define
  # #close, in which case calling it would raise NoMethodError.
  def close
    @entry = nil
  end

  private

  # Locate the entry, read it fully, decode it and build a fresh iterator.
  #
  # @raise [IOError] If the entry does not exist in the archive
  def reset_io
    @entry = @zipfile.find_entry(@entry_path)
    raise IOError, "Entry not found in zip: #{@entry_path}" unless @entry

    # Block form so the input stream is closed as soon as it has been read.
    raw = @entry.get_input_stream(&:read).to_s

    @lines = decode(raw).split("\n", -1)
    @line_no = 0
    @iterator = read_lines_from_zip.lazy
  end

  # Interpret the raw bytes as @encoding and convert them to UTF-8.
  #
  # Zip streams hand back binary strings; transcoding them directly would
  # treat every non-ASCII byte as undefined, so the bytes are tagged with
  # their real encoding first. Malformed sequences are replaced rather
  # than raised on.
  #
  # @param raw [String] Bytes read from the zip entry
  # @return [String] UTF-8 text
  def decode(raw)
    raw.dup
       .force_encoding(@encoding)
       .encode('UTF-8', invalid: :replace, undef: :replace)
       .scrub
  end

  # Read lines from the decoded zip entry.
  #
  # @yield [Array<Integer, String>] [line_no, line] pairs for non-empty lines
  # @return [Enumerator] Enumerator of [line_no, line] pairs if no block given
  def read_lines_from_zip
    return enum_for(:read_lines_from_zip) unless block_given?

    @lines.each do |raw|
      @line_no += 1
      line = raw.strip
      # String#strip does not treat U+FEFF as whitespace, so remove it
      # explicitly on the first line and trim again.
      line = line.delete_prefix(UTF8_BOM).strip if @line_no == 1
      next if line.empty?

      yield [@line_no, line]
    end
  end
end
|
|
346
|
+
end
|
|
347
|
+
end
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../algorithms/lookup'
|
|
4
|
+
require_relative '../algorithms/capitalization'
|
|
5
|
+
require_relative 'aff_reader'
|
|
6
|
+
require_relative 'dic_reader'
|
|
7
|
+
require_relative 'condition_checker'
|
|
8
|
+
|
|
9
|
+
module Kotoshu
|
|
10
|
+
module Readers
|
|
11
|
+
# Builder for creating Lookup::Lookuper instances from Hunspell data.
#
# This class can either read from files or accept pre-read aff/dic data.
#
# @example Building a lookuper from files
#   builder = LookupBuilder.new('en_US.aff', 'en_US.dic')
#   lookuper = builder.build
#
# @example Building a lookuper from pre-read data
#   aff_reader = AffReader.new('en_US.aff')
#   aff_data = aff_reader.read
#   dic_reader = DicReader.new('en_US.dic')
#   words = dic_reader.read
#   builder = LookupBuilder.from_data(aff_data, words)
#   lookuper = builder.build
class LookupBuilder
  attr_reader :aff_path, :dic_path, :encoding, :aff_data, :words, :script

  # Create a new LookupBuilder from file paths.
  #
  # @param aff_path [String] Path to the .aff file
  # @param dic_path [String] Path to the .dic file
  # @param encoding [String] File encoding (default: 'UTF-8')
  # @param script [Symbol] The script type for condition checking (default: :latin)
  def initialize(aff_path, dic_path, encoding: 'UTF-8', script: :latin)
    @aff_path = aff_path
    @dic_path = dic_path
    @encoding = encoding
    @script = script
    @aff_data = nil
    @words = nil
  end

  # Create a new LookupBuilder from pre-read data.
  #
  # @param aff_data [Hash] Raw aff data from AffReader
  # @param words [Array<Word>] Word entries from DicReader
  # @param script [Symbol] The script type for condition checking (default: :latin)
  # @return [LookupBuilder] A new builder instance
  def self.from_data(aff_data, words, script: :latin)
    builder = new(nil, nil, script: script)
    builder.instance_variable_set(:@aff_data, aff_data)
    builder.instance_variable_set(:@words, words)
    builder
  end

  # Build the Lookuper instance.
  #
  # Files are only read for whichever of aff data / words was not supplied
  # up front.
  #
  # @return [Algorithms::Lookup::Lookuper] The lookuper instance
  def build
    aff_data_to_use = @aff_data || read_aff_data
    words_to_use = @words || read_dic_data(aff_data_to_use)

    aff = build_aff_structure(aff_data_to_use)
    dic = build_dic_structure(words_to_use)

    Algorithms::Lookup::Lookuper.new(aff, dic)
  end

  private

  # Read aff data from file.
  #
  # @return [Hash] Raw aff data
  def read_aff_data
    AffReader.new(@aff_path, encoding: @encoding).read
  end

  # Read dic data from file.
  #
  # @param aff_data [Hash] Aff data for flag format info
  # @return [Array<Word>] Word entries
  def read_dic_data(aff_data)
    DicReader.new(@dic_path,
                  encoding: @encoding,
                  flag_format: aff_data['FLAG'] || 'short',
                  flag_synonyms: aff_data['AF'] || {}).read
  end

  # Build the aff data structure for Lookuper.
  #
  # @param aff_data [Hash] Raw aff data from AffReader
  # @return [Hash] Aff structure for Lookuper
  def build_aff_structure(aff_data)
    aff = {}
    # Copies directives through unchanged, keyed by symbol.
    copy = ->(*names) { names.each { |name| aff[name.to_sym] = aff_data[name] } }

    # Capitalization handler - default to standard Casing
    # Could be extended to use TurkicCasing or GermanCasing based on LANG
    aff[:casing] = Algorithms::Capitalization::Casing.new

    # Suffixes are indexed by the first char of the REVERSED suffix because
    # the lookup code reverses the word to check suffixes.
    aff[:suffixes_index] = index_affixes(aff_data['SFX']) { |affix| affix.add.reverse[0] }
    # Prefixes are indexed by their first character.
    aff[:prefixes_index] = index_affixes(aff_data['PFX']) { |affix| affix.add[0] }

    # Single-value flags
    copy.call('COMPOUNDMIN', 'COMPOUNDWORDMAX', 'COMPOUNDBEGIN', 'COMPOUNDMIDDLE',
              'COMPOUNDEND', 'COMPOUNDFLAG', 'COMPOUNDPERMITFLAG', 'COMPOUNDFORBIDFLAG')
    aff[:COMPOUNDRULE] = build_compound_rules(aff_data['COMPOUNDRULE'])
    copy.call('ONLYINCOMPOUND', 'COMPLEXPREFIXES', 'FORCEUCASE')

    # Special flags
    copy.call('FORBIDDENWORD', 'NOSUGGEST', 'KEEPCASE', 'NEEDAFFIX', 'CIRCUMFIX', 'WARN')

    # Compound checking flags
    copy.call('CHECKCOMPOUNDCASE', 'CHECKCOMPOUNDDUP', 'CHECKCOMPOUNDREP', 'CHECKCOMPOUNDTRIPLE')
    aff[:CHECKCOMPOUNDPATTERN] = build_compound_patterns(aff_data['CHECKCOMPOUNDPATTERN'])
    copy.call('SIMPLIFIEDTRIPLE')

    # Other directives
    aff[:IGNORE] = aff_data['IGNORE']&.chars || []
    aff[:BREAK] = build_break_patterns(aff_data['BREAK'])
    copy.call('ICONV', 'OCONV')
    aff[:REP] = aff_data['REP'] || []
    aff[:MAP] = aff_data['MAP'] || []
    copy.call('CHECKSHARPS')

    aff
  end

  # Group affix hashes by a one-character key.
  #
  # @param affix_table [Hash, nil] Affix lists keyed by flag; nil is
  #   treated as "no affixes of this kind"
  # @yield [affix] Returns the index key for the affix (nil maps to '')
  # @return [Hash{String => Array<Hash>}] Affix hashes grouped by key
  def index_affixes(affix_table)
    index = {}
    (affix_table || {}).each do |_flag, affix_list|
      affix_list.each do |affix|
        key = yield(affix) || ''
        (index[key] ||= []) << build_affix_hash(affix, script: @script || :latin)
      end
    end
    index
  end

  # Build the dic data structure for Lookuper.
  #
  # @param words [Array<Word>] List of word entries
  # @return [Hash] Dic structure with :homonyms and :has_flag callables
  def build_dic_structure(words)
    # Plain hash (no default block): looking up an unknown word must not
    # insert a key, otherwise every miss would grow the index.
    word_index = {}
    words.each do |word|
      (word_index[word.stem] ||= []) << { stem: word.stem, flags: word.flags.to_a }
    end

    homonyms = ->(word) { word_index.fetch(word) { [] } }

    {
      homonyms: homonyms,
      # With for_all: true every homonym must carry the flag; otherwise one
      # is enough. A word with no entries never has a flag.
      has_flag: lambda { |word, flag, for_all: false|
        flag_lists = homonyms.call(word).map { |entry| entry[:flags] }
        next false if flag_lists.empty?

        if for_all
          flag_lists.all? { |flags| flags.include?(flag) }
        else
          flag_lists.any? { |flags| flags.include?(flag) }
        end
      }
    }
  end

  # Build an affix hash for Lookuper.
  #
  # @param affix [Affix] The affix object
  # @param script [Symbol] The script type for condition checking
  # @return [Hash] Affix hash for Lookuper
  def build_affix_hash(affix, script: :latin)
    {
      flag: affix.flag,
      crossproduct: affix.crossproduct,
      strip: affix.strip,
      affix: affix.add,
      condition_checker: compile_condition_matcher(affix.condition, script: script),
      affix_data: build_affix_transform(affix.strip, affix.add, type: affix.type),
      flags: affix.flags.to_a
    }
  end

  # Compile a condition checker.
  #
  # @param condition [String] Condition string from .aff file
  # @param script [Symbol] The script type (:latin, :arabic, etc.)
  # @return [ConditionChecker, nil] Compiled checker, or nil when there is
  #   no condition
  def compile_condition_matcher(condition, script: :latin)
    return nil if condition.nil? || condition.empty?

    ConditionChecker.compile(condition, script: script)
  end

  # Build affix transformation data.
  #
  # @param strip [String, nil] Characters to strip (nil is treated as '')
  # @param add [String, nil] Characters to add (nil is treated as '')
  # @param type [Symbol] :prefix or :suffix
  # @return [Hash, nil] Hash with affix data for transformation, or nil
  #   when the affix neither strips nor adds anything
  def build_affix_transform(strip, add, type:)
    strip = strip.to_s
    add = add.to_s
    return nil if strip.empty? && add.empty?

    { add: add, strip: strip, type: type }
  end

  # Build compound rules array.
  #
  # @param rules [Array<CompoundRule>] List of compound rules
  # @return [Array<Hash>] Array of compound rule hashes
  def build_compound_rules(rules)
    return [] if rules.nil? || rules.empty?

    rules.map do |rule|
      {
        text: rule.text,
        flags: rule.flags,
        full_match: ->(flag_sets) { rule.fullmatch(flag_sets) },
        partial_match: ->(flag_sets) { rule.flags.intersect?(flag_sets.flatten.to_set) }
      }
    end
  end

  # Build compound patterns array.
  #
  # @param patterns [Array<CompoundPattern>] List of compound patterns
  # @return [Array<Hash>] Array of compound pattern hashes
  def build_compound_patterns(patterns)
    return [] if patterns.nil? || patterns.empty?

    patterns.map do |pattern|
      { match: ->(left, right) { pattern.match?(left, right) } }
    end
  end

  # Build break patterns array.
  #
  # @param break_patterns [Array<BreakPattern>] List of break patterns
  # @return [Array<Hash>] Array of break pattern hashes
  def build_break_patterns(break_patterns)
    return [] if break_patterns.nil? || break_patterns.empty?

    break_patterns.map do |bp|
      { pattern: bp.pattern, matcher: bp.matcher }
    end
  end
end
|
|
298
|
+
end
|
|
299
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
# Keyword-initialised value object bundling the resources of one language.
# Members that are not supplied default to nil.
ResourceBundle = Struct.new(
  :language, :dictionary, :frequency, :model, :rules, :cached, :source_urls,
  keyword_init: true
) do
  # @return [Boolean] true when the cached member is truthy
  def cached?
    !!cached
  end

  # Defines has_frequency?, has_model? and has_rules?; each reports
  # whether the corresponding member is set to something other than nil.
  %i[frequency model rules].each do |member|
    define_method(:"has_#{member}?") { !self[member].nil? }
  end
end
|
|
30
|
+
end
|