kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,260 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'zip'
4
+ require_relative 'readers/file_reader'
5
+ require_relative 'readers/aff_reader'
6
+ require_relative 'readers/dic_reader'
7
+ require_relative 'readers/lookup_builder'
8
+ require_relative 'algorithms/lookup'
9
+ require_relative 'algorithms/suggest'
10
+
11
+ module Kotoshu
12
+ # Unified Dictionary interface matching Spylls API.
13
+ #
14
+ # This class provides the main interface to Hunspell dictionaries,
15
+ # supporting loading from files, zip archives, and system paths.
16
+ #
17
+ # @example Loading from files
18
+ # dictionary = Dictionary.from_files('/path/to/dictionary/en_US')
19
+ # dictionary.lookup('spells') # => true
20
+ #
21
+ # @example Loading from zip archive
22
+ # dictionary = Dictionary.from_zip('/path/to/dictionary/en_US.odt')
23
+ #
24
+ # @example Loading from system
25
+ # dictionary = Dictionary.from_system('en_US')
26
+ #
27
+ # @example Getting suggestions
28
+ # dictionary.suggest('spylls').to_a # => ["spells", "spills", ...]
29
+ #
30
+ # @example Accessing algorithms for experimentation
31
+ # dictionary.lookuper.good_forms('building') do |form|
32
+ # puts form
33
+ # end
34
+ class Dictionary
35
+ # System paths to search for Hunspell dictionaries
36
+ PATHES = [
37
+ '/usr/share/hunspell',
38
+ '/usr/share/myspell',
39
+ '/usr/share/myspell/dicts',
40
+ '/Library/Spelling',
41
+ '/opt/openoffice.org/basis3.0/share/dict/ooo',
42
+ '/usr/lib/openoffice.org/basis3.0/share/dict/ooo',
43
+ '/opt/openoffice.org2.4/share/dict/ooo',
44
+ '/usr/lib/openoffice.org2.4/share/dict/ooo',
45
+ '/opt/openoffice.org2.3/share/dict/ooo',
46
+ '/usr/lib/openoffice.org2.3/share/dict/ooo',
47
+ '/opt/openoffice.org2.2/share/dict/ooo',
48
+ '/usr/lib/openoffice.org2.2/share/dict/ooo',
49
+ '/opt/openoffice.org2.1/share/dict/ooo',
50
+ '/usr/lib/openoffice.org2.1/share/dict/ooo',
51
+ '/opt/openoffice.org2.0/share/dict/ooo',
52
+ '/usr/lib/openoffice.org2.0/share/dict/ooo'
53
+ ].freeze
54
+
55
+ # Distributed dictionaries for testing
56
+ DISTRIBUTED = {
57
+ 'en_US' => 'en',
58
+ 'ru' => 'ru',
59
+ 'sv_SE' => 'sv'
60
+ }.freeze
61
+
62
+ # @return [Hash] Aff data structure
63
+ attr_reader :aff
64
+
65
+ # @return [Array<Readers::Word>] Dic data structure
66
+ attr_reader :dic_words
67
+
68
+ # @return [Algorithms::Lookup::Lookuper] Lookuper instance for experimentation
69
+ attr_reader :lookuper
70
+
71
+ # @return [Algorithms::Suggest::Suggester] Suggester instance for experimentation
72
+ attr_reader :suggester
73
+
74
+ # Create a Dictionary from aff and dic data.
75
+ #
76
+ # @param aff [Hash] Aff data structure
77
+ # @param dic_words [Array<Readers::Word>] Dictionary word entries
78
def initialize(aff, dic_words)
  # Keep the raw parsed data reachable through the attr_readers.
  @aff, @dic_words = aff, dic_words

  # The lookuper is built first because the suggester receives it.
  lookup_builder = Readers::LookupBuilder.from_data(aff, dic_words)
  @lookuper = lookup_builder.build

  # The suggester gets the callable-based dic structure from build_dic_structure.
  suggester_dic = build_dic_structure(dic_words)
  @suggester = Algorithms::Suggest::Suggester.new(
    aff: aff,
    dic: suggester_dic,
    lookuper: @lookuper
  )
end
90
+
91
+ # Load dictionary from file path.
92
+ #
93
+ # The path should be the base name without extension, e.g., 'en_US'
94
+ # for files 'en_US.aff' and 'en_US.dic'.
95
+ #
96
+ # @param path [String] Base path to dictionary files (without extension)
97
+ # @return [Dictionary] The loaded dictionary
98
+ #
99
+ # @example
100
+ # Dictionary.from_files('en_US')
101
def self.from_files(path)
  # A known distributed name falls back to the bundled copy, but only when
  # nothing exists at the given path and the bundled .aff is actually there.
  if DISTRIBUTED.key?(path) && !File.exist?("#{path}.aff")
    bundled = File.join(File.dirname(__FILE__), '../../data', DISTRIBUTED[path], path)
    path = bundled if File.exist?("#{bundled}.aff")
  end

  aff_path, dic_path = %w[aff dic].map { |ext| "#{path}.#{ext}" }

  # Both halves of the pair must exist; the .aff file is checked first.
  [aff_path, dic_path].each do |required|
    raise ArgumentError, "Dictionary file not found: #{required}" unless File.exist?(required)
  end

  aff_data = Readers::AffReader.new(aff_path).read

  # The .dic reader is configured from the FLAG / AF values of the .aff data.
  dic_words = Readers::DicReader.new(
    dic_path,
    flag_format: aff_data['FLAG'] || 'short',
    flag_synonyms: aff_data['AF'] || {}
  ).read

  new(aff_data, dic_words)
end
128
+
129
+ # Load dictionary from zip archive.
130
+ #
131
+ # Supports OpenOffice/LibreOffice dictionary extensions (.odt, .oxt)
132
+ # and Firefox/Thunderbird dictionary extensions (.xpi).
133
+ #
134
+ # @param zip_path [String] Path to zip archive
135
+ # @return [Dictionary] The loaded dictionary
136
+ #
137
+ # @example
138
+ # Dictionary.from_zip('en_US.odt')
139
def self.from_zip(zip_path)
  # Loaded here because only archive loading needs a scratch directory.
  require 'tmpdir'

  Zip::File.open(zip_path) do |zipfile|
    aff_entry = nil
    dic_entry = nil

    # Exactly one .aff and one .dic entry are accepted; anything else is
    # ambiguous and rejected.
    zipfile.each do |entry|
      if entry.name.end_with?('.aff')
        raise ArgumentError, "Multiple .aff files found in zip" if aff_entry
        aff_entry = entry
      elsif entry.name.end_with?('.dic')
        raise ArgumentError, "Multiple .dic files found in zip" if dic_entry
        dic_entry = entry
      end
    end

    raise ArgumentError, "No .aff file found in zip" unless aff_entry
    raise ArgumentError, "No .dic file found in zip" unless dic_entry

    # The readers are constructed from filesystem paths, so the archive
    # members are extracted to a temporary directory first. Passing the
    # archive-internal entry names to the readers (as was done before) made
    # them look for files that do not exist on disk.
    Dir.mktmpdir('kotoshu-dict') do |dir|
      # Fixed file names: entry names from the archive are never used as
      # paths, so a crafted name cannot escape the scratch directory.
      base = File.join(dir, 'dictionary')
      File.binwrite("#{base}.aff", zipfile.read(aff_entry))
      File.binwrite("#{base}.dic", zipfile.read(dic_entry))

      # Parsing happens inside the block, before the directory is removed.
      from_files(base)
    end
  end
end
174
+
175
+ # Load dictionary from system paths.
176
+ #
177
+ # Searches standard system locations for Hunspell dictionaries.
178
+ #
179
+ # @param name [String] Dictionary name (e.g., 'en_US', 'ru_RU')
180
+ # @return [Dictionary] The loaded dictionary
181
+ # @raise [ArgumentError] If dictionary not found in system paths
182
+ #
183
+ # @example
184
+ # Dictionary.from_system('en_US')
185
def self.from_system(name)
  aff_name = "#{name}.aff"

  # The first folder containing <name>.aff wins; folders are tried in PATHES order.
  PATHES.each do |folder|
    candidate = File.join(folder, aff_name)
    next unless File.exist?(candidate)

    # from_files expects the base path without the extension.
    return from_files(candidate.sub(/\.aff$/, ''))
  end

  raise ArgumentError, "#{name}.aff not found in system paths: #{PATHES.inspect}"
end
196
+
197
+ # Check if a word is correct.
198
+ #
199
+ # @param word [String] Word to check
200
+ # @return [Boolean] True if the word exists in the dictionary
201
+ #
202
+ # @example
203
+ # dictionary.lookup('spells') # => true
204
+ # dictionary.lookup('spylls') # => false
205
def lookup(word)
  # Delegates the whole decision to the lookuper built in the constructor.
  checker = @lookuper
  checker.call(word)
end
208
+
209
+ # Generate suggestions for a misspelled word.
210
+ #
211
+ # Returns suggestions in order of probability/similarity,
212
+ # with best suggestions first.
213
+ #
214
+ # @param word [String] The misspelled word
215
+ # @yield [String] Each suggestion
216
+ # @return [Enumerator] If no block given
217
+ #
218
+ # @example
219
+ # dictionary.suggest('spylls').to_a # => ["spells", "spills", ...]
220
def suggest(word)
  if block_given?
    # Stream each suggestion from the suggester straight to the caller's block.
    @suggester.suggestions(word) { |candidate| yield candidate }
  else
    # Without a block, hand back an Enumerator over the same suggestions.
    enum_for(:suggest, word)
  end
end
227
+
228
+ private
229
+
230
+ # Build dic structure for suggester.
231
+ #
232
+ # @param dic_words [Array<Readers::Word>] Dictionary word entries
233
+ # @return [Hash] Dic structure
234
def build_dic_structure(dic_words)
  # Group entries by stem. A plain Hash is used on purpose: a Hash with a
  # default block would insert a new key for every unknown word that is
  # merely looked up, growing the index with each query.
  word_index = {}
  dic_words.each do |word|
    (word_index[word.stem] ||= []) << {
      stem: word.stem,
      flags: word.flags.to_a
    }
  end

  {
    # All entries sharing the stem +w+; a fresh empty array when unknown.
    homonyms: ->(w) { word_index.fetch(w) { [] } },

    # Whether the homonyms of +w+ carry +flag+: any of them by default,
    # every one of them when for_all is true. Membership is tested against
    # each entry's own flag list (not against a flattened list of flags,
    # which turned the check into a substring match on individual flags and
    # failed outright for non-String flags).
    has_flag: lambda { |w, flag, for_all: false|
      entries = word_index.fetch(w) { [] }
      carries_flag = ->(entry) { entry[:flags].include?(flag) }
      for_all ? entries.all?(&carries_flag) : entries.any?(&carries_flag)
    }
  }
end
259
+ end
260
+ end
@@ -0,0 +1,218 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+
5
+ module Kotoshu
6
+ module Dictionary
7
+ # Unix system dictionary backend.
8
+ #
9
+ # This dictionary reads from Unix-style system dictionary files,
10
+ # typically located at `/usr/share/dict/words` or symlinks to
11
+ # dictionaries like `web2` (Webster's Second International).
12
+ #
13
+ # @example Using system dictionary
14
+ # dict = UnixWords.new("/usr/share/dict/words", language_code: "en-US")
15
+ # dict.lookup?("hello") # => true
16
+ # dict.suggest("helo") # => ["hello", "help", "held", ...]
17
+ #
18
+ # @example Auto-detecting system dictionary
19
+ # dict = UnixWords.detect(language_code: "en-US")
20
+ class UnixWords < Base
21
+ # Standard system paths to check for dictionaries.
22
+ SYSTEM_PATHS = [
23
+ "/usr/share/dict/words",
24
+ "/usr/share/dict/web2",
25
+ "/usr/share/dict/american-english",
26
+ "/usr/share/dict/british-english",
27
+ "/usr/dict/words",
28
+ "/System/Library/Assets/com_apple_MobileAsset_DictionaryServices_dictionaryOS/Dictionary/words" # macOS
29
+ ].freeze
30
+
31
+ # @return [String] The path to the dictionary file
32
+ attr_reader :path
33
+
34
+ # @return [Boolean] Whether lookups are case-sensitive
35
+ attr_reader :case_sensitive
36
+
37
+ # Create a new UnixWords dictionary.
38
+ #
39
+ # @param path [String] Path to the dictionary file
40
+ # @param language_code [String] The language code
41
+ # @param locale [String, nil] The locale (optional)
42
+ # @param case_sensitive [Boolean] Whether lookups are case-sensitive
43
+ # @param metadata [Hash] Additional metadata (optional)
44
def initialize(path, language_code:, locale: nil, case_sensitive: false, metadata: {})
  super(language_code, locale: locale, metadata: metadata)

  # case_sensitive must be set before load_words runs, because loading
  # folds case based on it.
  @case_sensitive = case_sensitive
  @path = File.expand_path(path)
  @words = load_words(@path)
  @word_set = build_word_set

  # Register the :unix_words type only if the registry does not have it yet.
  return if Dictionary.registry.key?(:unix_words)

  self.class.register_type(:unix_words)
end
55
+
56
+ # Check if a word exists in the dictionary.
57
+ #
58
+ # @param word [String] The word to look up
59
+ # @return [Boolean] True if the word exists
60
def lookup(word)
  # nil and empty input are never dictionary words.
  return false if word.nil?
  return false if word.empty?

  # Case-insensitive dictionaries are probed with the folded form.
  needle = word
  needle = word.downcase unless @case_sensitive
  @word_set.key?(needle)
end
66
+
67
+ # Generate spelling suggestions.
68
+ #
69
+ # Uses edit distance to find similar words in the dictionary.
70
+ #
71
+ # @param word [String] The misspelled word
72
+ # @param max_suggestions [Integer] Maximum suggestions
73
+ # @return [Array<String>] List of suggested words
74
def suggest(word, max_suggestions: 10)
  return [] if word.nil? || word.empty?

  lookup_word = @case_sensitive ? word : word.downcase

  # Candidates must share a short prefix with the input: at most three
  # characters and never the whole word. Previously the prefix was
  # max(length - 1, 3), i.e. it grew with the word, so a typo anywhere but
  # the last character produced no suggestions at all.
  prefix = lookup_word[0, (lookup_word.length - 1).clamp(1, 3)]

  # uniq: case folding at load time can leave the same word in @words twice.
  candidates = @words.select { |dict_word| dict_word.start_with?(prefix) }.uniq

  # Keep close matches only (distance 1..2; distance 0 is the word itself).
  # The candidate's position is the secondary sort key, so words at the same
  # distance come out in dictionary order instead of an arbitrary one.
  scored = []
  candidates.each_with_index do |dict_word, position|
    dist = edit_distance(lookup_word, dict_word)
    scored << [dist, position, dict_word] if dist.positive? && dist <= 2
  end

  scored.sort.first(max_suggestions).map(&:last)
end
95
+
96
+ # Add a word to the dictionary.
97
+ #
98
+ # @param word [String] The word to add
99
+ # @param flags [Array<String>] Flags (ignored for UnixWords)
100
+ # @return [Boolean] True if added
101
def add_word(word, flags: [])
  return false if word.nil?
  return false if word.empty?

  # Stored form follows the dictionary's case policy; flags are not used.
  entry = @case_sensitive ? word : word.downcase

  if @word_set.key?(entry)
    false
  else
    # The new word lands at the end of @words; record that position.
    @word_set[entry] = @words.length
    @words.push(entry)
    true
  end
end
112
+
113
+ # Remove a word from the dictionary.
114
+ #
115
+ # @param word [String] The word to remove
116
+ # @return [Boolean] True if removed
117
def remove_word(word)
  return false if word.nil? || word.empty?

  lookup_word = @case_sensitive ? word : word.downcase
  return false unless @word_set.key?(lookup_word)

  # Delete by value so every occurrence goes away (case folding at load time
  # can store the same word more than once).
  @words.delete(lookup_word)

  # Removing an element shifts everything after it, so the stored positions
  # must be recomputed. Leaving them stale made later removals delete the
  # wrong word.
  @word_set.replace(@words.each_with_index.to_h)

  true
end
128
+
129
+ # Get all words in the dictionary.
130
+ #
131
+ # @return [Array<String>] All words
132
def words
  # Hand out a copy so callers cannot modify the internal list.
  [*@words]
end
135
+
136
+ # Detect system dictionary path.
137
+ #
138
+ # Checks standard system paths for an existing dictionary file.
139
+ #
140
+ # @return [String, nil] The detected path or nil
141
+ #
142
+ # @example
143
+ # UnixWords.detect_system_dictionary # => "/usr/share/dict/words"
144
def self.detect_system_dictionary
  # First existing path in SYSTEM_PATHS order wins.
  SYSTEM_PATHS.each { |candidate| return candidate if File.exist?(candidate) }
  nil
end
147
+
148
+ # Create a dictionary by auto-detecting system dictionary.
149
+ #
150
+ # @param language_code [String] The language code
151
+ # @param locale [String, nil] The locale (optional)
152
+ # @param case_sensitive [Boolean] Whether lookups are case-sensitive
153
+ # @return [UnixWords, nil] The dictionary or nil if not found
154
+ #
155
+ # @example
156
+ # dict = UnixWords.detect(language_code: "en-US")
157
def self.detect(language_code:, locale: nil, case_sensitive: false)
  found = detect_system_dictionary

  # No system dictionary found means no instance: the result is nil.
  if found
    new(
      found,
      language_code: language_code,
      locale: locale,
      case_sensitive: case_sensitive
    )
  end
end
164
+
165
+ private
166
+
167
+ # Load words from dictionary file.
168
+ #
169
+ # @param path [String] The file path
170
+ # @return [Array<String>] List of words
171
def load_words(path)
  raise DictionaryNotFoundError, path unless File.exist?(path)

  # One word per line; blank lines and lines starting with "#" are skipped.
  File.foreach(path, chomp: true).each_with_object([]) do |line, collected|
    next if line.empty? || line.start_with?("#")

    # Words are folded to lower case unless the dictionary is case-sensitive.
    collected << (@case_sensitive ? line : line.downcase)
  end
end
178
+
179
+ # Build a hash set for O(1) lookups.
180
+ #
181
+ # @return [Hash] Word to index mapping
182
def build_word_set
  # Map each word to its position in @words; for repeated words the last
  # position wins.
  index = {}
  @words.each_with_index { |entry, position| index[entry] = position }
  index
end
185
+
186
+ # Calculate Levenshtein edit distance.
187
+ #
188
+ # @param str1 [String] First string
189
+ # @param str2 [String] Second string
190
+ # @return [Integer] Edit distance
191
def edit_distance(str1, str2)
  # Against an empty string the distance is the other string's length.
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  # Rows are as wide as the shorter string; the longer one drives the outer loop.
  shorter, longer = str1.length > str2.length ? [str2, str1] : [str1, str2]
  columns = shorter.chars
  width = columns.length + 1

  # row[k] holds the distance between the first k chars of the shorter string
  # and the prefix of the longer string processed so far.
  row = Array.new(width) { |k| k }

  longer.each_char.with_index(1) do |outer, row_no|
    next_row = Array.new(width)
    next_row[0] = row_no

    columns.each_with_index do |inner, k|
      step = inner == outer ? 0 : 1
      next_row[k + 1] = [
        next_row[k] + 1, # insertion
        row[k + 1] + 1,  # deletion
        row[k] + step    # substitution (free when the characters match)
      ].min
    end

    row = next_row
  end

  row.last
end
216
+ end
217
+ end
218
+ end