kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,526 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "open-uri"
4
+ require_relative "base"
5
+ require_relative "../readers/lookup_builder"
6
+ require_relative "../readers/aff_reader"
7
+ require_relative "../readers/dic_reader"
8
+
9
+ module Kotoshu
10
+ module Dictionary
11
+ # Hunspell dictionary backend.
12
+ #
13
+ # This dictionary reads Hunspell-formatted dictionary files (.dic and .aff).
14
+ # Hunspell is the spell checker used by LibreOffice, Firefox, Chrome, and many
15
+ # other applications.
16
+ #
17
+ # File format:
18
+ # - .dic: Dictionary file with word count on first line, words with optional flags
19
+ # - .aff: Affix file with prefix/suffix rules and configuration
20
+ #
21
+ # @example Creating a Hunspell dictionary
22
+ # dict = Hunspell.new(
23
+ # dic_path: "en_US.dic",
24
+ # aff_path: "en_US.aff",
25
+ # language_code: "en-US"
26
+ # )
27
+ # dict.lookup?("hello") # => true
28
+ #
29
+ # @example Creating from GitHub cache
30
+ # dict = Hunspell.from_github("de")
31
+ # dict.lookup?("über") # => true
32
+ #
33
+ # @see https://hunspell.github.io/ Hunspell documentation
34
+ class Hunspell < Base
35
+ # @return [String] Path to the .dic file
36
+ attr_reader :dic_path
37
+
38
+ # @return [String] Path to the .aff file
39
+ attr_reader :aff_path
40
+
41
+ # @return [Hash] Affix rules (flag => array of rules)
42
+ attr_reader :affix_rules
43
+
44
+ # @return [Hash] Configuration options from affix file
45
+ attr_reader :aff_config
46
+
47
+ # @return [Hash] Raw aff data from AffReader (cached for Lookuper)
48
+ attr_reader :aff_data
49
+
50
+ # @return [Array] Raw words from DicReader (cached for Lookuper)
51
+ attr_reader :dic_words
52
+
53
# Lazily build and memoize the lookup algorithm instance.
#
# The instance is created on first access from the cached affix data
# (@aff_data) and dictionary words (@dic_words), then reused.
#
# @return [Algorithms::Lookup::Lookuper] The lookup algorithm instance
def lookuper
  return @lookuper if @lookuper

  @lookuper = Readers::LookupBuilder.from_data(@aff_data, @dic_words).build
end
57
+
58
+ class << self
59
# Build a Hunspell dictionary from the GitHub-backed language cache.
#
# The .dic/.aff locations and their metadata are obtained from the cache's
# +get_dictionary+ call; cache lookup, download and expiry handling live in
# Cache::LanguageCache and are not visible in this method.
#
# @example Load English dictionary
#   dict = Hunspell.from_github("en")
#   dict.lookup?("hello") # => true
#
# @example Force re-download
#   dict = Hunspell.from_github("fr", force_download: true)
#
# @param language_code [String] ISO 639-1 language code (e.g., 'en', 'de', 'fr')
# @param cache [Cache::LanguageCache, nil] Custom cache instance; a new
#   Cache::LanguageCache is created when omitted
# @param force_download [Boolean] Forwarded to the cache's +get_dictionary+
# @return [Hunspell] Dictionary instance tagged with GitHub source metadata
# @raise [ArgumentError] If language_code is not supported (presumably raised
#   by the cache — confirm against Cache::LanguageCache)
def from_github(language_code, cache: nil, force_download: false)
  require_relative '../cache/language_cache'

  language_cache = cache || Cache::LanguageCache.new
  entry = language_cache.get_dictionary(language_code, force_download: force_download)
  details = entry[:metadata]

  github_metadata = {
    source: 'github',
    github_url: details['url'],
    checksum: details['checksum'],
    downloaded_at: details['downloaded_at']
  }

  new(
    dic_path: entry[:dic_path],
    aff_path: entry[:aff_path],
    language_code: language_code,
    metadata: github_metadata
  )
end
100
+
101
# Tell whether the language cache lists the given language as available.
#
# @param language_code [String] ISO 639-1 language code
# @param cache [Cache::LanguageCache, nil] Custom cache instance; a new
#   Cache::LanguageCache is created when omitted
# @return [Boolean] True if the code appears in the cache's available languages
def available_on_github?(language_code, cache: nil)
  require_relative '../cache/language_cache'

  supported = (cache || Cache::LanguageCache.new).available_languages
  supported.include?(language_code)
end
112
+
113
# List the languages the language cache reports as available.
#
# @param cache [Cache::LanguageCache, nil] Custom cache instance; a new
#   Cache::LanguageCache is created when omitted
# @return [Array<String>] List of supported language codes
def available_github_languages(cache: nil)
  require_relative '../cache/language_cache'

  language_cache = cache || Cache::LanguageCache.new
  language_cache.available_languages
end
123
+
124
# Fetch the language cache's information record for a language.
#
# @param language_code [String] ISO 639-1 language code
# @param cache [Cache::LanguageCache, nil] Custom cache instance; a new
#   Cache::LanguageCache is created when omitted
# @return [Hash] Language information as returned by the cache
def language_info(language_code, cache: nil)
  require_relative '../cache/language_cache'

  language_cache = cache || Cache::LanguageCache.new
  language_cache.get_language_info(language_code)
end
135
+ end
136
+
137
# Create a new Hunspell dictionary from a .dic/.aff pair.
#
# Both locations go through +resolve_path+ (URLs are downloaded, local
# paths are expanded). The affix file is read first because the dictionary
# reader needs its encoding, flag format and flag synonyms.
#
# @param dic_path [String] Path or URL to the .dic file
# @param aff_path [String] Path or URL to the .aff file
# @param language_code [String] The language code
# @param locale [String, nil] The locale (optional)
# @param metadata [Hash] Additional metadata (optional)
# @raise [DictionaryNotFoundError] If the resolved .aff or .dic file is missing
def initialize(dic_path:, aff_path:, language_code:, locale: nil, metadata: {})
  super(language_code, locale: locale, metadata: metadata)

  @dic_path = resolve_path(dic_path)
  @aff_path = resolve_path(aff_path)

  # The affix file is verified before the dictionary file.
  [@aff_path, @dic_path].each do |required_file|
    raise DictionaryNotFoundError, required_file unless File.exist?(required_file)
  end

  # Parse the affix file once and keep the raw data for the Lookuper.
  affix_reader = Readers::AffReader.new(@aff_path)
  @aff_data = affix_reader.read
  @aff_config = @aff_data # For backward compatibility

  # The word list is read with the settings taken from the affix file.
  reader_options = {
    encoding: affix_reader.encoding,
    flag_format: @aff_data['FLAG'] || 'short',
    flag_synonyms: @aff_data['AF'] || {}
  }
  @dic_words = Readers::DicReader.new(@dic_path, **reader_options).read

  # Legacy structures kept for backward compatibility.
  @word_index = build_word_index(@dic_words)
  @affix_rules = parse_affix_rules(@aff_config)

  # The Lookuper is created lazily on first use.
  @lookuper = nil

  # Register this dictionary type once.
  return if Dictionary.registry.key?(:hunspell)

  self.class.register_type(:hunspell)
end
175
+
176
+ private
177
+
178
# Build the legacy word index from the words returned by DicReader.
#
# Keys are the lower-cased stems; when two entries share a lower-cased
# stem, the flags of the later entry win.
#
# @param words [Array<Readers::Word>] Words from DicReader
# @return [Hash] Word index (lower-cased stem => array of flags)
def build_word_index(words)
  words.each_with_object({}) do |entry, index|
    index[entry.stem.downcase] = entry.flags.to_a
  end
end
189
+
190
# Convert AffReader's SFX/PFX data into the legacy rule tables.
#
# AffReader provides 'SFX' / 'PFX' => { flag => [Affix, ...] }; each Affix
# is converted through +convert_to_affix_rule+. Missing sections are
# skipped. Both tables return (and store) an empty array for unknown flags.
#
# @param aff_data [Hash] Aff data from AffReader
# @return [Hash] { prefix: { flag => rules }, suffix: { flag => rules } }
def parse_affix_rules(aff_data)
  rules = {
    prefix: Hash.new { |table, flag| table[flag] = [] },
    suffix: Hash.new { |table, flag| table[flag] = [] }
  }

  # Suffixes are processed before prefixes, as in the affix data layout.
  [['SFX', :suffix], ['PFX', :prefix]].each do |directive, kind|
    aff_data[directive]&.each do |flag, affixes|
      rules[kind][flag] = affixes.map { |affix| convert_to_affix_rule(affix, kind) }
    end
  end

  rules
end
218
+
219
# Convert an AffReader Affix into a Models::AffixRule.
#
# The affix is serialised back into a Hunspell-style rule line
# ("PFX|SFX FLAG Y|N strip add condition") and parsed by
# Models::AffixRule.from_hunspell. Empty strip/add values are written as
# "0" and a missing condition as ".".
#
# @param affix [Readers::Affix] The affix to convert
# @param type [Symbol] :prefix or :suffix
# @return [Models::AffixRule] The converted rule
def convert_to_affix_rule(affix, type)
  directive = type == :prefix ? 'PFX' : 'SFX'

  fields = [
    directive,
    affix.flag,
    affix.crossproduct ? 'Y' : 'N',
    affix.strip.empty? ? '0' : affix.strip,
    affix.add.empty? ? '0' : affix.add,
    affix.condition || '.'
  ]

  Models::AffixRule.from_hunspell(fields.join(' '), type)
end
237
+
238
# Check whether the given location is an HTTP(S) URL.
#
# The argument is coerced with +to_s+ so that path-like objects such as
# Pathname (which File.expand_path accepts) are treated as local paths
# instead of raising NoMethodError.
#
# @param path [String, #to_s] Path or URL to check
# @return [Boolean] True if the location starts with "http://" or "https://"
def url?(path)
  path.to_s.start_with?("http://", "https://")
end
244
+
245
# Turn a dictionary location into a local file path.
#
# URLs are downloaded to a temporary file; anything else is expanded to an
# absolute path.
#
# @param path [String] Path or URL
# @return [String] Local file path
def resolve_path(path)
  if url?(path)
    download_to_temp(path)
  else
    File.expand_path(path)
  end
end
253
+
254
# Download a URL into a temporary file and return that file's path.
#
# The Tempfile object is retained on the instance: a Tempfile unlinks its
# file when the object is garbage collected, so returning only the path
# would allow the download to disappear before (or while) it is read.
# If the download fails, the partial file is closed and removed and the
# error is re-raised.
#
# @param url [String] URL to download
# @return [String] Temporary file path
def download_to_temp(url)
  require "tempfile"

  uri = URI.parse(url)
  filename = File.basename(uri.path)

  temp = Tempfile.new([filename, ""])
  temp.binmode

  begin
    URI.open(uri, "rb") do |remote_file|
      IO.copy_stream(remote_file, temp)
    end
    temp.close
  rescue StandardError
    temp.close! # close and unlink the incomplete download
    raise
  end

  # Keep the Tempfile alive for as long as this object lives.
  (@downloaded_tempfiles ||= []) << temp
  temp.path
end
273
+
274
+ public
275
+
276
# Check if a word exists in the dictionary.
#
# Nil and empty input are rejected up front; everything else is delegated
# to the Lookuper.
#
# @param word [String] The word to look up
# @return [Boolean] False for nil/empty input, otherwise the Lookuper's result
def lookup(word)
  blank = word.nil? || word.empty?

  blank ? false : lookuper.call(word)
end
288
+
289
# Generate spelling suggestions.
#
# Candidates are the indexed words plus all generated affix variants,
# de-duplicated so a word present in both sources is suggested only once
# and does not use up several of the +max_suggestions+ slots. A candidate
# must start with the prefix computed below and lie within edit distance
# 1..2 of the input; results are ordered by distance.
#
# @param word [String] The misspelled word
# @param max_suggestions [Integer] Maximum suggestions
# @return [Array<String>] List of suggested words (empty for nil/empty input)
def suggest(word, max_suggestions: 10)
  return [] if word.nil? || word.empty?

  lookup_word = word.downcase

  # The prefix is the input minus its final character (the whole input when
  # it has two characters or fewer).
  # NOTE(review): this only admits candidates that differ near the end of
  # the word — confirm that such a long prefix is intended.
  prefix_len = [lookup_word.length - 1, 2].max
  prefix = lookup_word[0...prefix_len]

  (@word_index.keys + generate_affix_variants)
    .select { |candidate| candidate.downcase.start_with?(prefix) }
    .uniq
    .map { |candidate| [candidate, edit_distance(lookup_word, candidate.downcase)] }
    .select { |_, dist| dist.positive? && dist <= 2 }
    .sort_by { |_, dist| dist }
    .first(max_suggestions)
    .map(&:first)
end
314
+
315
# Add a word to the legacy word index.
#
# The word is stored under its lower-cased form; an existing entry is
# overwritten with the given flags.
#
# NOTE(review): only @word_index is updated here, while +lookup+ delegates
# to the Lookuper built from the originally loaded data — confirm whether
# added words are expected to be found by +lookup+.
#
# @param word [String] The word to add
# @param flags [Array<String>] Morphological flags
# @return [Boolean] True if added, false for nil/empty input
def add_word(word, flags: [])
  if word.nil? || word.empty?
    false
  else
    @word_index[word.downcase] = flags
    true
  end
end
328
+
329
# Remove a word from the legacy word index.
#
# The lookup key is the lower-cased word.
#
# @param word [String] The word to remove
# @return [Boolean] True if an entry was removed, false otherwise
def remove_word(word)
  return false if word.nil? || word.empty?

  removed_flags = @word_index.delete(word.downcase)
  !removed_flags.nil?
end
339
+
340
# Get all words in the legacy word index.
#
# A fresh array is returned on every call, so callers may modify it
# without touching the index.
#
# @return [Array<String>] All indexed (lower-cased) words
def words
  @word_index.each_key.to_a
end
346
+
347
# Get word variants using affix rules.
#
# The word's flags are read from the index under its lower-cased form;
# every prefix rule and then every suffix rule registered under one of
# those flags is applied to the word as given. Rules whose +apply+ returns
# a falsy value contribute nothing.
#
# @param word [String] The word
# @return [Array<String>] Word variants (prefix variants first)
def word_variants(word)
  return [] if word.nil? || word.empty?

  word_flags = @word_index[word.downcase] || []

  %i[prefix suffix].each_with_object([]) do |kind, variants|
    @affix_rules[kind].each do |flag, rules|
      next unless word_flags.include?(flag)

      rules.each do |rule|
        variant = rule.apply(word)
        variants << variant if variant
      end
    end
  end
end
382
+
383
+ private
384
+
385
# Load a .dic file into a word index.
#
# The first line (the word count) is skipped. Blank lines and lines
# starting with "#" are ignored. Each remaining line is split on "/": the
# first part is the word (stripped and lower-cased as the key) and the
# second part, if present, is split into single-character flags.
#
# @param path [String] Path to .dic file
# @return [Hash] Word index (word => flags)
def load_dic_file(path)
  entries = File.readlines(path, chomp: true)

  entries.drop(1).each_with_object({}) do |line, index|
    next if line.start_with?("#") || line.strip.empty?

    stem, flag_field = line.split("/")
    next if stem.nil? || stem.strip.empty?

    index[stem.strip.downcase] = flag_field ? flag_field.split("") : []
  end
end
414
+
415
# Load an .aff file into a configuration hash.
#
# Starts from defaults (set: "UTF-8", try: "", flag: "char", no affix
# rules) and scans the file line by line, skipping empty lines and lines
# starting with "#". Keywords are matched case-insensitively:
# SET/TRY/FLAG store their first argument, PFX/SFX lines are collected
# verbatim under :affix_rules, and REP/MAP/COMPOUNDRULE/COMPOUNDWORDMIN/
# COMPOUNDFLAG lines are collected verbatim under the lower-cased keyword.
#
# @param path [String] Path to .aff file
# @return [Hash] Configuration options
def load_aff_file(path)
  config = { set: "UTF-8", try: "", flag: "char", affix_rules: [] } # flag: "char", "long" or "num"

  scalar_options = { "SET" => :set, "TRY" => :try, "FLAG" => :flag }
  affix_keywords = %w[PFX SFX]
  collected_keywords = %w[REP MAP COMPOUNDRULE COMPOUNDWORDMIN COMPOUNDFLAG]

  File.foreach(path, chomp: true) do |line|
    next if line.empty? || line.start_with?("#")

    tokens = line.split
    next if tokens.empty?

    keyword = tokens[0].upcase

    if scalar_options.key?(keyword)
      config[scalar_options[keyword]] = tokens[1]
    elsif affix_keywords.include?(keyword)
      config[:affix_rules] << line
    elsif collected_keywords.include?(keyword)
      # Store for future use
      (config[keyword.downcase.to_sym] ||= []) << line
    end
  end

  config
end
453
+
454
# Direct lookup in the legacy word index, without affix processing.
#
# The word is lower-cased before the index is consulted.
#
# @param word [String] The word
# @return [Boolean] True if the lower-cased word is an index key
def direct_lookup?(word)
  @word_index.include?(word.downcase)
end
462
+
463
# Generate all possible affix variants of the indexed words.
#
# For every prefix rule and then every suffix rule, the rule is applied to
# each indexed word that carries the rule's flag. Falsy results are
# dropped and duplicates removed, keeping first occurrences.
#
# @return [Array<String>] All variants
def generate_affix_variants
  generated = %i[prefix suffix].flat_map do |kind|
    @affix_rules[kind].flat_map do |flag, rules|
      # Words carrying this flag, in index order.
      stems = @word_index.select { |_, stem_flags| stem_flags.include?(flag) }.keys

      rules.flat_map do |rule|
        stems.map { |stem| rule.apply(stem) }.select(&:itself)
      end
    end
  end

  generated.uniq
end
493
+
494
# Calculate the Levenshtein edit distance between two strings.
#
# Uses the two-row dynamic-programming formulation: only the previous and
# current rows of the distance table are kept. The shorter string is used
# for the row so the rows stay as small as possible.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Integer] Edit distance
def edit_distance(str1, str2)
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  shorter, longer = str1.length > str2.length ? [str2, str1] : [str1, str2]
  short_chars = shorter.chars

  # Distances from the empty prefix of +longer+ to each prefix of +shorter+.
  row = (0..short_chars.length).to_a

  longer.each_char.with_index(1) do |long_char, row_number|
    next_row = [row_number]

    short_chars.each_with_index do |short_char, col|
      mismatch = short_char == long_char ? 0 : 1
      next_row << [
        next_row[col] + 1,   # insertion
        row[col + 1] + 1,    # deletion
        row[col] + mismatch  # substitution (free when characters match)
      ].min
    end

    row = next_row
  end

  row.last
end
524
+ end
525
+ end
526
+ end