kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,282 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "open-uri"
4
+ require_relative "base"
5
+
6
+ module Kotoshu
7
+ module Dictionary
8
+ # Plain text dictionary backend.
9
+ #
10
+ # This dictionary reads from simple plain text word lists,
11
+ # with support for comments and various formatting options.
12
+ #
13
+ # File format:
14
+ # - One word per line
15
+ # - Lines starting with # are comments
16
+ # - Empty lines are ignored
17
+ # - Supports multi-word phrases (e.g., "New York")
18
+ #
19
+ # @example Creating from a file
20
+ # dict = PlainText.new("words.txt", language_code: "en-US")
21
+ # dict.lookup?("hello") # => true
22
+ #
23
+ # @example Creating from a URL
24
+ # dict = PlainText.new("https://raw.githubusercontent.com/kotoshu/dictionaries/main/en_US/words.txt",
25
+ # language_code: "en-US")
26
+ #
27
+ # @example Creating from an array
28
+ # dict = PlainText.from_words(%w[hello world test], language_code: "en")
29
class PlainText < Base
  # @return [String, nil] Local path of the dictionary file (nil if created from an array)
  attr_reader :path

  # @return [Boolean] Whether lookups are case-sensitive
  attr_reader :case_sensitive

  # @return [Regexp, nil] Pattern for word filtering
  attr_reader :word_pattern

  # Create a new PlainText dictionary.
  #
  # The file (or downloaded URL) is read eagerly; every non-comment,
  # non-blank line becomes one entry.
  #
  # @param path [String] Path to the dictionary file or an http(s) URL
  # @param language_code [String] The language code
  # @param locale [String, nil] The locale (optional)
  # @param case_sensitive [Boolean] Whether lookups are case-sensitive
  # @param word_pattern [Regexp, nil] Pattern to filter words (optional)
  # @param metadata [Hash] Additional metadata (optional)
  # @raise [DictionaryNotFoundError] If the resolved file does not exist
  def initialize(path, language_code:, locale: nil, case_sensitive: false,
                 word_pattern: nil, metadata: {})
    super(language_code, locale: locale, metadata: metadata)

    @original_path = path
    @path = resolve_path(path)
    @case_sensitive = case_sensitive
    @word_pattern = word_pattern
    @words = load_words(@path)
    @word_set = build_word_set

    # Register this dictionary type
    self.class.register_type(:plain_text) unless Dictionary.registry.key?(:plain_text)
  end

  # Check if a word exists in the dictionary.
  #
  # @param word [String] The word to look up
  # @return [Boolean] True if the word exists
  def lookup(word)
    return false if word.nil? || word.empty?

    @word_set.key?(normalize(word))
  end

  # Generate spelling suggestions.
  #
  # Candidates are dictionary words that share a short prefix with the
  # input (at most three characters, and always at least one character
  # shorter than the input so the final character may differ). They are
  # ranked by Levenshtein distance; only distances 1 and 2 are returned.
  # Ties keep dictionary order, so the result is deterministic.
  #
  # @param word [String] The misspelled word
  # @param max_suggestions [Integer] Maximum suggestions
  # @return [Array<String>] List of suggested words, closest first
  def suggest(word, max_suggestions: 10)
    return [] if word.nil? || word.empty?

    target = normalize(word)

    # Cap the prefix at 3 characters. The previous `.max` required
    # candidates to match everything but the last character (or the whole
    # word for short inputs), which hid most valid corrections.
    prefix_len = [target.length - 1, 3].min
    prefix = target[0, prefix_len]

    # Edit distance is never smaller than the length difference, so words
    # whose length differs by more than 2 can be skipped without scoring.
    candidates = @words.select do |entry|
      entry.start_with?(prefix) && (entry.length - target.length).abs <= 2
    end.uniq

    candidates
      .map { |entry| [entry, edit_distance(target, entry)] }
      .select { |_, dist| dist.positive? && dist <= 2 }
      .sort_by.with_index { |(_, dist), position| [dist, position] }
      .first(max_suggestions)
      .map(&:first)
  end

  # Add a word to the dictionary.
  #
  # @param word [String] The word to add
  # @param flags [Array<String>] Flags (ignored for PlainText)
  # @return [Boolean] True if added, false if blank or already present
  def add_word(word, flags: [])
    return false if word.nil? || word.empty?

    entry = normalize(word)
    return false if @word_set.key?(entry)

    @words << entry
    @word_set[entry] = @words.length - 1

    true
  end

  # Remove a word from the dictionary.
  #
  # Every occurrence of the word is dropped from the word list and the
  # word-to-index map is rebuilt, because removing an element shifts the
  # position of every later entry.
  #
  # @param word [String] The word to remove
  # @return [Boolean] True if removed, false if blank or not present
  def remove_word(word)
    return false if word.nil? || word.empty?

    entry = normalize(word)
    return false unless @word_set.key?(entry)

    @words.delete(entry)
    @word_set = build_word_set

    true
  end

  # Get all words in the dictionary.
  #
  # @return [Array<String>] A copy of the word list
  def words
    @words.dup
  end

  # Create a dictionary from an array of words.
  #
  # The instance is built with +allocate+, so +initialize+ (and the
  # superclass constructor) is not run; the instance variables are set
  # directly instead.
  #
  # @param words [Array<String>] The words
  # @param language_code [String] The language code
  # @param locale [String, nil] The locale (optional)
  # @param case_sensitive [Boolean] Whether lookups are case-sensitive
  # @return [PlainText] New dictionary
  #
  # @example
  #   dict = PlainText.from_words(%w[hello world test], language_code: "en")
  def self.from_words(words, language_code:, locale: nil, case_sensitive: false)
    entries = case_sensitive ? words.dup : words.map(&:downcase)

    dict = allocate
    dict.instance_variable_set(:@language_code, language_code.dup.freeze)
    dict.instance_variable_set(:@locale, locale&.dup&.freeze)
    dict.instance_variable_set(:@path, nil)
    dict.instance_variable_set(:@case_sensitive, case_sensitive)
    dict.instance_variable_set(:@word_pattern, nil)
    dict.instance_variable_set(:@words, entries)
    dict.instance_variable_set(:@word_set, entries.each_with_index.to_h)
    dict.instance_variable_set(:@metadata, {}.freeze)

    # Register this dictionary type (unless already registered)
    register_type(:plain_text) unless Dictionary.registry.key?(:plain_text)

    dict
  end

  # Create a dictionary from a string.
  #
  # Lines are stripped first, then blank lines and lines starting with
  # "#" are discarded, so whitespace-only lines never become entries.
  #
  # @param text [String] The text containing words (newline separated)
  # @param language_code [String] The language code
  # @param locale [String, nil] The locale (optional)
  # @param case_sensitive [Boolean] Whether lookups are case-sensitive
  # @return [PlainText] New dictionary
  #
  # @example
  #   text = "hello\nworld\ntest"
  #   dict = PlainText.from_string(text, language_code: "en")
  def self.from_string(text, language_code:, locale: nil, case_sensitive: false)
    words = text.split("\n")
                .map(&:strip)
                .reject { |line| line.empty? || line.start_with?("#") }

    from_words(words, language_code: language_code, locale: locale,
                      case_sensitive: case_sensitive)
  end

  private

  # Apply the dictionary's case policy to a word.
  #
  # @param word [String] The raw word
  # @return [String] The word itself, or its downcased form when the
  #   dictionary is case-insensitive
  def normalize(word)
    @case_sensitive ? word : word.downcase
  end

  # Resolve path - handles URLs by downloading to temp location.
  #
  # @param path [String] File path or URL
  # @return [String] Local file path
  def resolve_path(path)
    return File.expand_path(path) unless url?(path)

    # Download URL to temp file
    download_to_temp(path)
  end

  # Check if path is a URL.
  #
  # @param path [String] Path to check
  # @return [Boolean] True if the path starts with http:// or https://
  def url?(path)
    path.start_with?("http://", "https://")
  end

  # Download URL to temporary file.
  #
  # The Tempfile object is retained in @tempfile: a Tempfile's backing
  # file is deleted when the object is garbage collected, so returning
  # only the path would leave the caller with a path that can vanish.
  # If the download fails the temp file is removed before re-raising.
  #
  # @param url [String] URL to download
  # @return [String] Path to downloaded file
  def download_to_temp(url)
    require "tempfile"

    uri = URI.parse(url)
    filename = File.basename(uri.path)

    temp = Tempfile.new([filename, ".txt"], encoding: "UTF-8")
    begin
      temp.binmode
      URI.open(uri, "rb") do |remote_file|
        IO.copy_stream(remote_file, temp)
      end
      temp.close
    rescue StandardError
      temp.close! # close and unlink the partial download
      raise
    end

    @tempfile = temp
    temp.path
  end

  # Load words from dictionary file.
  #
  # Each line is stripped; blank lines and "#" comment lines are skipped.
  # Remaining lines are filtered by @word_pattern (when set) and
  # downcased unless the dictionary is case-sensitive.
  #
  # @param path [String] The file path
  # @return [Array<String>] List of words
  # @raise [DictionaryNotFoundError] If the file does not exist
  def load_words(path)
    raise DictionaryNotFoundError, path unless File.exist?(path)

    File.foreach(path, chomp: true)
        .map(&:strip)
        .reject { |line| line.empty? || line.start_with?("#") }
        .select { |entry| @word_pattern.nil? || entry.match?(@word_pattern) }
        .map { |entry| normalize(entry) }
  end

  # Build a hash set for O(1) lookups.
  #
  # @return [Hash{String => Integer}] Word to index mapping (for a
  #   duplicated word, the index of its last occurrence)
  def build_word_set
    @words.each_with_index.to_h
  end

  # Calculate Levenshtein edit distance.
  #
  # Uses the two-row dynamic-programming formulation: only the previous
  # and current rows of the distance matrix are kept.
  #
  # @param str1 [String] First string
  # @param str2 [String] Second string
  # @return [Integer] Edit distance
  def edit_distance(str1, str2)
    return str2.length if str1.empty?
    return str1.length if str2.empty?

    # Use smaller string for inner loop
    str1, str2 = str2, str1 if str1.length > str2.length

    previous = (0..str1.length).to_a

    str2.each_char.with_index do |char2, j|
      current = [j + 1]

      str1.each_char.with_index do |char1, i|
        insert_cost = current[i] + 1
        delete_cost = previous[i + 1] + 1
        substitute_cost = previous[i] + (char1 == char2 ? 0 : 1)

        current << [insert_cost, delete_cost, substitute_cost].min
      end

      previous = current
    end

    previous.last
  end
end
281
+ end
282
+ end
@@ -0,0 +1,248 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+
5
+ module Kotoshu
6
+ module Dictionary
7
+ # Repository for managing multiple dictionary instances.
8
+ #
9
+ # This class provides a centralized registry for dictionaries,
10
+ # allowing them to be registered and retrieved by key.
11
+ #
12
+ # @example Registering and retrieving dictionaries
13
+ # repo = Repository.new
14
+ # repo.register(:en_US, unix_dict)
15
+ # repo.register(:custom, custom_dict)
16
+ # repo.get(:en_US) # => unix_dict
17
+ #
18
+ # @example Using the global repository
19
+ # Repository.register(:en_US, dict)
20
+ # Repository.get(:en_US)
21
class Repository
  # @return [Hash{Symbol => Base}] The dictionary storage
  attr_reader :dictionaries

  # Create a new repository.
  #
  # Keys of the initial hash are converted to symbols so that they can be
  # found by {#get}, {#registered?} and {#unregister}, which all
  # symbolize their argument.
  #
  # @param dictionaries [Hash] Initial dictionaries (optional)
  def initialize(dictionaries = {})
    @dictionaries = {}
    dictionaries.each { |key, dictionary| @dictionaries[key.to_sym] = dictionary }
  end

  # Register a dictionary.
  #
  # @param key [Symbol, String] The key to register under
  # @param dictionary [Base] The dictionary instance
  # @return [self] Self for chaining
  #
  # @example
  #   repo.register(:en_US, unix_dict)
  def register(key, dictionary)
    @dictionaries[key.to_sym] = dictionary
    self
  end
  alias add register
  alias []= register

  # Get a dictionary by key.
  #
  # @param key [Symbol, String] The key
  # @return [Base, nil] The dictionary or nil if not found
  #
  # @example
  #   repo.get(:en_US)
  def get(key)
    @dictionaries[key.to_sym]
  end
  alias [] get

  # Check if a key is registered.
  #
  # @param key [Symbol, String] The key
  # @return [Boolean] True if the key exists
  #
  # @example
  #   repo.registered?(:en_US) # => true
  def registered?(key)
    @dictionaries.key?(key.to_sym)
  end
  alias has_key? registered?
  alias key? registered?

  # Unregister a dictionary.
  #
  # @param key [Symbol, String] The key
  # @return [Base, nil] The removed dictionary or nil
  #
  # @example
  #   repo.unregister(:en_US)
  def unregister(key)
    @dictionaries.delete(key.to_sym)
  end
  alias remove unregister

  # Clear all dictionaries.
  #
  # @return [self] Self for chaining
  def clear
    @dictionaries.clear
    self
  end

  # Get all registered keys.
  #
  # @return [Array<Symbol>] All keys
  def keys
    @dictionaries.keys
  end

  # Iterate over registered keys.
  #
  # @yield [key] Block to execute for each key
  # @return [Enumerator] Enumerator if no block given
  def each_key(&block)
    return enum_for(:each_key) unless block_given?

    @dictionaries.each_key(&block)
  end

  # Get all dictionaries.
  #
  # @return [Array<Base>] All dictionaries
  def values
    @dictionaries.values
  end

  # Get the number of registered dictionaries.
  #
  # @return [Integer] Dictionary count
  def size
    @dictionaries.size
  end
  alias count size
  alias length size

  # Check if the repository is empty.
  #
  # @return [Boolean] True if empty
  def empty?
    @dictionaries.empty?
  end

  # Iterate over dictionaries.
  #
  # @yield [key, dictionary] Each key and dictionary
  # @return [Enumerator] Enumerator if no block given
  def each(&block)
    return enum_for(:each) unless block_given?

    @dictionaries.each(&block)
  end

  # Merge another repository into this one.
  #
  # Entries from +other+ overwrite existing entries with the same key.
  # Keys are converted to symbols, matching {#register}, so entries
  # merged from a string-keyed hash remain reachable through {#get}.
  #
  # @param other [Repository, Hash] The repository or hash to merge
  # @return [self] Self for chaining
  #
  # @example
  #   repo1.merge(repo2)
  def merge(other)
    incoming = other.is_a?(Repository) ? other.dictionaries : other

    incoming.each { |key, dictionary| @dictionaries[key.to_sym] = dictionary }
    self
  end

  # Find dictionaries by language code.
  #
  # The comparison ignores case. Both sides are converted with +to_s+ so a
  # Symbol (or other non-String) argument is compared rather than raising.
  #
  # @param language_code [String] The language code
  # @return [Array<Base>] Matching dictionaries
  #
  # @example
  #   repo.find_by_language("en-US")
  def find_by_language(language_code)
    wanted = language_code.to_s

    @dictionaries.values.select do |dict|
      dict.language_code.to_s.casecmp?(wanted)
    end
  end

  # Convert to hash.
  #
  # @return [Hash] A shallow copy of the key-to-dictionary mapping
  def to_h
    @dictionaries.dup
  end

  # String representation.
  #
  # @return [String] String representation
  def to_s
    "Repository(size: #{size})"
  end
  alias inspect to_s

  # Global repository instance.
  #
  # Created lazily on first access and memoized on the class.
  #
  # @return [Repository] The global repository
  #
  # @example Using the global repository
  #   Repository.instance.register(:en_US, dict)
  def self.instance
    @instance ||= new
  end

  # Register a dictionary in the global repository.
  #
  # @param key [Symbol, String] The key
  # @param dictionary [Base] The dictionary
  # @return [Repository] The global repository
  #
  # @example
  #   Repository.register(:en_US, dict)
  def self.register(key, dictionary)
    instance.register(key, dictionary)
  end

  # Get a dictionary from the global repository.
  #
  # @param key [Symbol, String] The key
  # @return [Base, nil] The dictionary or nil
  #
  # @example
  #   Repository.get(:en_US)
  def self.get(key)
    instance.get(key)
  end

  # Unregister a dictionary from the global repository.
  #
  # @param key [Symbol, String] The key
  # @return [Base, nil] The removed dictionary or nil
  def self.unregister(key)
    instance.unregister(key)
  end

  # Clear the global repository.
  #
  # @return [Repository] The global repository
  def self.clear
    instance.clear
  end

  # Get all keys from the global repository.
  #
  # @return [Array<Symbol>] All keys
  def self.keys
    instance.keys
  end

  # Check if a key is registered in the global repository.
  #
  # @param key [Symbol, String] The key
  # @return [Boolean] True if the key exists
  def self.registered?(key)
    instance.registered?(key)
  end
end
247
+ end
248
+ end