kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,444 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Dictionary Validation Script
5
+ #
6
+ # This script validates all dictionaries in the kotoshu/dictionaries catalog
7
+ # by loading them from GitHub and testing basic functionality.
8
+ #
9
+ # Usage:
10
+ # ruby script/validate_all_dictionaries.rb [--full] [--lang LANG] [--code CODE] [--format FMT] [--report]
11
+ #
12
+ # Options:
13
+ # --full Run full validation including suggestion tests (slow)
14
+ # --lang LANG Only test dictionaries for this language (e.g., en, de, fr)
15
+ # --code CODE Only test this specific dictionary code (e.g., en-GB, de-AT)
16
+ # --format FMT Only test dictionaries with this format (hunspell, plain_text)
17
+
18
+ require_relative "../lib/kotoshu"
19
+ require "optparse"
20
+ require "benchmark"
21
+ require "json"
22
+
23
+ # ANSI color codes for terminal output
24
+ module Colors
25
+ RESET = "\e[0m"
26
+ RED = "\e[31m"
27
+ GREEN = "\e[32m"
28
+ YELLOW = "\e[33m"
29
+ BLUE = "\e[34m"
30
+ MAGENTA = "\e[35m"
31
+ CYAN = "\e[36m"
32
+ BOLD = "\e[1m"
33
+ end
34
+
35
+ # Validation result for a single dictionary
36
+ class ValidationResult
37
+ attr_reader :code, :status, :load_time, :size, :test_results, :error
38
+
39
+ def initialize(code)
40
+ @code = code
41
+ @status = :pending # :pending, :success, :warning, :error
42
+ @load_time = nil
43
+ @size = nil
44
+ @test_results = {}
45
+ @error = nil
46
+ end
47
+
48
+ def success!(load_time, size)
49
+ @status = :success
50
+ @load_time = load_time
51
+ @size = size
52
+ end
53
+
54
+ def warning!(load_time, size, message)
55
+ @status = :warning
56
+ @load_time = load_time
57
+ @size = size
58
+ @test_results[:warning] = message
59
+ end
60
+
61
+ def error!(error)
62
+ @status = :error
63
+ @error = error
64
+ end
65
+
66
+ def add_test_result(name, passed, details = nil)
67
+ @test_results[name] = { passed: passed, details: details }
68
+ end
69
+
70
+ def success?
71
+ @status == :success
72
+ end
73
+
74
+ def error?
75
+ @status == :error
76
+ end
77
+
78
+ def warning?
79
+ @status == :warning
80
+ end
81
+
82
+ def to_h
83
+ {
84
+ code: @code,
85
+ status: @status,
86
+ load_time: @load_time,
87
+ size: @size,
88
+ test_results: @test_results,
89
+ error: @error&.message
90
+ }
91
+ end
92
+ end
93
+
94
+ # Main validator class
95
+ class DictionaryValidator
96
+ attr_reader :options, :results
97
+
98
+ def initialize(options = {})
99
+ @options = options
100
+ @results = []
101
+ @catalog = Kotoshu::Dictionaries::Catalog
102
+ end
103
+
104
+ # Run validation
105
+ def run
106
+ print_header
107
+
108
+ dictionaries = select_dictionaries
109
+
110
+ print "Validating #{dictionaries.size} dictionaries...\n\n"
111
+
112
+ dictionaries.each_with_index do |entry, index|
113
+ validate_dictionary(entry, index + 1, dictionaries.size)
114
+ end
115
+
116
+ print_summary
117
+
118
+ write_report if @options[:report]
119
+
120
+ exit_with_code
121
+ end
122
+
123
+ private
124
+
125
+ def select_dictionaries
126
+ dicts = @catalog.all
127
+
128
+ dicts = dicts.select { |d| d.language == @options[:lang] } if @options[:lang]
129
+ dicts = [dicts.find { |d| d.code.casecmp(@options[:code]).zero? }].compact if @options[:code]
130
+ dicts = dicts.select { |d| d.format == @options[:format].to_sym } if @options[:format]
131
+
132
+ dicts
133
+ end
134
+
135
+ def validate_dictionary(entry, index, total)
136
+ result = ValidationResult.new(entry.code)
137
+
138
+ print_status(entry, index, total, result)
139
+
140
+ begin
141
+ # Load dictionary with timing
142
+ dict = nil
143
+ load_time = Benchmark.realtime do
144
+ dict = entry.load
145
+ end
146
+
147
+ # Basic validation
148
+ size = dict.size
149
+
150
+ if size.zero?
151
+ result.warning!(load_time, size, "Dictionary has zero words")
152
+ elsif size < 100
153
+ result.warning!(load_time, size, "Dictionary has fewer than 100 words")
154
+ else
155
+ result.success!(load_time, size)
156
+ end
157
+
158
+ # Run the extended tests only when --full was given and the basic validation succeeded
159
+ run_full_tests(dict, entry, result) if @options[:full] && result.success?
160
+ rescue StandardError => e
161
+ result.error!(e)
162
+ end
163
+
164
+ @results << result
165
+ print_result(entry, result)
166
+ end
167
+
168
+ def run_full_tests(dict, entry, result)
169
+ # Test 1: Lookup basic word (varies by language)
170
+ test_word = basic_test_word(entry.language)
171
+ if dict.lookup?(test_word)
172
+ result.add_test_result(:basic_lookup, true, test_word)
173
+ else
174
+ result.add_test_result(:basic_lookup, false, "Could not find '#{test_word}'")
175
+ end
176
+
177
+ # Test 2: Lookup non-existent word
178
+ nonsense_word = nonsense_test_word(entry.language)
179
+ if !dict.lookup?(nonsense_word)
180
+ result.add_test_result(:nonexistent_lookup, true, nonsense_word)
181
+ else
182
+ result.add_test_result(:nonexistent_lookup, false, "Incorrectly found '#{nonsense_word}'")
183
+ end
184
+
185
+ # Test 3: Suggestions (if supported)
186
+ begin
187
+ misspelled = misspelled_test_word(entry.language)
188
+ suggestions = dict.suggest(misspelled, max_suggestions: 5)
189
+ if suggestions&.any?
190
+ result.add_test_result(:suggestions, true, "Found #{suggestions.size} suggestions for '#{misspelled}'")
191
+ else
192
+ result.add_test_result(:suggestions, false, "No suggestions for '#{misspelled}'")
193
+ end
194
+ rescue StandardError => e
195
+ result.add_test_result(:suggestions, false, e.message)
196
+ end
197
+
198
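+ # Test 4: Case-insensitive lookup (skipped for case-sensitive dictionaries). NOTE(review): the downcase branch repeats the Test 1 lookup, so this appears to pass whenever Test 1 does - confirm intent.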
199
+ return if dict.case_sensitive?
200
+
201
+ if dict.lookup?(test_word.upcase) || dict.lookup?(test_word.downcase)
202
+ result.add_test_result(:case_insensitive, true, "Case-insensitive lookup works")
203
+ else
204
+ result.add_test_result(:case_insensitive, false, "Case-insensitive lookup failed")
205
+ end
206
+ end
207
+
208
+ def basic_test_word(language)
209
+ # A very common word for each language code; unlisted languages fall back to "a"
210
+ {
211
+ "en" => "the",
212
+ "de" => "der",
213
+ "es" => "el",
214
+ "fr" => "le",
215
+ "it" => "il",
216
+ "pt" => "o",
217
+ "ru" => "и",
218
+ "nl" => "de",
219
+ "pl" => "i",
220
+ "cs" => "a",
221
+ "sv" => "och",
222
+ "da" => "og",
223
+ "no" => "og",
224
+ "fi" => "ja",
225
+ "tr" => "ve",
226
+ "ko" => "그",
227
+ "vi" => "là",
228
+ "ja" => "は",
229
+ "zh" => "的",
230
+ "ar" => "في",
231
+ "he" => "ו",
232
+ "el" => "το",
233
+ "hu" => "a",
234
+ "ro" => "şi",
235
+ "bg" => "и",
236
+ "uk" => "і",
237
+ "ga" => "an",
238
+ "cy" => "y",
239
+ "is" => "og",
240
+ "mt" => "u",
241
+ "lv" => "un",
242
+ "et" => "ja",
243
+ "lt" => "ir",
244
+ "sk" => "a",
245
+ "sl" => "in",
246
+ "hr" => "i",
247
+ "sr" => "и",
248
+ "sq" => "dhe",
249
+ "be" => "і",
250
+ "mk" => "и",
251
+ "hy" => "և",
252
+ "ka" => "და",
253
+ "fa" => "و",
254
+ "ur" => "اور",
255
+ "hi" => "और",
256
+ "bn" => "এবং",
257
+ "th" => "และ",
258
+ "id" => "dan",
259
+ "ms" => "dan",
260
+ "sw" => "na",
261
+ "af" => "en",
262
+ "ca" => "i",
263
+ "gl" => "e",
264
+ "eu" => "eta",
265
+ "lb" => "an",
266
+ "fy" => "en",
267
+ "ku" => "û",
268
+ "eo" => "kaj",
269
+ "ia" => "e"
270
+ }.fetch(language, "a")
271
+ end
272
+
273
+ def nonsense_test_word(_language)
274
+ # A single nonsense word that should not exist in any dictionary (the language argument is ignored)
275
+ "zzzzzzzzz"
276
+ end
277
+
278
+ def misspelled_test_word(language)
279
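+ # Sample inputs for the suggestion test, keyed by language code; unlisted languages use "teest". NOTE(review): some entries ("hallo", "ciao", "привет") are correctly spelled words rather than misspellings.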
280
+ {
281
+ "en" => "helo",
282
+ "de" => "hallo",
283
+ "es" => "ola",
284
+ "fr" => "bonjur",
285
+ "it" => "ciao",
286
+ "pt" => "ola",
287
+ "ru" => "привет",
288
+ "nl" => "halo",
289
+ "pl" => "czesc"
290
+ }.fetch(language, "teest")
291
+ end
292
+
293
+ def print_header
294
+ print "#{Colors::BOLD}Kotoshu Dictionary Validator#{Colors::RESET}\n"
295
+ print "#{"=" * 60}\n\n"
296
+
297
+ stats = @catalog.statistics
298
+ print "Catalog Statistics:\n"
299
+ print " Total dictionaries: #{stats[:total]}\n"
300
+ print " Hunspell dictionaries: #{stats[:hunspell]}\n"
301
+ print " Plain text dictionaries: #{stats[:plain_text]}\n"
302
+ print " Languages: #{stats[:languages]}\n"
303
+ print " Total words: #{stats[:total_words].round}\n"
304
+ print "\n"
305
+ print "#{"=" * 60}\n\n"
306
+ end
307
+
308
+ def print_status(entry, index, total, _result)
309
+ print "[#{index}/#{total}] #{Colors::CYAN}#{entry.code}#{Colors::RESET} - #{entry.description}\n"
310
+ print " Format: #{entry.format}, License: #{entry.license}\n"
311
+ end
312
+
313
+ def print_result(_entry, result)
314
+ if result.success?
315
+ print " #{Colors::GREEN}✓ PASS#{Colors::RESET}"
316
+ print " - #{result.size.round} words, #{(result.load_time * 1000).round(1)}ms"
317
+ print " - Tests: #{result.test_results.size}" if @options[:full]
318
+ elsif result.warning?
319
+ print " #{Colors::YELLOW}⚠ WARN#{Colors::RESET}"
320
+ print " - #{result.size.round} words, #{(result.load_time * 1000).round(1)}ms"
321
+ print " - #{result.test_results[:warning]}"
322
+ else
323
+ print " #{Colors::RED}✗ FAIL#{Colors::RESET}"
324
+ print " - #{result.error.class}: #{result.error.message}"
325
+ end
326
+ print "\n"
327
+
328
+ # Print test results details
329
+ if @options[:full] && result.test_results.any?
330
+ result.test_results.each do |name, test_result|
331
+ next if name == :warning
332
+
333
+ status = test_result[:passed] ? "#{Colors::GREEN}✓#{Colors::RESET}" : "#{Colors::RED}✗#{Colors::RESET}"
334
+ print " #{status} #{name}: #{test_result[:details]}\n"
335
+ end
336
+ end
337
+
338
+ print "\n"
339
+ end
340
+
341
+ def print_summary
342
+ print "#{"=" * 60}\n"
343
+ print "#{Colors::BOLD}Validation Summary#{Colors::RESET}\n"
344
+ print "#{"=" * 60}\n\n"
345
+
346
+ total = @results.size
347
+ success = @results.count(&:success?)
348
+ warnings = @results.count(&:warning?)
349
+ errors = @results.count(&:error?)
350
+
351
+ print "Total: #{total}\n"
352
+ print "#{Colors::GREEN}✓ Passed: #{success}#{Colors::RESET}\n"
353
+ print "#{Colors::YELLOW}⚠ Warnings: #{warnings}#{Colors::RESET}\n"
354
+ print "#{Colors::RED}✗ Failed: #{errors}#{Colors::RESET}\n"
355
+ print "\n"
356
+
357
+ if success.positive?
358
+ avg_load_time = @results.select(&:success?).map(&:load_time).sum / success
359
+ avg_size = @results.select(&:success?).map(&:size).sum / success
360
+ print "Average load time: #{(avg_load_time * 1000).round(1)}ms\n"
361
+ print "Average size: #{avg_size.round} words\n"
362
+ print "\n"
363
+ end
364
+
365
+ if errors.positive?
366
+ print "#{Colors::BOLD}Failed Dictionaries:#{Colors::RESET}\n"
367
+ @results.select(&:error?).each do |result|
368
+ print " #{Colors::RED}#{result.code}#{Colors::RESET}: #{result.error.message}\n"
369
+ end
370
+ print "\n"
371
+ end
372
+
373
+ return unless warnings.positive?
374
+
375
+ print "#{Colors::BOLD}Warnings:#{Colors::RESET}\n"
376
+ @results.select(&:warning?).each do |result|
377
+ print " #{Colors::YELLOW}#{result.code}#{Colors::RESET}: #{result.test_results[:warning]}\n"
378
+ end
379
+ print "\n"
380
+ end
381
+
382
+ def write_report
383
+ report_path = "dictionary_validation_report.json"
384
+ File.write(report_path, JSON.pretty_generate({
385
+ timestamp: Time.now.iso8601,
386
+ summary: {
387
+ total: @results.size,
388
+ success: @results.count(&:success?),
389
+ warnings: @results.count(&:warning?),
390
+ errors: @results.count(&:error?)
391
+ },
392
+ results: @results.map(&:to_h)
393
+ }))
394
+ print "Report written to: #{report_path}\n"
395
+ end
396
+
397
+ def exit_with_code
398
+ # Exit with status 1 if any dictionary failed validation; otherwise exit with status 0
399
+ exit 1 if @results.any?(&:error?)
400
+ exit 0
401
+ end
402
+ end
403
+
404
+ # Parse options
405
+ options = {
406
+ full: false,
407
+ lang: nil,
408
+ code: nil,
409
+ format: nil,
410
+ report: false
411
+ }
412
+
413
+ OptionParser.new do |opts|
414
+ opts.banner = "Usage: ruby script/validate_all_dictionaries.rb [options]"
415
+
416
+ opts.on("--full", "Run full validation including tests") do
417
+ options[:full] = true
418
+ end
419
+
420
+ opts.on("--lang LANG", "Filter by language (e.g., en, de, fr)") do |lang|
421
+ options[:lang] = lang
422
+ end
423
+
424
+ opts.on("--code CODE", "Filter by dictionary code (e.g., en-GB)") do |code|
425
+ options[:code] = code
426
+ end
427
+
428
+ opts.on("--format FORMAT", "Filter by format (hunspell, plain_text)") do |fmt|
429
+ options[:format] = fmt
430
+ end
431
+
432
+ opts.on("--report", "Write JSON report file") do
433
+ options[:report] = true
434
+ end
435
+
436
+ opts.on("-h", "--help", "Show this message") do
437
+ puts opts
438
+ exit
439
+ end
440
+ end.parse!
441
+
442
+ # Run validator
443
+ validator = DictionaryValidator.new(options)
444
+ validator.run
data/sig/kotoshu.rbs ADDED
@@ -0,0 +1,4 @@
1
+ module Kotoshu
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
data/test_oop.rb ADDED
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/setup"
4
+ require_relative "lib/kotoshu"
5
+
6
+ # Test IndexedDictionary
7
+ puts "=== Testing IndexedDictionary ==="
8
+ dict = Kotoshu::Core::IndexedDictionary.new(%w[hello world help held heap])
9
+ puts "Has 'hello': #{dict.has_word?("hello")}"
10
+ puts "Has 'HELLO' (ignorecase): #{dict.has_word_ignorecase?("HELLO")}"
11
+ puts "Words starting with 'he': #{dict.find_by_prefix("he").inspect}"
12
+ puts "Words ending with 'ld': #{dict.find_by_suffix("ld").inspect}"
13
+ puts "Words with length 5: #{dict.find_by_length(5).inspect}"
14
+ puts "Statistics: #{dict.statistics.inspect}"
15
+ puts
16
+
17
+ # Test Trie
18
+ puts "=== Testing Trie ==="
19
+ trie = Kotoshu::Core::Trie::Builder.from_array(%w[hello help held heap world])
20
+ puts "Has 'hello': #{trie.lookup("hello")}"
21
+ puts "Has prefix 'he': #{trie.has_prefix?("he")}"
22
+ puts "Words with prefix 'he': #{trie.words_with_prefix("he").inspect}"
23
+ puts "Suggestions for 'hel': #{trie.suggestions("hel").inspect}"
24
+ puts "All words: #{trie.all_words.inspect}"
25
+ puts
26
+
27
+ # Test Suggestion
28
+ puts "=== Testing Suggestion ==="
29
+ suggestion = Kotoshu::Suggestions::Suggestion.new(
30
+ word: "hello",
31
+ distance: 1,
32
+ confidence: 0.9,
33
+ source: :test
34
+ )
35
+ puts "High confidence: #{suggestion.high_confidence?}"
36
+ puts "Combined score: #{suggestion.combined_score}"
37
+ puts "Same word as 'HELLO': #{suggestion.same_word?("HELLO")}"
38
+ puts
39
+
40
+ # Test SuggestionSet
41
+ puts "=== Testing SuggestionSet ==="
42
+ suggestions = Kotoshu::Suggestions::SuggestionSet.from_words(
43
+ %w[hello help held],
44
+ source: :test
45
+ )
46
+ puts "Size: #{suggestions.size}"
47
+ puts "First: #{suggestions.first.inspect}"
48
+ puts "Has word 'help': #{suggestions.has_word?("help")}"
49
+ puts "Top 2: #{suggestions.top(2).map(&:word).inspect}"
50
+ puts
51
+
52
+ # Test Context
53
+ puts "=== Testing Context ==="
54
+ context = Kotoshu::Suggestions::Context.new(
55
+ word: "helo",
56
+ dictionary: dict,
57
+ max_results: 5
58
+ )
59
+ puts "Word: #{context.word}"
60
+ puts "Max results: #{context.max_results}"
61
+ puts
62
+
63
+ # Test EditDistanceStrategy
64
+ puts "=== Testing EditDistanceStrategy ==="
65
+ strategy = Kotoshu::Suggestions::Strategies::EditDistanceStrategy.new
66
+ result = strategy.generate(context)
67
+ puts "Suggestions for 'helo': #{result.to_words.inspect}"
68
+ puts
69
+
70
+ # Test CompositeStrategy (Pipeline)
71
+ puts "=== Testing CompositeStrategy ==="
72
+ pipeline = Kotoshu.suggestion_pipeline(
73
+ Kotoshu::Suggestions::Strategies::EditDistanceStrategy.new
74
+ )
75
+ result = pipeline.generate(context)
76
+ puts "Pipeline suggestions: #{result.to_words.inspect}"
77
+ puts
78
+
79
+ puts "All tests passed!"