kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,627 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "thor"
4
+ require_relative "../kotoshu"
5
+ require_relative "cli/cache_command"
6
+ require_relative "cli/errors"
7
+
8
# Thor subcommand that describes the dictionary backends Kotoshu can read.
#
# @example
#   kotoshu dict list
#   kotoshu dict info hunspell
class DictCommand < Thor
  # One summary line per backend, in the order `list` prints them.
  TYPE_SUMMARIES = [
    " - unix_words: Unix system dictionary",
    " - plain_text: Plain text word list",
    " - custom: Custom in-memory dictionary",
    " - hunspell: Hunspell (.dic/.aff)",
    " - cspell: CSpell (.txt/.trie)"
  ].freeze

  # Lines printed by `info`, keyed by backend type.
  TYPE_DETAILS = {
    unix_words: [
      "UnixWords Dictionary:",
      " Reads from Unix system dictionary files",
      " Default paths:",
      " - /usr/share/dict/words",
      " - /usr/share/dict/web2",
      " - /usr/share/dict/american-english"
    ],
    plain_text: [
      "PlainText Dictionary:",
      " Reads from plain text word lists",
      " One word per line, # comments supported"
    ],
    custom: [
      "Custom Dictionary:",
      " In-memory dictionary for user-defined words"
    ],
    hunspell: [
      "Hunspell Dictionary:",
      " Reads Hunspell .dic and .aff files",
      " Supports morphological affix rules"
    ],
    cspell: [
      "CSpell Dictionary:",
      " Reads CSpell .txt or .trie files",
      " Uses trie data structure for fast lookups"
    ]
  }.freeze

  desc "list", "List available dictionaries"
  # Print every known dictionary type with a one-line summary.
  def list
    puts "Available dictionary types:"
    puts TYPE_SUMMARIES
  end

  desc "info TYPE", "Show information about a dictionary type"
  # Print the description of one dictionary type, or a hint pointing at
  # `dict list` when the type is not recognised.
  #
  # @param type [String] backend name as typed on the command line
  def info(type)
    details = TYPE_DETAILS.fetch(type.to_sym) do
      [
        "Unknown dictionary type: #{type}",
        "Run 'kotoshu dict list' for available types"
      ]
    end
    puts details
  end
end
55
+
56
+ module Kotoshu
57
+ module Cli
58
# LAZY: CLI helper components. Each constant is registered with `autoload`,
# so its file is only required the first time the constant is referenced.
{
  NavigationManager: "kotoshu/cli/navigation_manager",
  DisplayFormatter: "kotoshu/cli/display_formatter",
  InteractiveReviewer: "kotoshu/cli/interactive_reviewer",
  BatchReporter: "kotoshu/cli/batch_reporter",
  AutoSetup: "kotoshu/cli/auto_setup",
  StatusReport: "kotoshu/cli/status_report",
  LanguageResolver: "kotoshu/cli/language_resolver",
  ProgressReporter: "kotoshu/cli/progress_reporter"
}.each { |const_name, feature| autoload const_name, feature }
67
+
68
+ # Command-line interface for Kotoshu spell checker.
69
+ #
70
+ # Two-stage model:
71
+ # Stage 1 (slow, network): `kotoshu setup LANG` downloads/registers resources
72
+ # Stage 2 (instant, cache-only): `kotoshu check FILE` uses cached resources
73
+ #
74
+ # Exit codes:
75
+ # 0 — no errors found / setup succeeded
76
+ # 1 — spelling errors found
77
+ # 2 — usage error (file not found, bad flags)
78
+ # 3 — resource not set up / setup failure (network, integrity)
79
+ #
80
+ # Commands raise Errors::CliError subclasses; the dispatcher in .start
81
+ # catches them and exits with the error's exit_status.
82
+ class Cli < Thor
83
# Options accepted by every command of this CLI.
class_option :language, aliases: ["-l"], type: :string, default: "auto",
                        desc: "Language code (auto, en, de, es, fr, pt, ru)"

class_option :format, aliases: ["-f"], type: :string, default: "text",
                      enum: %w[text json sarif],
                      desc: "Output format (text, json, sarif)"

class_option :interactive, aliases: ["-i"], type: :boolean, default: false,
                           desc: "Interactively review each error after check"

class_option :verbose, aliases: ["-v"], type: :boolean, default: false,
                       desc: "Enable verbose output"
107
+
108
desc "check [FILE]", "Check spelling in a file or stdin"
long_desc <<~DESC
  Checks spelling in the given file (or stdin if no file is given).
  Cache-only — never downloads. Run `kotoshu setup LANG` first.

  Exit codes:
  0 — no errors
  1 — spelling errors found
  2 — usage error (bad flags, file not found)
  3 — language not set up (run `kotoshu setup LANG`)
DESC
# Spell-check TARGET (a path, or stdin when omitted) and print the report
# in the format selected by --format. When the result is a failure the
# optional interactive review runs first and the process then exits with
# status 1.
#
# @param target [String, nil] file to check; nil means read stdin
def check(target = nil)
  apply_configuration!

  text, source = read_target(target)
  result = run_check(text)
  display_result(result, source)
  return unless result.failed?

  interactive_review(result, source) if options[:interactive]
  exit 1
end
128
+
129
desc "setup [LANGUAGE] [LANGUAGE ...]", "Set up languages (download or register local files)"
long_desc <<~DESC
  Stage 1 of the two-stage model. Downloads spelling/frequency/model
  resources for the named language(s), or registers local .aff/.dic
  files you already have on disk. After setup, `kotoshu check` runs
  instantly with no network access.

  With no args, lists currently set up languages.

  Sources (one per invocation, applies to all listed languages):
  --aff FILE --dic FILE use specific local Hunspell files
  --from DIR look for {lang}.aff and {lang}.dic in DIR
  (neither) download from kotoshu/dictionaries

  Examples:
  kotoshu setup en de fr # download from GitHub
  kotoshu setup en --want spelling,frequency # also fetch Kelly list
  kotoshu setup en --aff /p/en.aff --dic /p/en.dic
  kotoshu setup en --from /usr/share/hunspell/
  kotoshu setup --force en # re-download
  kotoshu setup --list # show what's set up

  Exit codes:
  0 — every language set up successfully
  3 — at least one language failed (network down, integrity, etc.)
DESC
# Where the resources come from.
method_option :aff, desc: "Path to local .aff file", type: :string
method_option :dic, desc: "Path to local .dic file", type: :string
method_option :from, desc: "Directory containing local .aff/.dic", type: :string
method_option :frequency, desc: "Path to local frequency.json", type: :string
# What to set up and how strictly.
method_option :want, desc: "Comma-separated: spelling,frequency,model",
                     type: :string, default: "spelling"
method_option :force, desc: "Re-fetch even if already cached",
                      type: :boolean, default: false
method_option :strict, desc: "Re-raise on optional-resource failure during setup",
                       type: :boolean, default: false
method_option :list, desc: "List currently set up languages and exit",
                     type: :boolean, default: false
175
# Set up every named language and print one outcome line per language.
#
# With --list, or with no languages, only the already set up languages are
# listed. Otherwise each language is passed to Kotoshu.setup; a failure for
# one language is reported and does not stop the remaining ones.
#
# The closing summary counts only the languages that were actually set up
# (it used to print the number of languages requested, so a run in which
# everything failed still claimed success).
#
# @param languages [Array<String>] language codes given on the command line
# @raise [Errors::ResourceUnavailable] when at least one language failed
# @raise [Errors::UsageError] from setup_source_options on bad source flags
def setup(*languages)
  apply_configuration!

  if options[:list] || languages.empty?
    list_setup
    return
  end

  want = (options[:want] || "spelling").split(",").map { |part| part.strip.to_sym }
  opts = setup_source_options(languages)
  opts[:want] = want
  opts[:force] = options[:force]
  opts[:strict] = options[:strict]

  # Collect the languages whose setup raised; everything else succeeded.
  failed = languages.reject do |lang|
    print "Setup #{lang}... "
    begin
      outcome = with_progress_reporter(label: lang) { Kotoshu.setup(lang, **opts) }
      describe_setup_result(outcome)
      true
    rescue Kotoshu::Error, ArgumentError => e
      puts "FAIL: #{e.message}"
      false
    end
  end

  puts "Set up #{languages.size - failed.size} language(s)."
  return if failed.empty?

  raise Errors::ResourceUnavailable, "failed to set up: #{failed.join(', ')}"
end
210
+
211
# Deprecated spelling of `setup`, kept for backward compatibility and hidden
# from help output. It declares the same options as `setup` and forwards the
# language list unchanged. New code should use `setup`.
desc "fetch LANGUAGE [LANGUAGE ...]", "Alias for `setup` (deprecated)", hide: true
method_option :aff, desc: "Path to local .aff file", type: :string
method_option :dic, desc: "Path to local .dic file", type: :string
method_option :from, desc: "Directory containing local .aff/.dic", type: :string
method_option :frequency, desc: "Path to local frequency.json", type: :string
method_option :want, desc: "Comma-separated: spelling,frequency,model",
                     type: :string, default: "spelling"
method_option :force, desc: "Re-fetch even if already cached",
                      type: :boolean, default: false
method_option :strict, desc: "Re-raise on optional-resource failure during setup",
                       type: :boolean, default: false
method_option :list, desc: "List currently set up languages and exit",
                     type: :boolean, default: false
def fetch(*languages)
  setup(*languages)
end
236
+
237
# Nested command groups: `kotoshu dict ...` and `kotoshu cache ...`.
{
  "dict" => ["Dictionary operations", DictCommand],
  "cache" => ["Cache management", CacheCommand]
}.each do |group, (summary, handler)|
  desc "#{group} SUBCOMMAND", summary
  subcommand group, handler
end
242
+
243
desc "status", "Show setup, cache, and runtime status"
long_desc <<~DESC
  Prints a snapshot of the kotoshu installation: which languages are
  set up (with per-resource status), cache disk usage, audit log path,
  default language, offline flag, and whether onnxruntime is loaded.

  With --json, emits the same report as a JSON object for tooling.
DESC
method_option :json, desc: "Emit the report as JSON", type: :boolean, default: false
# Build a StatusReport for the running version and print it, as JSON when
# --json is given and as plain text otherwise.
def status
  report = StatusReport.build(version: Kotoshu::VERSION)
  rendered = options[:json] ? status_json(report) : status_text(report)
  puts rendered
end
263
+
264
desc "version", "Show version information"
# Print the Kotoshu version followed by the running Ruby version.
def version
  puts "Kotoshu version #{Kotoshu::VERSION}", "Ruby #{RUBY_VERSION}"
end

# `--version` and `-V` are routed to the `version` command.
map %w[--version -V] => :version
271
+
272
# Dispatch entry point — bypasses Thor's start rescue so we can honor
# exit_status from Errors::CliError subclasses. Thor::Error still falls
# back to exit 1 for framework-level errors (bad flags, etc.).
#
# Kotoshu::ResourceNotSetupError is intercepted around the dispatch: AutoSetup
# is consulted once and, if it returns a truthy value, the dispatch is
# retried. If AutoSetup declines, or the resource is still missing after the
# single retry, the error is converted to Errors::ResourceUnavailable.
#
# The conversion happens in an inner begin/rescue on purpose. An exception
# raised inside a rescue clause is not seen by sibling rescue clauses of the
# same block, so raising it next to `rescue Errors::CliError` would let it
# escape with a backtrace instead of the intended exit status. The retry is
# limited to one attempt so a setup that reports success without making the
# resource available cannot loop forever.
#
# NOTE(review): this relies on Errors::ResourceUnavailable being an
# Errors::CliError subclass — confirm in cli/errors. An exception raised by
# AutoSetup itself is not translated here and propagates to the caller.
#
# @param given_args [Array<String>] command-line arguments (not mutated)
# @param config [Hash] Thor dispatch configuration; :shell is filled in
# @return [Object] whatever the dispatched command returns
def self.start(given_args = ARGV, config = {})
  config[:shell] ||= Thor::Base.shell.new
  auto_setup_offered = false

  begin
    dispatch(nil, given_args.dup, nil, config)
  rescue Kotoshu::ResourceNotSetupError => e
    raise Errors::ResourceUnavailable, e.message if auto_setup_offered || !AutoSetup.new.call(e)

    auto_setup_offered = true
    retry
  end
rescue Errors::CliError => e
  warn "Error: #{e.message}"
  exit e.exit_status
rescue Thor::Error => e
  warn e.message
  exit 1
end

# Thor hook: returns false, leaving all exit handling to .start above.
def self.exit_on_failure?
  false
end
298
+
299
+ private
300
+
301
# Reset the global configuration, then apply --language as the default
# language unless it was left at "auto".
def apply_configuration!
  Kotoshu::Configuration.reset
  cfg = Kotoshu::Configuration.instance

  language = options[:language]
  return if !language || language == "auto"

  cfg.default_language = language
end
306
+
307
# Install a ProgressReporter (writing to $stderr) as the configuration's
# download_reporter for the duration of the block, then restore the prior
# value, including when the block raises.
#
# The prior reporter is read before the begin/ensure on purpose: if that
# read fails there is nothing to restore, and the ensure must not overwrite
# the configured reporter with nil.
#
# @param label [String] label handed to the ProgressReporter
# @yield runs with the temporary reporter installed
# @return [Object] the block's return value
def with_progress_reporter(label:)
  prior = Kotoshu.configuration.download_reporter

  begin
    Kotoshu.configuration.download_reporter = ProgressReporter.new(
      output: $stderr,
      label: label
    )
    yield
  ensure
    Kotoshu.configuration.download_reporter = prior
  end
end
321
+
322
# Render a status report as plain text with five sections: version header,
# Setup, Cache, Semantic and Other.
#
# @param report [#version, #resources, #cache_path, ...] report object as
#   returned by StatusReport.build
# @return [String] newline-joined report without a trailing newline
def status_text(report)
  out = ["Kotoshu #{report.version}", "", "Setup:"]

  if report.resources.empty?
    out << " (no languages set up — run `kotoshu setup LANG`)"
  else
    report.resources.each do |entry|
      mark = entry.available ? "✓" : "✗"
      size = entry.available ? StatusReport.format_bytes(entry.size_bytes) : "—"
      # The cache date is appended only for resources that have one.
      stamp = entry.cached_at ? ", cached #{entry.cached_at.strftime('%Y-%m-%d')}" : ""
      out << format(" %-4s %-10s %s %s%s", entry.language, entry.resource, mark, size, stamp)
    end
  end

  out.push(
    "",
    "Cache:",
    " Path #{report.cache_path}",
    " Size #{StatusReport.format_bytes(report.cache_size_bytes)}",
    " Languages #{report.languages_setup.size}",
    "",
    "Semantic:"
  )

  runtime = report.onnx_loaded ? "loaded" : "not loaded (gem install onnxruntime to enable)"
  out << " onnxruntime #{runtime}"
  models = report.languages_with_model
  model_summary = models.empty? ? "0" : "#{models.size} (#{models.join(', ')})"
  out.push(" Active models #{model_summary}", "", "Other:")

  out << if report.audit_log_path
           " Audit log #{report.audit_log_path} (#{StatusReport.format_bytes(report.audit_log_size_bytes)})"
         else
           " Audit log (none yet — created on first audited operation)"
         end
  out << " Default lang #{report.default_language || '(none)'}"
  out << " Offline mode #{report.offline ? 'yes' : 'no'}"
  out.join("\n")
end
366
+
367
# Render a status report as pretty-printed JSON. The top-level keys are
# version, setup, cache, semantic, audit_log, default_language and offline.
#
# @param report [Object] report object as returned by StatusReport.build
# @return [String] JSON document
def status_json(report)
  require "json"

  payload = { version: report.version }

  payload[:setup] = report.resources.map do |entry|
    {
      language: entry.language,
      resource: entry.resource.to_s,
      available: entry.available,
      size_bytes: entry.size_bytes,
      cached_at: entry.cached_at&.iso8601
    }
  end

  payload[:cache] = {
    path: report.cache_path,
    size_bytes: report.cache_size_bytes,
    languages: report.languages_setup.size
  }

  payload[:semantic] = {
    onnxruntime_loaded: report.onnx_loaded,
    active_models: report.languages_with_model
  }

  # Stays nil (JSON null) when no audit log path is reported.
  log_path = report.audit_log_path
  payload[:audit_log] = log_path && { path: log_path, size_bytes: report.audit_log_size_bytes }

  payload[:default_language] = report.default_language
  payload[:offline] = report.offline

  JSON.pretty_generate(payload)
end
399
+
400
# Read the text to check and return it together with a display name.
#
# A nil target means stdin. Otherwise the target must be a readable,
# non-directory path. Directories and read failures (permissions, I/O
# errors) are reported as usage errors; previously they surfaced as raw
# Errno exceptions because only File.exist? was checked, and that is also
# true for directories.
#
# @param target [String, nil] path given on the command line
# @return [Array(String, String)] the text and its source label
# @raise [Errors::UsageError] when the target is missing, is a directory,
#   or cannot be read
def read_target(target)
  return [$stdin.read, "<stdin>"] if target.nil?

  raise Errors::UsageError, "File not found: #{target}" unless File.exist?(target)
  raise Errors::UsageError, "Not a file (is a directory): #{target}" if File.directory?(target)

  begin
    [File.read(target, encoding: Kotoshu.configuration.encoding), target]
  rescue SystemCallError => e
    raise Errors::UsageError, "Cannot read #{target}: #{e.message}"
  end
end
409
+
410
# Resolve the language for the text, obtain the matching spellchecker and
# run it over the text.
#
# @param text [String] text to check
# @return [Object] whatever the spellchecker's #check returns
# @raise [Errors::ResourceUnavailable] when Kotoshu reports that the
#   dictionary was not found
def run_check(text)
  Kotoshu.spellchecker_for(resolve_language(text)).check(text)
rescue Kotoshu::DictionaryNotFoundError => e
  raise Errors::ResourceUnavailable, e.message
end
417
+
418
# Ask LanguageResolver which language to use for the text, given the
# --language flag and the configured default. When the resolver attaches a
# note, it is written to $stderr prefixed with "# ".
#
# @param text [String] text about to be checked
# @return [Object] the language reported by the resolver
def resolve_language(text)
  resolver = LanguageResolver.new(
    flag_value: options[:language],
    default_language: Kotoshu.configuration.default_language
  )
  resolution = resolver.resolve(text: text)

  note = resolution.note
  $stderr.puts "# #{note}" if note
  resolution.language
end
427
+
428
# Translate the source flags (--aff/--dic, --from, --frequency) into the
# option hash passed to Kotoshu.setup. --aff/--dic take precedence over
# --from; --frequency is added whenever it is given.
#
# @param languages [Array<String>] languages named on the command line
# @return [Hash{Symbol => String}] only the keys that were supplied
# @raise [Errors::UsageError] when --aff/--dic is used with more than one
#   language, or when only one of the two is given
def setup_source_options(languages)
  aff = options[:aff]
  dic = options[:dic]
  source = {}

  if aff || dic
    raise Errors::UsageError, "--aff and --dic require exactly one language" unless languages.size == 1

    raise Errors::UsageError, "--aff and --dic must both be given" unless aff && dic

    source[:aff] = aff
    source[:dic] = dic
  elsif options[:from]
    source[:from] = options[:from]
  end

  frequency = options[:frequency]
  source[:frequency] = frequency if frequency
  source
end
443
+
444
# Print the one-line "OK" summary for a successful setup. A spelling or
# frequency value that is nil/false is shown as "skipped".
#
# @param result [#spelling, #frequency, #source] value returned by
#   Kotoshu.setup
def describe_setup_result(result)
  puts "OK (spelling: #{result.spelling || 'skipped'}, " \
       "frequency: #{result.frequency || 'skipped'}, " \
       "source: #{result.source})"
end
450
+
451
# Print the languages reported by Kotoshu.languages_setup, one per line, or
# a hint to run `kotoshu setup LANG` when there are none.
def list_setup
  langs = Kotoshu.languages_setup

  if langs.empty?
    puts "No languages set up. Run `kotoshu setup LANG` to add one."
  else
    puts "Set up languages:"
    langs.each { |code| puts " #{code}" }
  end
  nil
end
461
+
462
# Print the check result in the format chosen with --format. Anything other
# than "json" or "sarif" falls back to the text format.
#
# @param result [Object] check result
# @param source [String] label of the checked input
def display_result(result, source)
  rendered =
    case options[:format]
    when "json" then format_as_json(result, source)
    when "sarif" then format_as_sarif(result, source)
    else format_as_text(result, source)
    end
  puts rendered
end
472
+
473
# Render a check result as plain text: a single "OK" line on success,
# otherwise a "FAIL" header followed by one line per error with up to three
# suggestions.
#
# @param result [Object] check result responding to success?, word_count,
#   error_count and each_error
# @param source [String] label of the checked input
# @return [String] report without a trailing newline
def format_as_text(result, source)
  return "OK #{source} (#{result.word_count} words, no errors)" if result.success?

  report = ["FAIL #{source} (#{result.error_count} errors)"]
  result.each_error do |error|
    hint = error.has_suggestions? ? " -> #{error.top_suggestions(3).join(', ')}" : ""
    report << " #{error.word}#{hint}"
  end
  report.join("\n")
end
490
+
491
# Render a check result as pretty-printed JSON: the result's own as_json
# hash with a "source" entry added to it.
#
# @param result [#as_json] check result
# @param source [String] label of the checked input
# @return [String] JSON document
def format_as_json(result, source)
  require "json"

  document = result.as_json.tap { |fields| fields["source"] = source }
  JSON.pretty_generate(document)
end
498
+
499
# Render a check result as a SARIF 2.1.0 document with a single run and a
# single rule, "kotoshu/spelling". Each error becomes one warning-level
# result whose region is given by the error's character offset (0 when the
# error has no position) and the length of the misspelled word.
#
# @param result [#errors] check result
# @param source [String] label of the checked input
# @return [String] pretty-printed JSON
def format_as_sarif(result, source)
  require "json"

  artifact_uri = source_for_sarif(source)

  findings = result.errors.map do |err|
    candidates = err.top_suggestions(3)
    message = "'#{err.word}' is not in the dictionary."
    message += " Suggestions: #{candidates.join(', ')}" unless candidates.empty?

    region = { "charOffset" => err.position || 0, "charLength" => err.word.length }
    location = {
      "physicalLocation" => {
        "artifactLocation" => { "uri" => artifact_uri },
        "region" => region
      }
    }

    {
      "ruleId" => "kotoshu/spelling",
      "level" => "warning",
      "message" => { "text" => message },
      "locations" => [location]
    }
  end

  rule = {
    "id" => "kotoshu/spelling",
    "name" => "SpellingError",
    "shortDescription" => { "text" => "Word not found in the active dictionary." }
  }
  driver = {
    "name" => "kotoshu",
    "version" => Kotoshu::VERSION,
    "informationUri" => "https://github.com/kotoshu/kotoshu",
    "rules" => [rule]
  }

  JSON.pretty_generate(
    "version" => "2.1.0",
    "$schema" => "https://json.schemastore.org/sarif-2.1.0.json",
    "runs" => [{ "tool" => { "driver" => driver }, "results" => findings }]
  )
end

# Artifact URI used in SARIF output: the "<stdin>" label becomes "stdin",
# any other source is used unchanged.
def source_for_sarif(source)
  return "stdin" if source == "<stdin>"

  source
end
556
+
557
# Walk the user through the errors of a failed check, one prompt per error,
# reading commands from $stdin until the last error is passed, the user
# quits, or stdin reaches end of file.
#
# Decisions are only recorded in memory and summarised at the end; no file
# is rewritten.
#
# Each error holds at most one decision. Going back with [p] and deciding
# again replaces the earlier decision. Before this fix an error could be
# counted as both accepted and skipped, which made the "unhandled" figure
# in the summary wrong.
#
# @param result [#errors] check result
# @param source [String] label of the checked input, shown in the header
def interactive_review(result, source)
  require "set" # Set is only autoloaded on Ruby 3.2+

  errors = result.errors
  return if errors.empty?

  index = 0
  accepted = {}        # error index => chosen suggestion
  skipped = Set.new    # error indices explicitly skipped

  puts
  puts "Interactive review: #{errors.size} error(s) in #{source}"
  puts "Commands: [1-9] accept, [s] skip, [n]/Enter next, [p] prev, [l] list, [q] quit"

  while index < errors.size
    err = errors[index]
    puts
    puts "[#{index + 1}/#{errors.size}] '#{err.word}' (offset #{err.position || '?'})"
    suggestions = err.top_suggestions(9)
    if suggestions.empty?
      puts " (no suggestions)"
    else
      suggestions.each_with_index { |s, i| puts " [#{i + 1}] #{s}" }
    end
    print "> "
    input = $stdin.gets
    break if input.nil? # end of input: stop prompting, still print the summary

    # strip (not just chomp) so stray spaces around a command are tolerated
    command = input.strip.downcase

    case command
    when "q"
      puts "Quitting review."
      break
    when "n", ""
      index += 1
    when "p"
      index = [index - 1, 0].max
    when "l"
      errors.each_with_index do |e, i|
        marker = if accepted.key?(i)
                   "✓"
                 elsif skipped.include?(i)
                   "s"
                 else
                   " "
                 end
        puts " #{marker} #{i + 1}. #{e.word}"
      end
    when "s"
      accepted.delete(index) # a skip replaces an earlier accept
      skipped << index
      index += 1
    when /\A[1-9]\z/
      suggestion = suggestions[command.to_i - 1]
      if suggestion
        skipped.delete(index) # an accept replaces an earlier skip
        accepted[index] = suggestion
        puts " → '#{err.word}' → '#{suggestion}' (recorded)"
        index += 1
      else
        puts " No suggestion at that number."
      end
    else
      puts " Unknown command."
    end
  end

  puts
  puts "Review complete: #{accepted.size} accepted, #{skipped.size} skipped, " \
       "#{errors.size - accepted.size - skipped.size} unhandled."
  puts "Note: 0.3 records decisions but does not rewrite source files." unless accepted.empty?
end
625
+ end
626
+ end
627
+ end