kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
data/lib/kotoshu/cli.rb
ADDED
|
@@ -0,0 +1,627 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "thor"
|
|
4
|
+
require_relative "../kotoshu"
|
|
5
|
+
require_relative "cli/cache_command"
|
|
6
|
+
require_relative "cli/errors"
|
|
7
|
+
|
|
8
|
+
# Dictionary command class.
|
|
9
|
+
#
|
|
10
|
+
# @example
|
|
11
|
+
# kotoshu dict list
|
|
12
|
+
# kotoshu dict info hunspell
|
|
13
|
+
class DictCommand < Thor
  # Dictionary type name => one-line summary, in the order `list` prints them.
  TYPE_SUMMARIES = {
    unix_words: "Unix system dictionary",
    plain_text: "Plain text word list",
    custom: "Custom in-memory dictionary",
    hunspell: "Hunspell (.dic/.aff)",
    cspell: "CSpell (.txt/.trie)"
  }.freeze

  # Dictionary type name => the lines `info` prints for that type.
  TYPE_DETAILS = {
    unix_words: [
      "UnixWords Dictionary:",
      " Reads from Unix system dictionary files",
      " Default paths:",
      " - /usr/share/dict/words",
      " - /usr/share/dict/web2",
      " - /usr/share/dict/american-english"
    ],
    plain_text: [
      "PlainText Dictionary:",
      " Reads from plain text word lists",
      " One word per line, # comments supported"
    ],
    custom: [
      "Custom Dictionary:",
      " In-memory dictionary for user-defined words"
    ],
    hunspell: [
      "Hunspell Dictionary:",
      " Reads Hunspell .dic and .aff files",
      " Supports morphological affix rules"
    ],
    cspell: [
      "CSpell Dictionary:",
      " Reads CSpell .txt or .trie files",
      " Uses trie data structure for fast lookups"
    ]
  }.freeze

  desc "list", "List available dictionaries"
  # Print every known dictionary type with its one-line summary.
  def list
    puts "Available dictionary types:",
         *TYPE_SUMMARIES.map { |name, summary| " - #{name}: #{summary}" }
  end

  desc "info TYPE", "Show information about a dictionary type"
  # Print the description of one dictionary type, or a hint when the
  # type is not recognised.
  #
  # @param type [String] dictionary type name as given on the command line
  def info(type)
    description = TYPE_DETAILS.fetch(type.to_sym) do
      ["Unknown dictionary type: #{type}", "Run 'kotoshu dict list' for available types"]
    end
    puts description
  end
end
|
|
55
|
+
|
|
56
|
+
module Kotoshu
|
|
57
|
+
module Cli
|
|
58
|
+
# LAZY: CLI helper components (autoloaded on first reference)
|
|
59
|
+
autoload :NavigationManager, "kotoshu/cli/navigation_manager"
|
|
60
|
+
autoload :DisplayFormatter, "kotoshu/cli/display_formatter"
|
|
61
|
+
autoload :InteractiveReviewer, "kotoshu/cli/interactive_reviewer"
|
|
62
|
+
autoload :BatchReporter, "kotoshu/cli/batch_reporter"
|
|
63
|
+
autoload :AutoSetup, "kotoshu/cli/auto_setup"
|
|
64
|
+
autoload :StatusReport, "kotoshu/cli/status_report"
|
|
65
|
+
autoload :LanguageResolver, "kotoshu/cli/language_resolver"
|
|
66
|
+
autoload :ProgressReporter, "kotoshu/cli/progress_reporter"
|
|
67
|
+
|
|
68
|
+
# Command-line interface for Kotoshu spell checker.
|
|
69
|
+
#
|
|
70
|
+
# Two-stage model:
|
|
71
|
+
# Stage 1 (slow, network): `kotoshu setup LANG` downloads/registers resources
|
|
72
|
+
# Stage 2 (instant, cache-only): `kotoshu check FILE` uses cached resources
|
|
73
|
+
#
|
|
74
|
+
# Exit codes:
|
|
75
|
+
# 0 — no errors found / setup succeeded
|
|
76
|
+
# 1 — spelling errors found
|
|
77
|
+
# 2 — usage error (file not found, bad flags)
|
|
78
|
+
# 3 — resource not set up / setup failure (network, integrity)
|
|
79
|
+
#
|
|
80
|
+
# Commands raise Errors::CliError subclasses; the dispatcher in .start
|
|
81
|
+
# catches them and exits with the error's exit_status.
|
|
82
|
+
class Cli < Thor
|
|
83
|
+
class_option :language,
|
|
84
|
+
type: :string,
|
|
85
|
+
default: "auto",
|
|
86
|
+
desc: "Language code (auto, en, de, es, fr, pt, ru)",
|
|
87
|
+
aliases: ["-l"]
|
|
88
|
+
|
|
89
|
+
class_option :format,
|
|
90
|
+
type: :string,
|
|
91
|
+
enum: %w[text json sarif],
|
|
92
|
+
default: "text",
|
|
93
|
+
desc: "Output format (text, json, sarif)",
|
|
94
|
+
aliases: ["-f"]
|
|
95
|
+
|
|
96
|
+
class_option :interactive,
|
|
97
|
+
type: :boolean,
|
|
98
|
+
default: false,
|
|
99
|
+
desc: "Interactively review each error after check",
|
|
100
|
+
aliases: ["-i"]
|
|
101
|
+
|
|
102
|
+
class_option :verbose,
|
|
103
|
+
type: :boolean,
|
|
104
|
+
default: false,
|
|
105
|
+
desc: "Enable verbose output",
|
|
106
|
+
aliases: ["-v"]
|
|
107
|
+
|
|
108
|
+
desc "check [FILE]", "Check spelling in a file or stdin"
|
|
109
|
+
long_desc <<~DESC
|
|
110
|
+
Checks spelling in the given file (or stdin if no file is given).
|
|
111
|
+
Cache-only — never downloads. Run `kotoshu setup LANG` first.
|
|
112
|
+
|
|
113
|
+
Exit codes:
|
|
114
|
+
0 — no errors
|
|
115
|
+
1 — spelling errors found
|
|
116
|
+
2 — usage error (bad flags, file not found)
|
|
117
|
+
3 — language not set up (run `kotoshu setup LANG`)
|
|
118
|
+
DESC
|
|
119
|
+
# Check spelling in +target+ (a file path) or in stdin when no target is given.
#
# Prints the result in the selected format, optionally runs the interactive
# review, and terminates the process with status 1 when errors were found.
#
# @param target [String, nil] path of the file to check, nil for stdin
# @return [nil] when the text has no spelling errors
def check(target = nil)
  apply_configuration!

  content, origin = read_target(target)
  outcome = run_check(content)
  display_result(outcome, origin)
  return unless outcome.failed?

  interactive_review(outcome, origin) if options[:interactive]
  exit 1
end
|
|
128
|
+
|
|
129
|
+
desc "setup [LANGUAGE] [LANGUAGE ...]", "Set up languages (download or register local files)"
|
|
130
|
+
long_desc <<~DESC
|
|
131
|
+
Stage 1 of the two-stage model. Downloads spelling/frequency/model
|
|
132
|
+
resources for the named language(s), or registers local .aff/.dic
|
|
133
|
+
files you already have on disk. After setup, `kotoshu check` runs
|
|
134
|
+
instantly with no network access.
|
|
135
|
+
|
|
136
|
+
With no args, lists currently set up languages.
|
|
137
|
+
|
|
138
|
+
Sources (one per invocation, applies to all listed languages):
|
|
139
|
+
--aff FILE --dic FILE use specific local Hunspell files
|
|
140
|
+
--from DIR look for {lang}.aff and {lang}.dic in DIR
|
|
141
|
+
(neither) download from kotoshu/dictionaries
|
|
142
|
+
|
|
143
|
+
Examples:
|
|
144
|
+
kotoshu setup en de fr # download from GitHub
|
|
145
|
+
kotoshu setup en --want spelling,frequency # also fetch Kelly list
|
|
146
|
+
kotoshu setup en --aff /p/en.aff --dic /p/en.dic
|
|
147
|
+
kotoshu setup en --from /usr/share/hunspell/
|
|
148
|
+
kotoshu setup --force en # re-download
|
|
149
|
+
kotoshu setup --list # show what's set up
|
|
150
|
+
|
|
151
|
+
Exit codes:
|
|
152
|
+
0 — every language set up successfully
|
|
153
|
+
3 — at least one language failed (network down, integrity, etc.)
|
|
154
|
+
DESC
|
|
155
|
+
method_option :aff, type: :string, desc: "Path to local .aff file"
|
|
156
|
+
method_option :dic, type: :string, desc: "Path to local .dic file"
|
|
157
|
+
method_option :from, type: :string, desc: "Directory containing local .aff/.dic"
|
|
158
|
+
method_option :frequency, type: :string, desc: "Path to local frequency.json"
|
|
159
|
+
method_option :want,
|
|
160
|
+
type: :string,
|
|
161
|
+
default: "spelling",
|
|
162
|
+
desc: "Comma-separated: spelling,frequency,model"
|
|
163
|
+
method_option :force,
|
|
164
|
+
type: :boolean,
|
|
165
|
+
default: false,
|
|
166
|
+
desc: "Re-fetch even if already cached"
|
|
167
|
+
method_option :strict,
|
|
168
|
+
type: :boolean,
|
|
169
|
+
default: false,
|
|
170
|
+
desc: "Re-raise on optional-resource failure during setup"
|
|
171
|
+
method_option :list,
|
|
172
|
+
type: :boolean,
|
|
173
|
+
default: false,
|
|
174
|
+
desc: "List currently set up languages and exit"
|
|
175
|
+
# Set up one or more languages, or list what is already set up.
#
# With `--list` or no languages, prints the currently set up languages and
# returns. Otherwise every language is set up in turn through Kotoshu.setup;
# a failure on one language is reported and does not stop the others.
#
# The closing summary counts only the languages that actually succeeded
# (it previously reported the total, so failures were announced as set up).
#
# @param languages [Array<String>] language codes given on the command line
# @return [nil]
# @raise [Errors::ResourceUnavailable] when at least one language failed
def setup(*languages)
  apply_configuration!

  if options[:list] || languages.empty?
    list_setup
    return
  end

  # Blank entries (e.g. from "spelling,,frequency") are dropped so they
  # do not turn into the empty symbol.
  want = (options[:want] || "spelling").split(",").map(&:strip).reject(&:empty?).map(&:to_sym)
  opts = setup_source_options(languages)
  opts[:want] = want
  opts[:force] = options[:force]
  opts[:strict] = options[:strict]

  failed = languages.reject do |lang|
    print "Setup #{lang}... "
    begin
      result = with_progress_reporter(label: lang) { Kotoshu.setup(lang, **opts) }
      describe_setup_result(result)
      true
    rescue Kotoshu::Error, ArgumentError => e
      puts "FAIL: #{e.message}"
      false
    end
  end

  if failed.empty?
    puts "Set up #{languages.size} language(s)."
    return
  end

  puts "Set up #{languages.size - failed.size} of #{languages.size} language(s)."
  raise Errors::ResourceUnavailable, "failed to set up: #{failed.join(', ')}"
end
|
|
210
|
+
|
|
211
|
+
# Back-compat alias. New code should use `setup`.
|
|
212
|
+
desc "fetch LANGUAGE [LANGUAGE ...]", "Alias for `setup` (deprecated)", hide: true
|
|
213
|
+
method_option :aff, type: :string, desc: "Path to local .aff file"
|
|
214
|
+
method_option :dic, type: :string, desc: "Path to local .dic file"
|
|
215
|
+
method_option :from, type: :string, desc: "Directory containing local .aff/.dic"
|
|
216
|
+
method_option :frequency, type: :string, desc: "Path to local frequency.json"
|
|
217
|
+
method_option :want,
|
|
218
|
+
type: :string,
|
|
219
|
+
default: "spelling",
|
|
220
|
+
desc: "Comma-separated: spelling,frequency,model"
|
|
221
|
+
method_option :force,
|
|
222
|
+
type: :boolean,
|
|
223
|
+
default: false,
|
|
224
|
+
desc: "Re-fetch even if already cached"
|
|
225
|
+
method_option :strict,
|
|
226
|
+
type: :boolean,
|
|
227
|
+
default: false,
|
|
228
|
+
desc: "Re-raise on optional-resource failure during setup"
|
|
229
|
+
method_option :list,
|
|
230
|
+
type: :boolean,
|
|
231
|
+
default: false,
|
|
232
|
+
desc: "List currently set up languages and exit"
|
|
233
|
+
# Deprecated spelling of `setup`, kept so existing invocations keep working.
# Forwards every language argument unchanged; the option flags are shared
# through Thor's `options`.
#
# @param languages [Array<String>] language codes given on the command line
def fetch(*languages)
  setup(*languages)
end
|
|
236
|
+
|
|
237
|
+
desc "dict SUBCOMMAND", "Dictionary operations"
|
|
238
|
+
subcommand "dict", DictCommand
|
|
239
|
+
|
|
240
|
+
desc "cache SUBCOMMAND", "Cache management"
|
|
241
|
+
subcommand "cache", CacheCommand
|
|
242
|
+
|
|
243
|
+
desc "status", "Show setup, cache, and runtime status"
|
|
244
|
+
long_desc <<~DESC
|
|
245
|
+
Prints a snapshot of the kotoshu installation: which languages are
|
|
246
|
+
set up (with per-resource status), cache disk usage, audit log path,
|
|
247
|
+
default language, offline flag, and whether onnxruntime is loaded.
|
|
248
|
+
|
|
249
|
+
With --json, emits the same report as a JSON object for tooling.
|
|
250
|
+
DESC
|
|
251
|
+
method_option :json,
|
|
252
|
+
type: :boolean,
|
|
253
|
+
default: false,
|
|
254
|
+
desc: "Emit the report as JSON"
|
|
255
|
+
# Print a snapshot report built by StatusReport, as JSON when `--json`
# is given and as human-readable text otherwise.
def status
  snapshot = StatusReport.build(version: Kotoshu::VERSION)
  rendered = options[:json] ? status_json(snapshot) : status_text(snapshot)
  puts rendered
end
|
|
263
|
+
|
|
264
|
+
desc "version", "Show version information"
|
|
265
|
+
# Print the Kotoshu version followed by the running Ruby version,
# one per line.
def version
  puts "Kotoshu version #{Kotoshu::VERSION}",
       "Ruby #{RUBY_VERSION}"
end
|
|
269
|
+
|
|
270
|
+
map %w[--version -V] => :version
|
|
271
|
+
|
|
272
|
+
# Dispatch entry point — bypasses Thor's start rescue so we can honor
|
|
273
|
+
# exit_status from Errors::CliError subclasses. Thor::Error still falls
|
|
274
|
+
# back to exit 1 for framework-level errors (bad flags, etc.).
|
|
275
|
+
#
|
|
276
|
+
# ResourceNotSetupError from the strict two-stage model is intercepted
|
|
277
|
+
# here: AutoSetup asks the user once, then we retry the dispatch. In
|
|
278
|
+
# non-TTY or offline mode AutoSetup re-raises so scripts see stable
|
|
279
|
+
# behavior.
|
|
280
|
+
def self.start(given_args = ARGV, config = {})
  # Runs the requested command and maps failures to process exit codes:
  # Errors::CliError subclasses exit with their own exit_status, Thor::Error
  # exits with 1.
  #
  # The auto-setup handling lives in an inner begin/rescue on purpose: an
  # exception raised inside a rescue clause is NOT handled by sibling rescue
  # clauses of the same begin block, so converting ResourceNotSetupError to
  # Errors::ResourceUnavailable must happen one level below the CliError
  # handler for that handler to see it.
  config[:shell] ||= Thor::Base.shell.new
  auto_setup_offered = false

  begin
    dispatch(nil, given_args.dup, nil, config)
  rescue Kotoshu::ResourceNotSetupError => e
    # Offer auto-setup once only; a second failure after an accepted setup
    # is reported instead of prompting again without bound.
    declined = auto_setup_offered || !AutoSetup.new.call(e)
    raise Errors::ResourceUnavailable, e.message if declined

    auto_setup_offered = true
    retry
  end
rescue Kotoshu::ResourceNotSetupError => e
  # Reached when AutoSetup itself re-raises the original error; report it
  # with the same exit status as Errors::ResourceUnavailable.
  warn "Error: #{e.message}"
  exit Errors::ResourceUnavailable.new(e.message).exit_status
rescue Errors::CliError => e
  warn "Error: #{e.message}"
  exit e.exit_status
rescue Thor::Error => e
  warn e.message
  exit 1
end
|
|
294
|
+
|
|
295
|
+
# Thor hook queried by the framework; this CLI always answers false and
# performs its own exit-code handling in .start.
#
# @return [false]
def self.exit_on_failure?
  false
end
|
|
298
|
+
|
|
299
|
+
private
|
|
300
|
+
|
|
301
|
+
# Reset the global configuration, then apply an explicit `--language`
# value as the default language. The value "auto" (or no value) leaves
# the freshly reset default untouched.
def apply_configuration!
  Kotoshu::Configuration.reset
  config = Kotoshu::Configuration.instance

  requested = options[:language]
  return if !requested || requested == "auto"

  config.default_language = requested
end
|
|
306
|
+
|
|
307
|
+
# Install a ProgressReporter on Configuration.download_reporter
|
|
308
|
+
# for the duration of the block, then restore the prior value.
|
|
309
|
+
# The reporter writes to $stderr; in non-TTY contexts it still
|
|
310
|
+
# emits periodic line messages so CI logs show progress.
|
|
311
|
+
def with_progress_reporter(label:)
  # Remember whatever reporter was installed so the ensure clause can
  # put it back even when the block raises.
  previous_reporter = Kotoshu.configuration.download_reporter

  reporter = ProgressReporter.new(output: $stderr, label: label)
  Kotoshu.configuration.download_reporter = reporter

  yield
ensure
  Kotoshu.configuration.download_reporter = previous_reporter
end
|
|
321
|
+
|
|
322
|
+
# Render a status report as plain text with Setup, Cache, Semantic and
# Other sections separated by blank lines.
#
# @param report [#version, #resources, ...] report object from StatusReport.build
# @return [String] the report lines joined with newlines (no trailing newline)
def status_text(report)
  out = ["Kotoshu #{report.version}", "", "Setup:"]

  if report.resources.empty?
    out << " (no languages set up — run `kotoshu setup LANG`)"
  else
    rows = report.resources.map do |res|
      mark = res.available ? "✓" : "✗"
      size = res.available ? StatusReport.format_bytes(res.size_bytes) : "—"
      # The cache date is appended only for resources that carry one.
      cached = res.cached_at ? ", cached #{res.cached_at.strftime('%Y-%m-%d')}" : ""
      format(" %-4s %-10s %s %s%s", res.language, res.resource, mark, size, cached)
    end
    out.concat(rows)
  end

  out.push(
    "",
    "Cache:",
    " Path #{report.cache_path}",
    " Size #{StatusReport.format_bytes(report.cache_size_bytes)}",
    " Languages #{report.languages_setup.size}",
    ""
  )

  onnx_state = report.onnx_loaded ? "loaded" : "not loaded (gem install onnxruntime to enable)"
  models = report.languages_with_model
  model_summary = models.empty? ? "0" : "#{models.size} (#{models.join(', ')})"
  out.push("Semantic:", " onnxruntime #{onnx_state}", " Active models #{model_summary}", "")

  audit_line =
    if report.audit_log_path
      " Audit log #{report.audit_log_path} (#{StatusReport.format_bytes(report.audit_log_size_bytes)})"
    else
      " Audit log (none yet — created on first audited operation)"
    end
  out.push(
    "Other:",
    audit_line,
    " Default lang #{report.default_language || '(none)'}",
    " Offline mode #{report.offline ? 'yes' : 'no'}"
  )

  out.join("\n")
end
|
|
366
|
+
|
|
367
|
+
# Render a status report as pretty-printed JSON carrying the same data
# as the text report.
#
# @param report [#version, #resources, ...] report object from StatusReport.build
# @return [String] JSON document
def status_json(report)
  require "json"

  resources = report.resources.map do |res|
    {
      language: res.language,
      resource: res.resource.to_s,
      available: res.available,
      size_bytes: res.size_bytes,
      cached_at: res.cached_at&.iso8601
    }
  end

  cache = {
    path: report.cache_path,
    size_bytes: report.cache_size_bytes,
    languages: report.languages_setup.size
  }

  semantic = {
    onnxruntime_loaded: report.onnx_loaded,
    active_models: report.languages_with_model
  }

  # Stays nil/false (serialised as-is) when no audit log path is reported.
  audit_log = report.audit_log_path && {
    path: report.audit_log_path,
    size_bytes: report.audit_log_size_bytes
  }

  JSON.pretty_generate(
    version: report.version,
    setup: resources,
    cache: cache,
    semantic: semantic,
    audit_log: audit_log,
    default_language: report.default_language,
    offline: report.offline
  )
end
|
|
399
|
+
|
|
400
|
+
# Read the text to check and name where it came from.
#
# Directories and unreadable paths are reported as usage errors rather
# than escaping as raw Errno exceptions. File.exist? alone is true for a
# directory, which previously let File.read fail with Errno::EISDIR.
#
# @param target [String, nil] file path, or nil to read from stdin
# @return [Array(String, String)] the text and its source label
#   ("<stdin>" when reading from stdin, otherwise the path)
# @raise [Errors::UsageError] when the path is missing, is a directory,
#   or cannot be read
def read_target(target)
  return [$stdin.read, "<stdin>"] if target.nil?

  raise Errors::UsageError, "File not found: #{target}" unless File.exist?(target)
  raise Errors::UsageError, "Not a file: #{target}" if File.directory?(target)

  [File.read(target, encoding: Kotoshu.configuration.encoding), target]
rescue SystemCallError => e
  # Covers permission problems and races where the path changes between
  # the checks above and the read.
  raise Errors::UsageError, "Cannot read #{target || '<stdin>'}: #{e.message}"
end
|
|
409
|
+
|
|
410
|
+
# Spell-check +text+ with the checker for its resolved language.
#
# @param text [String] text to check
# @return [Object] whatever the spellchecker's #check returns
# @raise [Errors::ResourceUnavailable] when the dictionary cannot be found
def run_check(text)
  Kotoshu.spellchecker_for(resolve_language(text)).check(text)
rescue Kotoshu::DictionaryNotFoundError => missing
  # Re-raise as a CLI error, keeping the original message.
  raise Errors::ResourceUnavailable, missing.message
end
|
|
417
|
+
|
|
418
|
+
# Decide which language to check +text+ in, using the `--language` flag
# and the configured default. Any note from the resolver is echoed to
# stderr prefixed with "# ".
#
# @param text [String] text handed to the resolver
# @return [Object] the resolved language
def resolve_language(text)
  resolver = LanguageResolver.new(
    flag_value: options[:language],
    default_language: Kotoshu.configuration.default_language
  )
  resolution = resolver.resolve(text: text)

  $stderr.puts "# #{resolution.note}" if resolution.note
  resolution.language
end
|
|
427
|
+
|
|
428
|
+
# Translate the source-related flags (--aff/--dic, --from, --frequency)
# into the keyword options passed on to Kotoshu.setup.
#
# --aff/--dic take precedence over --from; they must be given together
# and only with a single language.
#
# @param languages [Array<String>] languages named on the command line
# @return [Hash{Symbol=>String}] source options (possibly empty)
# @raise [Errors::UsageError] on an invalid --aff/--dic combination
def setup_source_options(languages)
  aff_path = options[:aff]
  dic_path = options[:dic]

  source =
    if aff_path || dic_path
      raise Errors::UsageError, "--aff and --dic require exactly one language" unless languages.size == 1
      raise Errors::UsageError, "--aff and --dic must both be given" unless aff_path && dic_path

      { aff: aff_path, dic: dic_path }
    elsif options[:from]
      { from: options[:from] }
    else
      {}
    end

  frequency_path = options[:frequency]
  source[:frequency] = frequency_path if frequency_path
  source
end
|
|
443
|
+
|
|
444
|
+
# Print the one-line "OK (...)" summary for a finished setup. Spelling and
# frequency fall back to "skipped" when the result carries no value for them.
#
# @param result [#spelling, #frequency, #source] value returned by Kotoshu.setup
def describe_setup_result(result)
  spelling_state = result.spelling || "skipped"
  frequency_state = result.frequency || "skipped"
  origin = result.source

  puts format("OK (spelling: %s, frequency: %s, source: %s)", spelling_state, frequency_state, origin)
end
|
|
450
|
+
|
|
451
|
+
# Print the languages that are currently set up, one per line, or a hint
# on how to add one when there are none.
def list_setup
  installed = Kotoshu.languages_setup

  if installed.empty?
    puts "No languages set up. Run `kotoshu setup LANG` to add one."
    nil
  else
    puts "Set up languages:"
    installed.each { |code| puts " #{code}" }
  end
end
|
|
461
|
+
|
|
462
|
+
# Print the check result in the format chosen with `--format`; any value
# other than "json" or "sarif" falls back to the text format.
#
# @param result [Object] check result handed to the formatter
# @param source [String] label of the checked input
def display_result(result, source)
  formatters = { "json" => :format_as_json, "sarif" => :format_as_sarif }
  formatter = formatters.fetch(options[:format], :format_as_text)
  puts send(formatter, result, source)
end
|
|
472
|
+
|
|
473
|
+
# Build the plain-text report: a single "OK" line when the check passed,
# otherwise a "FAIL" header followed by one line per misspelled word with
# up to three suggestions.
#
# @param result [#success?, #word_count, #error_count, #each_error] check result
# @param source [String] label of the checked input
# @return [String] report without a trailing newline
def format_as_text(result, source)
  return "OK #{source} (#{result.word_count} words, no errors)" if result.success?

  report = ["FAIL #{source} (#{result.error_count} errors)"]
  result.each_error do |error|
    hint = error.has_suggestions? ? " -> #{error.top_suggestions(3).join(', ')}" : ""
    report << " #{error.word}#{hint}"
  end
  report.join("\n")
end
|
|
490
|
+
|
|
491
|
+
# Serialise the check result as pretty-printed JSON, adding the input
# label under the "source" key of the hash returned by result.as_json.
#
# @param result [#as_json] check result
# @param source [String] label of the checked input
# @return [String] JSON document
def format_as_json(result, source)
  require "json"

  payload = result.as_json.tap { |hash| hash["source"] = source }
  JSON.pretty_generate(payload)
end
|
|
498
|
+
|
|
499
|
+
# Serialise the check result as a SARIF 2.1.0 document with a single run
# and a single rule ("kotoshu/spelling"); every spelling error becomes one
# warning-level result located by character offset and length.
#
# @param result [#errors] check result
# @param source [String] label of the checked input
# @return [String] pretty-printed SARIF JSON
def format_as_sarif(result, source)
  require "json"

  findings = result.errors.map do |err|
    hints = err.top_suggestions(3)
    hint_text = hints.empty? ? "" : " Suggestions: #{hints.join(', ')}"
    region = {
      "charOffset" => err.position || 0,
      "charLength" => err.word.length
    }
    location = {
      "physicalLocation" => {
        "artifactLocation" => { "uri" => source_for_sarif(source) },
        "region" => region
      }
    }

    {
      "ruleId" => "kotoshu/spelling",
      "level" => "warning",
      "message" => { "text" => "'#{err.word}' is not in the dictionary.#{hint_text}" },
      "locations" => [location]
    }
  end

  spelling_rule = {
    "id" => "kotoshu/spelling",
    "name" => "SpellingError",
    "shortDescription" => { "text" => "Word not found in the active dictionary." }
  }
  driver = {
    "name" => "kotoshu",
    "version" => Kotoshu::VERSION,
    "informationUri" => "https://github.com/kotoshu/kotoshu",
    "rules" => [spelling_rule]
  }
  run = { "tool" => { "driver" => driver }, "results" => findings }

  JSON.pretty_generate(
    "version" => "2.1.0",
    "$schema" => "https://json.schemastore.org/sarif-2.1.0.json",
    "runs" => [run]
  )
end
|
|
552
|
+
|
|
553
|
+
# Map the source label to the URI used in SARIF output: the "<stdin>"
# placeholder becomes "stdin", every other label is passed through.
#
# @param source [String] label of the checked input
# @return [String]
def source_for_sarif(source)
  return "stdin" if source == "<stdin>"

  source
end
|
|
556
|
+
|
|
557
|
+
# Walk the user through each spelling error on the terminal.
#
# For every error the word, its offset and up to nine suggestions are
# shown, then one command is read from stdin: a digit accepts that
# suggestion, "s" skips, "n" or Enter moves on, "p" goes back, "l" lists
# all errors with their state, "q" quits. EOF on stdin also ends the review.
# Decisions are only recorded and summarised; nothing is written to disk.
#
# An error is always in at most one state: re-deciding an error after
# going back replaces the earlier decision, so the summary counts stay
# consistent (previously the same error could be counted as both accepted
# and skipped).
#
# @param result [#errors] check result
# @param source [String] label of the checked input
# @return [nil]
def interactive_review(result, source)
  require "set" # Set is not guaranteed to be loaded on older Rubies

  errors = result.errors
  return if errors.empty?

  index = 0
  accepted = {}     # error index => chosen suggestion
  skipped = Set.new # error indices explicitly skipped

  puts
  puts "Interactive review: #{errors.size} error(s) in #{source}"
  puts "Commands: [1-9] accept, [s] skip, [n]/Enter next, [p] prev, [l] list, [q] quit"

  while index < errors.size
    err = errors[index]
    puts
    puts "[#{index + 1}/#{errors.size}] '#{err.word}' (offset #{err.position || '?'})"
    suggestions = err.top_suggestions(9)
    if suggestions.empty?
      puts " (no suggestions)"
    else
      suggestions.each_with_index { |s, i| puts " [#{i + 1}] #{s}" }
    end
    print "> "
    input = $stdin.gets
    break if input.nil? # EOF

    input = input.chomp.downcase

    case input
    when "q"
      puts "Quitting review."
      break
    when "n", ""
      index += 1
    when "p"
      index = [index - 1, 0].max
    when "l"
      errors.each_with_index do |e, i|
        marker = if accepted.key?(i)
                   "✓"
                 elsif skipped.include?(i)
                   "s"
                 else
                   " "
                 end
        puts " #{marker} #{i + 1}. #{e.word}"
      end
    when "s"
      accepted.delete(index) # a skip overrides an earlier accept
      skipped << index
      index += 1
    when /\A[1-9]\z/
      suggestion = suggestions[input.to_i - 1]
      if suggestion
        skipped.delete(index) # an accept overrides an earlier skip
        accepted[index] = suggestion
        puts " → '#{err.word}' → '#{suggestion}' (recorded)"
        index += 1
      else
        puts " No suggestion at that number."
      end
    else
      puts " Unknown command."
    end
  end

  puts
  puts "Review complete: #{accepted.size} accepted, #{skipped.size} skipped, " \
       "#{errors.size - accepted.size - skipped.size} unhandled."
  puts "Note: 0.3 records decisions but does not rewrite source files." unless accepted.empty?
end
|
|
625
|
+
end
|
|
626
|
+
end
|
|
627
|
+
end
|