kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,312 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../documents/document'
4
+ require_relative '../analyzers/semantic_analyzer'
5
+ require_relative '../models/fasttext_model'
6
+ require_relative '../cache/model_cache'
7
+ require_relative '../cli/interactive_reviewer'
8
+ require_relative '../cli/batch_reporter'
9
+ require_relative '../language/identifier'
10
+
11
module Kotoshu
  # Thor command group (namespace +check+) that runs spelling/grammar
  # analysis on a file, a literal string, or standard input.
  #
  # The analysis back end is chosen with +--model+: either a Hunspell
  # dictionary wrapped in a SpellChecker, or an ONNX embedding model wrapped
  # in Analyzers::SemanticAnalyzer.
  class CheckCommand < Thor
    namespace :check

    class_option :language, aliases: '-l', type: :string, default: 'auto',
                            desc: 'Language code (auto, de, en, es, fr, pt, ru)'
    class_option :interactive, aliases: '-i', type: :boolean, default: false,
                               desc: 'Interactive mode for error review'
    class_option :output, aliases: '-o', type: :string,
                          desc: 'Output file path (for batch mode)'
    class_option :format, type: :string, enum: %w[text json yaml csv sarif], default: 'text',
                          desc: 'Output format (text, json, yaml, csv, sarif)'
    class_option :model, type: :string, enum: %w[fasttext hunspell], default: 'hunspell',
                         desc: 'Analysis model (fasttext, hunspell)'
    class_option :download, type: :boolean, default: true,
                            desc: 'Automatically download models if missing'
    class_option :verbose, aliases: '-v', type: :boolean, default: false,
                           desc: 'Verbose output'

    desc 'check FILE', 'Check spelling/grammar in a file'
    # Check a single file on disk.
    #
    # Terminates the process with status 1 when FILE does not exist.
    #
    # NOTE(review): in batch mode the reporter returned by #run_batch_mode is
    # discarded here, so nothing is printed unless --output is given. Confirm
    # whether the report should also go to stdout, as the `string` command does.
    #
    # @param file [String] Path of the file to check
    def check(file)
      unless File.exist?(file)
        puts "Error: File not found: #{file}"
        exit 1
      end

      # Resolve 'auto' to a concrete language before anything is loaded.
      language = detect_language(file, options[:language])
      document = load_document(file, language)
      analyzer = load_analyzer(language, options[:model])

      puts "Analyzing #{file} (language: #{language})..." if options[:verbose]

      if options[:interactive]
        run_interactive_mode(document, analyzer)
      else
        run_batch_mode(document, analyzer)
      end
    end

    desc 'string TEXT', 'Check spelling/grammar in a text string'
    option :format, type: :string, enum: %w[text markdown], default: 'text',
                    desc: 'Text format (text, markdown)'
    # Check a literal string. Always runs in batch mode, prints the report and
    # exits with the reporter's exit code.
    #
    # The language option is passed through as-is; no detection is performed
    # for string input, so 'auto' reaches the document and analyzer unchanged.
    #
    # @param text [String] Text to check
    def string(text)
      language_code = options[:language]

      document = Documents::Document.from_string(text, language_code: language_code)
      analyzer = load_analyzer(language_code, options[:model])

      puts "Analyzing..." if options[:verbose]

      reporter = run_batch_mode(document, analyzer)

      # #run_batch_mode returns nil for analyzers it cannot report on yet
      # (currently everything except SemanticAnalyzer). Fail with a message
      # instead of raising NoMethodError on nil.
      if reporter.nil?
        puts "Error: No batch report is available for the '#{options[:model]}' model"
        exit 1
      end

      reporter.print(format: options[:format].to_sym)
      exit reporter.exit_code
    end

    desc 'stdin', 'Check spelling/grammar from stdin'
    option :format, type: :string, enum: %w[text markdown], default: 'text',
                    desc: 'Text format (text, markdown)'
    # Read all of standard input and hand it to the `string` command.
    # Terminates with status 1 when no input was provided.
    def stdin
      text = $stdin.read

      if text.nil? || text.empty?
        puts "Error: No input provided"
        exit 1
      end

      invoke :string, [text], options
    end

    private

    # Resolve the language to use for a file.
    #
    # An explicit code is returned untouched. For 'auto' the file is run
    # through Language::LanguageIdentifier; the top result is accepted only
    # when its confidence exceeds 0.8. Low confidence and any StandardError
    # raised during detection both fall back to 'en'.
    #
    # @param filepath [String] Path to file
    # @param language_code [String] Specified language code or 'auto'
    # @return [String] Detected or specified language code
    def detect_language(filepath, language_code)
      return language_code unless language_code == 'auto'

      verbose = options[:verbose]
      puts "Detecting language..." if verbose

      begin
        identifier = Language::LanguageIdentifier.new
        best = identifier.detect_from_file(filepath, top_k: 1).first

        if best && best.confidence > 0.8
          detected = best.language
          puts " Detected: #{detected} (#{(best.confidence * 100).round(0)}% confidence)" if verbose
          detected
        else
          puts " Language detection uncertain, using 'en'" if verbose
          'en'
        end
      rescue StandardError => e
        puts " Language detection failed: #{e.message}" if verbose
        puts " Using 'en' as default" if verbose
        'en'
      end
    end

    # Load a document from disk, exiting with status 1 on any StandardError.
    #
    # @param filepath [String] Path to file
    # @param language_code [String] Language code
    # @return [Documents::Document] Loaded document
    def load_document(filepath, language_code = 'en')
      Documents::Document.from_file(filepath, language_code: language_code)
    rescue StandardError => e
      puts "Error loading document: #{e.message}"
      exit 1
    end

    # Build the analyzer that matches the requested model type.
    #
    # @param language_code [String] Language code
    # @param model_type [String] 'fasttext' or 'hunspell'
    # @return [Object] Analyzer instance
    # @raise [ArgumentError] for any other model type
    def load_analyzer(language_code, model_type)
      case model_type
      when 'fasttext' then load_fasttext_analyzer(language_code)
      when 'hunspell' then load_hunspell_analyzer(language_code)
      else raise ArgumentError, "Unknown model type: #{model_type}"
      end
    end

    # Load the FastText-based analyzer backed by an ONNX model file.
    #
    # ONNX is the only supported format; there is no fallback. Exits with
    # status 1 when the model file is unavailable or loading fails.
    #
    # @param language_code [String] Language code
    # @return [Analyzers::SemanticAnalyzer] Analyzer wrapping the ONNX model
    def load_fasttext_analyzer(language_code)
      # Required lazily (like the Hunspell pieces) so that a missing ONNX
      # runtime only affects users who actually select this model.
      require_relative '../models/onnx_model'

      cache = Cache::ModelCache.new
      # NOTE(review): --download defaults to true and is forwarded as
      # force_download; ModelCommand describes that flag as "re-download even
      # if cached". Confirm against ModelCache#get_onnx_model that this does
      # not re-fetch the model on every run.
      onnx_file = cache.get_onnx_model(language_code, force_download: options[:download])

      unless onnx_file && File.exist?(onnx_file)
        puts "Error: ONNX model not found for #{language_code}"
        puts ""
        puts "Download the model first:"
        puts " kotoshu model download #{language_code} --type onnx"
        puts ""
        puts "Or convert from FastText .vec file:"
        puts " kotoshu model convert cc.#{language_code}.300.vec fasttext.#{language_code}.onnx -l #{language_code}"
        exit 1
      end

      puts "Loading ONNX model for #{language_code}..." if options[:verbose]
      model = Models::OnnxModel.from_file(onnx_file)
      model.preload_embedding_matrix if options[:verbose]
      Analyzers::SemanticAnalyzer.new(model)
    rescue StandardError, LoadError => e
      # LoadError is not a StandardError; rescuing it explicitly lets the
      # installation hint below appear when a required library is missing.
      puts "Error loading FastText analyzer: #{e.message}"
      puts ""
      puts "Ensure ONNX Runtime is installed:"
      puts " gem install onnxruntime"
      exit 1
    end

    # Load the Hunspell-backed spell checker.
    #
    # With --download the dictionary comes from Dictionary::Hunspell.from_github,
    # otherwise from Dictionary::Hunspell.for_language (local lookup). Exits
    # with status 1 on failure.
    #
    # @param language_code [String] Language code
    # @return [Object] SpellChecker built on the Hunspell dictionary
    def load_hunspell_analyzer(language_code)
      require_relative '../dictionary/hunspell'
      # The gem ships lib/kotoshu/spellchecker.rb; there is no
      # lib/kotoshu/spell_checker.rb, so the previous path raised LoadError.
      # NOTE(review): confirm that file defines the SpellChecker constant
      # used below.
      require_relative '../spellchecker'

      dict =
        if options[:download]
          puts "Loading Hunspell dictionary for #{language_code}..." if options[:verbose]
          Dictionary::Hunspell.from_github(language_code)
        else
          Dictionary::Hunspell.for_language(language_code)
        end

      # Placeholder analyzer: dictionary lookup via SpellChecker. The intended
      # design pairs the Hunspell dictionary with EditDistanceStrategy.
      SpellChecker.new(dictionary: dict, language: language_code)
    rescue StandardError, LoadError => e
      puts "Error loading Hunspell analyzer: #{e.message}"
      exit 1
    end

    # Review errors interactively and write back any accepted corrections.
    #
    # @param document [Documents::Document] Document to check
    # @param analyzer [Object] Analyzer instance
    def run_interactive_mode(document, analyzer)
      reviewer = Cli::InteractiveReviewer.new(document, analyzer)

      unless reviewer.has_errors?
        puts "No errors found!"
        return
      end

      reviewer.run

      navigation = reviewer.navigation
      apply_corrections(document, navigation) if navigation.modified.any?
    end

    # Run a non-interactive check.
    #
    # Only Analyzers::SemanticAnalyzer produces a reporter; when --output is
    # set the report is also written to that path. For a SpellChecker the
    # text is checked but the result is not converted into a report yet, so
    # nil is returned.
    #
    # @param document [Documents::Document] Document to check
    # @param analyzer [Object] Analyzer instance
    # @return [Cli::BatchReporter, nil] Batch reporter, or nil when the
    #   analyzer type has no batch reporting
    def run_batch_mode(document, analyzer)
      if analyzer.is_a?(Analyzers::SemanticAnalyzer)
        reporter = Cli::InteractiveReviewer.new(document, analyzer).run_batch
        write_report(reporter, options[:output]) if options[:output]
        return reporter
      end

      # SpellChecker is only required inside #load_hunspell_analyzer, so the
      # constant may be undefined on other paths; guard before referencing it.
      if defined?(SpellChecker) && analyzer.is_a?(SpellChecker)
        # Placeholder: the result is not yet converted into a reporter.
        analyzer.check_string(document.content)
      end

      nil
    end

    # Serialize a batch report to +path+ in the format given by --format.
    # Any format other than json/yaml/csv/sarif is written as plain text.
    #
    # @param reporter [Cli::BatchReporter] Report to serialize
    # @param path [String] Destination file path
    def write_report(reporter, path)
      case options[:format]
      when 'json' then reporter.to_json(filepath: path)
      when 'yaml' then reporter.to_yaml(filepath: path)
      when 'csv' then reporter.to_csv(filepath: path)
      when 'sarif' then reporter.to_sarif(filepath: path)
      else File.write(path, reporter.to_text)
      end

      puts "Report written to: #{path}" if options[:verbose]
    end

    # Write the corrected document over the original, keeping a ".bak" copy
    # of the original content next to it.
    #
    # @param document [Documents::Document] Original document
    # @param navigation [Cli::NavigationManager] Navigation state with corrections
    def apply_corrections(document, navigation)
      corrections = navigation.export_corrections
      return if corrections.empty?

      # NOTE(review): converting a correction hash back into a SemanticError
      # is not implemented. The block yields nil for every entry, so after
      # #compact an empty list is applied and the rewritten file is expected
      # to equal the original.
      errors = corrections.map { |_correction| nil }.compact
      corrected_doc = document.apply(errors)

      backup_path = document.name + ".bak"
      output_path = document.name

      # Back up first so the original survives a failed write.
      File.write(backup_path, document.content)
      File.write(output_path, corrected_doc.content)

      puts "Created backup: #{backup_path}" if options[:verbose]
      puts "Wrote corrections to: #{output_path}"
    end
  end
end
@@ -0,0 +1,295 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'thor'
4
+ require_relative '../models/fasttext_model'
5
+ require_relative '../models/onnx_model'
6
+ require_relative '../cache/model_cache'
7
+
8
module Kotoshu
  # Thor command group (namespace +model+) for working with embedding
  # models: converting FastText vectors to ONNX, downloading, inspecting,
  # validating and uploading model files.
  class ModelCommand < Thor
    namespace :model

    # Companion files that live next to an ONNX model, expressed as the
    # suffix that replaces the trailing ".onnx".
    ONNX_SIDECAR_SUFFIXES = %w[.vocab.json .metadata.json .ort.onnx].freeze

    desc 'convert INPUT OUTPUT', 'Convert FastText .vec file to ONNX format'
    option :language, aliases: '-l', type: :string, required: true,
                      desc: 'Language code (de, en, es, fr, pt, ru)'
    option :max_vectors, type: :numeric, default: 500_000,
                         desc: 'Maximum vectors to convert (default: 500k)'
    option :validate, type: :boolean, default: true,
                      desc: 'Validate model after conversion'
    # Run the bundled Python conversion script through python3.
    #
    # Exits with status 1 when INPUT or the script is missing, or when the
    # script reports failure.
    #
    # @param input [String] Path to the FastText .vec file
    # @param output [String] Path of the ONNX file to create
    def convert(input, output)
      puts "Converting #{input} to #{output}..."

      unless File.exist?(input)
        puts "Error: Input file not found: #{input}"
        exit 1
      end

      script_path = conversion_script_path

      unless File.exist?(script_path)
        puts "Error: Conversion script not found: #{script_path}"
        exit 1
      end

      # Argument-vector form: no shell is involved, so paths need no quoting.
      cmd = [
        'python3',
        script_path,
        '--input', input,
        '--output', output,
        '--language', options[:language],
        '--max-vectors', options[:max_vectors].to_s
      ]
      cmd << '--validate' if options[:validate]

      puts "Running: #{cmd.join(' ')}"

      if system(*cmd)
        puts "\n✓ Conversion successful!"
        puts " Model: #{output}"
        puts " Vocab: #{sidecar_path(output, '.vocab.json')}"
        puts " Metadata: #{sidecar_path(output, '.metadata.json')}"
        puts " Optimized: #{sidecar_path(output, '.ort.onnx')}"
      else
        puts "\n✗ Conversion failed!"
        exit 1
      end
    end

    desc 'download LANGUAGE', 'Download FastText model for a language'
    option :type, type: :string, enum: %w[fasttext onnx], default: 'fasttext',
                  desc: 'Model type to download'
    option :output, type: :string,
                    desc: 'Output path (default: $XDG_CACHE_HOME/kotoshu/languages/{code}/models/)'
    option :force, type: :boolean, default: false,
                   desc: 'Force re-download even if cached'
    # Fetch a model through Cache::ModelCache and list the cached files.
    #
    # NOTE(review): the --output option is declared but not used here.
    #
    # @param language [String] Language code
    def download(language)
      puts "Downloading #{options[:type]} model for #{language}..."

      cache = Cache::ModelCache.new

      case options[:type]
      when 'fasttext'
        vec_file = cache.get_fasttext_model(language, force_download: options[:force])
        puts "✓ Downloaded to: #{vec_file}"
      when 'onnx'
        onnx_file = cache.get_onnx_model(language, force_download: options[:force])
        puts "✓ Downloaded to: #{onnx_file}"
      end

      show_model_info(language)
    end

    desc 'info LANGUAGE', 'Show information about available models'
    option :type, type: :string, enum: %w[fasttext onnx],
                  desc: 'Model type to show (default: all)'
    # Print what the model cache knows about a language's models. Without
    # --type both FastText and ONNX entries are shown.
    #
    # @param language [String] Language code
    def info(language)
      cache = Cache::ModelCache.new
      wanted = options[:type]

      puts "Model information for #{language}:"
      puts ""

      if wanted.nil? || wanted == 'fasttext'
        fasttext = cache.model_info(language, :fasttext)
        if fasttext
          # Insert thousands separators by working on the reversed digits.
          grouped = fasttext[:size].to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
          puts "FastText:"
          puts " File: #{fasttext[:file]}"
          puts " Size: #{grouped} vectors"
          puts " Source: #{fasttext[:source]}"
          puts ""
        end
      end

      if wanted.nil? || wanted == 'onnx'
        onnx = cache.model_info(language, :onnx)
        if onnx
          puts "ONNX:"
          puts " File: #{onnx[:file]}"
          puts " Source: #{onnx[:source]}"
          puts ""
        end
      end
    end

    desc 'list', 'List all available models'
    # Print every model the cache reports, grouped by model type.
    def list
      catalog = Cache::ModelCache.new.all_available_models

      puts "Available models:"
      puts ""

      catalog.each do |model_type, languages|
        puts "#{model_type.to_s.capitalize}:"

        languages.each do |code, details|
          puts " #{code}:"
          puts " File: #{details[:file]}"
          puts " Source: #{details[:source]}"
        end

        puts ""
      end
    end

    desc 'validate MODEL_PATH', 'Validate an ONNX model'
    # Load an ONNX model, print its basic properties and perform one
    # embedding lookup. Exits with status 1 when the file is missing or any
    # StandardError is raised while loading/looking up.
    #
    # @param model_path [String] Path to the .onnx file
    def validate(model_path)
      puts "Validating #{model_path}..."

      unless File.exist?(model_path)
        puts "Error: Model file not found: #{model_path}"
        exit 1
      end

      begin
        model = Models::OnnxModel.from_file(model_path)

        puts "✓ Model loaded successfully"
        puts " Language: #{model.language_code}"
        puts " Dimension: #{model.dimension}"
        puts " Vocabulary: #{model.vocabulary_size} words"

        # Smoke-test a lookup with the first vocabulary entry, if any.
        test_word = model.vocabulary.first
        if test_word
          embedding = model.embedding_for(test_word)
          puts " Test lookup: '#{test_word}' -> vector of size #{embedding.vector.size}"
        end

        puts "\n✓ Model is valid!"
      rescue StandardError => e
        puts "✗ Validation failed: #{e.message}"
        exit 1
      end
    end

    desc 'upload LANGUAGE MODEL_FILE', 'Upload model to dictionaries repository'
    option :repo, type: :string, default: 'kotoshu/dictionaries',
                  desc: 'GitHub repository'
    option :branch, type: :string, default: 'main',
                    desc: 'Target branch'
    option :create_pr, type: :boolean, default: false,
                       desc: 'Create pull request instead of direct push'
    # Copy a model (and, for ONNX, its companion files) into a fresh clone of
    # the dictionaries repository, commit it, and either push to the target
    # branch or open a pull request against it.
    #
    # The clone lives in a private temporary directory that is removed when
    # the command finishes. Exits with status 1 on any failed step.
    #
    # @param language [String] Language code
    # @param model_file [String] Path to a .vec or .onnx file
    def upload(language, model_file)
      # Needed for FileUtils.* and Dir.mktmpdir; nothing else in this file
      # loads them.
      require 'fileutils'
      require 'tmpdir'

      puts "Uploading #{model_file} to #{options[:repo]}..."

      unless File.exist?(model_file)
        puts "Error: File not found: #{model_file}"
        exit 1
      end

      model_type =
        if model_file.end_with?('.vec') then 'fasttext'
        elsif model_file.end_with?('.onnx') then 'onnx'
        end

      unless model_type
        puts "Error: Unknown file type. Expected .vec or .onnx"
        exit 1
      end

      filename = File.basename(model_file)
      dest_path = "#{language}/models/#{model_type}/#{filename}"

      # A unique temp dir avoids both clashes with a previous run's leftover
      # clone and the use of a predictable path under /tmp.
      uploaded = Dir.mktmpdir('kotoshu-dictionaries') do |workdir|
        checkout = File.join(workdir, 'repo')

        puts "Cloning repository..."
        unless system('gh', 'repo', 'clone', options[:repo], checkout)
          puts "Error: Failed to clone repository"
          exit 1
        end

        stage_model_files(checkout, model_file, dest_path, model_type)
        publish_changes(checkout, language, model_type, filename)
      end

      if uploaded
        puts "✓ Upload successful!"
        puts " Path: #{dest_path}"
        puts " Repository: #{options[:repo]}"
      else
        puts "✗ Upload failed!"
        exit 1
      end
    end

    private

    # Locate the FastText-to-ONNX conversion script.
    #
    # The gem ships it as lib/kotoshu/scripts/fasttext_to_onnx.py. The
    # previously hard-coded location is kept as a fallback in case a
    # checkout provides it.
    #
    # @return [String] First existing candidate, or the packaged location
    #   when none exists (so error messages name the expected path)
    def conversion_script_path
      packaged = File.expand_path('../scripts/fasttext_to_onnx.py', __dir__)
      legacy = File.join(File.dirname(__FILE__), '../../scripts/convert_fasttext_to_onnx.py')

      [packaged, legacy].find { |candidate| File.exist?(candidate) } || packaged
    end

    # Derive a companion-file path by swapping a trailing ".onnx" for
    # +suffix+. Anchored to the end of the string so that ".onnx" appearing
    # earlier in the path (e.g. in a directory name) is left alone.
    #
    # @param path [String] Path of the ONNX model
    # @param suffix [String] Replacement suffix, e.g. ".vocab.json"
    # @return [String] Companion path (unchanged when +path+ has no ".onnx" ending)
    def sidecar_path(path, suffix)
      path.sub(/\.onnx\z/, suffix)
    end

    # Copy the model, plus any existing ONNX companion files, into the clone.
    #
    # @param checkout [String] Root of the cloned repository
    # @param model_file [String] Source model path
    # @param dest_path [String] Destination path relative to the repository root
    # @param model_type [String] 'fasttext' or 'onnx'
    def stage_model_files(checkout, model_file, dest_path, model_type)
      destination = File.join(checkout, dest_path)
      FileUtils.mkdir_p(File.dirname(destination))
      FileUtils.cp(model_file, destination)
      return unless model_type == 'onnx'

      ONNX_SIDECAR_SUFFIXES.each do |suffix|
        companion = sidecar_path(model_file, suffix)
        FileUtils.cp(companion, sidecar_path(destination, suffix)) if File.exist?(companion)
      end
    end

    # Commit everything in the clone and publish it.
    #
    # Each git/gh step must succeed before the next one runs, so a failed
    # add or commit is reported instead of being masked by a later command.
    #
    # @param checkout [String] Root of the cloned repository
    # @param language [String] Language code
    # @param model_type [String] 'fasttext' or 'onnx'
    # @param filename [String] Base name of the uploaded model file
    # @return [Boolean, nil] Truthy only when every step succeeded
    def publish_changes(checkout, language, model_type, filename)
      title = "Add #{model_type} model for #{language}"
      message = "#{title}\n\nModel: #{filename}\nLanguage: #{language}\n"

      Dir.chdir(checkout) do
        next false unless system('git', 'add', '.') && system('git', 'commit', '-m', message)

        if options[:create_pr]
          branch_name = "add-#{model_type}-#{language}"
          system('git', 'checkout', '-b', branch_name) &&
            system('git', 'push', 'origin', branch_name) &&
            system('gh', 'pr', 'create', '--base', options[:branch],
                   '--title', title, '--body', message)
        else
          # Push the new commit to the branch selected with --branch.
          system('git', 'push', 'origin', "HEAD:#{options[:branch]}")
        end
      end
    end

    # List the cached model files for a language with their sizes in MB.
    # Prints nothing when the language has no models directory.
    #
    # NOTE(review): the cache root is read from ModelCache's @cache_path
    # instance variable; prefer a public reader if the class offers one.
    #
    # @param language [String] Language code
    def show_model_info(language)
      cache = Cache::ModelCache.new
      model_path = File.join(cache.instance_variable_get(:@cache_path), language, 'models')
      return unless Dir.exist?(model_path)

      puts "\nModel files:"
      Dir.glob(File.join(model_path, '**/*')).each do |file|
        next if File.directory?(file)

        size_mb = (File.size(file).to_f / 1024 / 1024).round(2)
        puts " #{File.basename(file)}: #{size_mb} MB"
      end
    end
  end
end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'spell_checker'
4
+
5
module Kotoshu
  module Components
    # Spell checker that accepts every word.
    #
    # Intended for languages where dictionary-based spell checking is not
    # used, for example:
    # - CJK languages (Japanese, Chinese), which rely on confusion rules
    # - languages that are checked purely by rules
    #
    # @example Checking a word
    #   checker = PassthroughSpellChecker.new
    #   checker.check('任意のテキスト')
    #   # => { found: true, stem: nil, flags: [] }
    #
    # @example Asking for suggestions (always empty)
    #   checker.suggest('テキスト')
    #   # => []
    class PassthroughSpellChecker < SpellChecker
      # @return [String] Why spell checking is not used
      attr_reader :reason

      # Build a passthrough checker.
      #
      # @param reason [String, nil] Optional explanation of why spell checking
      #   is skipped; a generic explanation is used when omitted
      def initialize(reason: nil)
        @reason = reason || "Language does not use spell checking"
      end

      # Report the word as found, whatever it is.
      #
      # @param _word [String] Ignored
      # @return [Hash] Always { found: true, stem: nil, flags: [] }
      def check(_word)
        { found: true, stem: nil, flags: [] }
      end

      # Offer no suggestions; a passthrough checker never has any.
      #
      # @param _word [String] Ignored
      # @param _max_suggestions [Integer] Ignored
      # @return [Array<Hash>] Always an empty array
      def suggest(_word, _max_suggestions: 10)
        []
      end

      # Treat every word as correctly spelled.
      #
      # @param _word [String] Ignored
      # @return [Boolean] Always true
      def correct?(_word)
        true
      end

      # Identify this checker as a passthrough implementation.
      #
      # @return [Boolean] Always true for this class
      def passthrough?
        true
      end
    end
  end
end