kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Dictionary Validation Script
|
|
5
|
+
#
|
|
6
|
+
# This script validates all dictionaries in the kotoshu/dictionaries catalog
|
|
7
|
+
# by loading them from GitHub and testing basic functionality.
|
|
8
|
+
#
|
|
9
|
+
# Usage:
|
|
10
|
+
# ruby script/validate_all_dictionaries.rb [--full] [--lang LANG] [--code CODE] [--format FMT] [--report]
|
|
11
|
+
#
|
|
12
|
+
# Options:
|
|
13
|
+
# --full Run full validation including suggestion tests (slow)
|
|
14
|
+
# --lang LANG Only test dictionaries for this language (e.g., en, de, fr)
|
|
15
|
+
# --code CODE Only test this specific dictionary code (e.g., en-GB, de-AT)
|
|
16
|
+
# --format FMT Only test dictionaries with this format (hunspell, plain_text)
|
|
17
|
+
|
|
18
|
+
require_relative "../lib/kotoshu"
require "optparse"
require "benchmark"
require "json"
require "time"
|
|
22
|
+
|
|
23
|
+
# ANSI escape sequences used to colorize and emphasize terminal output.
# Each constant is written verbatim in front of the text it should affect;
# RESET is written afterwards to return to the terminal defaults.
module Colors
  # Turns off every attribute enabled by the codes below.
  RESET = "\e[0m"

  # Foreground colors.
  RED = "\e[31m"
  GREEN = "\e[32m"
  YELLOW = "\e[33m"
  BLUE = "\e[34m"
  MAGENTA = "\e[35m"
  CYAN = "\e[36m"

  # Text emphasis.
  BOLD = "\e[1m"
end
|
|
34
|
+
|
|
35
|
+
# Outcome of validating one dictionary: its status, load metrics, the results
# of any additional tests, and the exception that caused a failure (if any).
class ValidationResult
  attr_reader :code, :status, :load_time, :size, :test_results, :error

  # @param code [Object] identifier of the dictionary this result describes
  def initialize(code)
    @code = code
    @status = :pending # :pending, :success, :warning, :error
    @test_results = {}
    @load_time = @size = @error = nil
  end

  # Marks the dictionary as loaded without problems.
  #
  # @param load_time [Numeric] load duration as measured by the caller
  # @param size [Numeric] number of words reported by the dictionary
  def success!(load_time, size)
    @load_time = load_time
    @status = :success
    @size = size
  end

  # Marks the dictionary as loaded but suspicious; the message is stored in
  # +test_results+ under the +:warning+ key.
  #
  # @param load_time [Numeric] load duration as measured by the caller
  # @param size [Numeric] number of words reported by the dictionary
  # @param message [String] human-readable reason for the warning
  def warning!(load_time, size, message)
    @load_time = load_time
    @size = size
    @status = :warning
    @test_results[:warning] = message
  end

  # Marks the dictionary as failed.
  #
  # @param error [Exception] the exception that was raised
  def error!(error)
    @status = :error
    @error = error
  end

  # Records the outcome of one named test.
  #
  # @param name [Symbol] test identifier
  # @param passed [Boolean] whether the test passed
  # @param details [Object, nil] free-form detail shown next to the test
  def add_test_result(name, passed, details = nil)
    @test_results[name] = { passed: passed, details: details }
  end

  # Defines success?, error? and warning?, each comparing against the
  # current status.
  %i[success error warning].each do |state|
    define_method(:"#{state}?") { @status == state }
  end

  # @return [Hash] plain-data view of the result; the error is reduced to
  #   its message (nil when there is no error)
  def to_h
    {
      code: code,
      status: status,
      load_time: load_time,
      size: size,
      test_results: test_results,
      error: error&.message
    }
  end
end
|
|
93
|
+
|
|
94
|
+
# Drives the validation run: selects catalog entries according to the command
# line filters, loads each dictionary, applies size sanity checks, optionally
# runs lookup/suggestion smoke tests, prints progress and a summary, and
# terminates the process with a status reflecting the outcome.
class DictionaryValidator
  # Horizontal rule used by the header and the summary.
  SEPARATOR = ("=" * 60).freeze

  # Dictionaries with fewer words than this are reported as warnings.
  MIN_EXPECTED_WORDS = 100

  # File written by --report, relative to the current working directory.
  REPORT_PATH = "dictionary_validation_report.json"

  # A very common word per language code, expected to be in any dictionary.
  BASIC_TEST_WORDS = {
    "en" => "the",
    "de" => "der",
    "es" => "el",
    "fr" => "le",
    "it" => "il",
    "pt" => "o",
    "ru" => "и",
    "nl" => "de",
    "pl" => "i",
    "cs" => "a",
    "sv" => "och",
    "da" => "og",
    "no" => "og",
    "fi" => "ja",
    "tr" => "ve",
    "ko" => "그",
    "vi" => "là",
    "ja" => "は",
    "zh" => "的",
    "ar" => "في",
    "he" => "ו",
    "el" => "το",
    "hu" => "a",
    "ro" => "şi",
    "bg" => "и",
    "uk" => "і",
    "ga" => "an",
    "cy" => "y",
    "is" => "og",
    "mt" => "u",
    "lv" => "un",
    "et" => "ja",
    "lt" => "ir",
    "sk" => "a",
    "sl" => "in",
    "hr" => "i",
    "sr" => "и",
    "sq" => "dhe",
    "be" => "і",
    "mk" => "и",
    "hy" => "և",
    "ka" => "და",
    "fa" => "و",
    "ur" => "اور",
    "hi" => "और",
    "bn" => "এবং",
    "th" => "และ",
    "id" => "dan",
    "ms" => "dan",
    "sw" => "na",
    "af" => "en",
    "ca" => "i",
    "gl" => "e",
    "eu" => "eta",
    "lb" => "an",
    "fy" => "en",
    "ku" => "û",
    "eo" => "kaj",
    "ia" => "e"
  }.freeze

  # A misspelled greeting per language code, used to exercise suggestions.
  # The de/it/ru entries are deliberately NOT the correct spellings
  # ("hallo", "ciao", "привет"): feeding a correctly spelled word would not
  # test suggestion generation for a misspelling.
  MISSPELLED_TEST_WORDS = {
    "en" => "helo",
    "de" => "hllo",
    "es" => "ola",
    "fr" => "bonjur",
    "it" => "cioa",
    "pt" => "ola",
    "ru" => "привт",
    "nl" => "halo",
    "pl" => "czesc"
  }.freeze

  attr_reader :options, :results

  # @param options [Hash] parsed command line options; the keys read by this
  #   class are :full, :lang, :code, :format and :report
  def initialize(options = {})
    @options = options
    @results = []
    @catalog = Kotoshu::Dictionaries::Catalog
  end

  # Runs the whole validation and exits the process (status 1 when at least
  # one dictionary failed, 0 otherwise). Does not return.
  def run
    print_header

    dictionaries = select_dictionaries

    print "Validating #{dictionaries.size} dictionaries...\n\n"

    dictionaries.each.with_index(1) do |entry, position|
      validate_dictionary(entry, position, dictionaries.size)
    end

    print_summary

    write_report if @options[:report]

    exit_with_code
  end

  private

  # Applies the --lang, --code and --format filters, in that order.
  #
  # @return [Array] catalog entries to validate
  def select_dictionaries
    dicts = @catalog.all

    dicts = dicts.select { |d| d.language == @options[:lang] } if @options[:lang]
    # casecmp? yields nil (falsy) for incomparable values instead of raising
    # the way `casecmp(...).zero?` does on a nil result.
    dicts = [dicts.find { |d| d.code.casecmp?(@options[:code]) }].compact if @options[:code]
    dicts = dicts.select { |d| d.format == @options[:format].to_sym } if @options[:format]

    dicts
  end

  # Loads one dictionary, records the outcome in a ValidationResult, appends
  # it to +results+ and prints it. Any StandardError raised while loading or
  # testing marks the dictionary as failed instead of aborting the run.
  #
  # @param entry [Object] catalog entry (responds to code, load, language, ...)
  # @param index [Integer] 1-based position of the entry in this run
  # @param total [Integer] number of entries in this run
  def validate_dictionary(entry, index, total)
    result = ValidationResult.new(entry.code)

    print_status(entry, index, total, result)

    begin
      dict = nil
      load_time = Benchmark.realtime { dict = entry.load }

      record_size_check(result, load_time, dict.size)

      # The slow tests only make sense for dictionaries that passed the size check.
      run_full_tests(dict, entry, result) if @options[:full] && result.success?
    rescue StandardError => e
      result.error!(e)
    end

    @results << result
    print_result(entry, result)
  end

  # Classifies a loaded dictionary by its word count.
  def record_size_check(result, load_time, size)
    if size.zero?
      result.warning!(load_time, size, "Dictionary has zero words")
    elsif size < MIN_EXPECTED_WORDS
      result.warning!(load_time, size, "Dictionary has fewer than 100 words")
    else
      result.success!(load_time, size)
    end
  end

  # Runs the --full smoke tests and records each outcome on +result+.
  #
  # @param dict [Object] loaded dictionary (responds to lookup?, suggest,
  #   case_sensitive?)
  # @param entry [Object] catalog entry; only its language is read
  # @param result [ValidationResult] receives one add_test_result per test
  def run_full_tests(dict, entry, result)
    test_word = basic_test_word(entry.language)

    check_basic_lookup(dict, result, test_word)
    check_nonexistent_lookup(dict, result, nonsense_test_word(entry.language))
    check_suggestions(dict, result, misspelled_test_word(entry.language))
    check_case_insensitivity(dict, result, test_word)
  end

  # Test 1: a very common word of the language must be found.
  def check_basic_lookup(dict, result, test_word)
    if dict.lookup?(test_word)
      result.add_test_result(:basic_lookup, true, test_word)
    else
      result.add_test_result(:basic_lookup, false, "Could not find '#{test_word}'")
    end
  end

  # Test 2: a nonsense word must not be found.
  def check_nonexistent_lookup(dict, result, nonsense_word)
    if dict.lookup?(nonsense_word)
      result.add_test_result(:nonexistent_lookup, false, "Incorrectly found '#{nonsense_word}'")
    else
      result.add_test_result(:nonexistent_lookup, true, nonsense_word)
    end
  end

  # Test 3: a misspelled word should yield at least one suggestion. An
  # exception from the dictionary is recorded as a failed test, not re-raised,
  # so one unsupported backend does not fail the whole dictionary.
  def check_suggestions(dict, result, misspelled)
    suggestions = dict.suggest(misspelled, max_suggestions: 5)
    if suggestions&.any?
      result.add_test_result(:suggestions, true, "Found #{suggestions.size} suggestions for '#{misspelled}'")
    else
      result.add_test_result(:suggestions, false, "No suggestions for '#{misspelled}'")
    end
  rescue StandardError => e
    result.add_test_result(:suggestions, false, e.message)
  end

  # Test 4: dictionaries that are not case-sensitive must find the upper-case
  # form of the basic word. The lower-case form is not consulted: the basic
  # words are already lower-case, so accepting it would make the test pass
  # whenever the basic lookup passes.
  def check_case_insensitivity(dict, result, test_word)
    return if dict.case_sensitive?

    variant = test_word.upcase
    # Skip scripts without case, and words whose upper-case form does not map
    # back to the original (no meaningful case-folded variant to test).
    return if variant == test_word || variant.downcase != test_word

    if dict.lookup?(variant)
      result.add_test_result(:case_insensitive, true, "Case-insensitive lookup works")
    else
      result.add_test_result(:case_insensitive, false, "Case-insensitive lookup failed")
    end
  end

  # @param language [String] language code
  # @return [String] a common word of that language ("a" when unknown)
  def basic_test_word(language)
    BASIC_TEST_WORDS.fetch(language, "a")
  end

  # @return [String] a word that should not exist in any dictionary
  def nonsense_test_word(_language)
    "zzzzzzzzz"
  end

  # @param language [String] language code
  # @return [String] a misspelled word of that language ("teest" when unknown)
  def misspelled_test_word(language)
    MISSPELLED_TEST_WORDS.fetch(language, "teest")
  end

  # Prints the banner and the catalog statistics.
  def print_header
    print "#{Colors::BOLD}Kotoshu Dictionary Validator#{Colors::RESET}\n"
    print "#{SEPARATOR}\n\n"

    stats = @catalog.statistics
    print "Catalog Statistics:\n"
    print " Total dictionaries: #{stats[:total]}\n"
    print " Hunspell dictionaries: #{stats[:hunspell]}\n"
    print " Plain text dictionaries: #{stats[:plain_text]}\n"
    print " Languages: #{stats[:languages]}\n"
    print " Total words: #{stats[:total_words].round}\n"
    print "\n"
    print "#{SEPARATOR}\n\n"
  end

  # Prints the progress line for the entry about to be validated.
  def print_status(entry, index, total, _result)
    print "[#{index}/#{total}] #{Colors::CYAN}#{entry.code}#{Colors::RESET} - #{entry.description}\n"
    print " Format: #{entry.format}, License: #{entry.license}\n"
  end

  # Prints the PASS/WARN/FAIL line for one result and, with --full, one line
  # per recorded test.
  def print_result(_entry, result)
    if result.success?
      print " #{Colors::GREEN}✓ PASS#{Colors::RESET}"
      print " - #{result.size.round} words, #{(result.load_time * 1000).round(1)}ms"
      print " - Tests: #{result.test_results.size}" if @options[:full]
    elsif result.warning?
      print " #{Colors::YELLOW}⚠ WARN#{Colors::RESET}"
      print " - #{result.size.round} words, #{(result.load_time * 1000).round(1)}ms"
      print " - #{result.test_results[:warning]}"
    else
      print " #{Colors::RED}✗ FAIL#{Colors::RESET}"
      print " - #{result.error.class}: #{result.error.message}"
    end
    print "\n"

    print_test_details(result) if @options[:full] && result.test_results.any?

    print "\n"
  end

  # Prints one line per test; the :warning entry is a message, not a test.
  def print_test_details(result)
    result.test_results.each do |name, test_result|
      next if name == :warning

      status = test_result[:passed] ? "#{Colors::GREEN}✓#{Colors::RESET}" : "#{Colors::RED}✗#{Colors::RESET}"
      print " #{status} #{name}: #{test_result[:details]}\n"
    end
  end

  # Prints totals, averages over the successful dictionaries, and the lists
  # of failed and warned dictionaries.
  def print_summary
    print "#{SEPARATOR}\n"
    print "#{Colors::BOLD}Validation Summary#{Colors::RESET}\n"
    print "#{SEPARATOR}\n\n"

    succeeded = @results.select(&:success?)
    warned = @results.select(&:warning?)
    failed = @results.select(&:error?)

    print "Total: #{@results.size}\n"
    print "#{Colors::GREEN}✓ Passed: #{succeeded.size}#{Colors::RESET}\n"
    print "#{Colors::YELLOW}⚠ Warnings: #{warned.size}#{Colors::RESET}\n"
    print "#{Colors::RED}✗ Failed: #{failed.size}#{Colors::RESET}\n"
    print "\n"

    print_averages(succeeded) if succeeded.any?
    print_failures(failed) if failed.any?
    print_warnings(warned) if warned.any?
  end

  # Prints mean load time and mean size of the successful dictionaries.
  def print_averages(succeeded)
    avg_load_time = succeeded.sum(&:load_time) / succeeded.size
    avg_size = succeeded.sum(&:size) / succeeded.size
    print "Average load time: #{(avg_load_time * 1000).round(1)}ms\n"
    print "Average size: #{avg_size.round} words\n"
    print "\n"
  end

  # Prints the code and error message of every failed dictionary.
  def print_failures(failed)
    print "#{Colors::BOLD}Failed Dictionaries:#{Colors::RESET}\n"
    failed.each do |result|
      print " #{Colors::RED}#{result.code}#{Colors::RESET}: #{result.error.message}\n"
    end
    print "\n"
  end

  # Prints the code and warning message of every warned dictionary.
  def print_warnings(warned)
    print "#{Colors::BOLD}Warnings:#{Colors::RESET}\n"
    warned.each do |result|
      print " #{Colors::YELLOW}#{result.code}#{Colors::RESET}: #{result.test_results[:warning]}\n"
    end
    print "\n"
  end

  # Writes the JSON report to REPORT_PATH and prints where it went.
  # Time#iso8601 comes from the "time" standard library on Rubies where it
  # is not part of core, so that library must be required by the file.
  def write_report
    report = {
      timestamp: Time.now.iso8601,
      summary: {
        total: @results.size,
        success: @results.count(&:success?),
        warnings: @results.count(&:warning?),
        errors: @results.count(&:error?)
      },
      results: @results.map(&:to_h)
    }
    File.write(REPORT_PATH, JSON.pretty_generate(report))
    print "Report written to: #{REPORT_PATH}\n"
  end

  # Exits with status 1 when any dictionary failed, 0 otherwise. Warnings do
  # not affect the exit status.
  def exit_with_code
    exit(@results.any?(&:error?) ? 1 : 0)
  end
end
|
|
403
|
+
|
|
404
|
+
# Command line entry point: parse the options, then run the validator.
# Defaults: quick validation of every catalog entry, no report file.
options = {
  full: false,
  lang: nil,
  code: nil,
  format: nil,
  report: false
}

parser = OptionParser.new do |opts|
  opts.banner = "Usage: ruby script/validate_all_dictionaries.rb [options]"

  opts.on("--full", "Run full validation including tests") { options[:full] = true }
  opts.on("--lang LANG", "Filter by language (e.g., en, de, fr)") { |lang| options[:lang] = lang }
  opts.on("--code CODE", "Filter by dictionary code (e.g., en-GB)") { |code| options[:code] = code }
  opts.on("--format FORMAT", "Filter by format (hunspell, plain_text)") { |fmt| options[:format] = fmt }
  opts.on("--report", "Write JSON report file") { options[:report] = true }

  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  # An unknown option or a missing argument is a usage error: show the
  # message and the usage text instead of a backtrace. The exit status
  # stays 1, as it was for the unhandled exception.
  warn e.message
  warn parser.help
  exit 1
end

# Run validator (DictionaryValidator#run terminates the process itself).
DictionaryValidator.new(options).run
|
data/sig/kotoshu.rbs
ADDED
data/test_oop.rb
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "bundler/setup"
|
|
4
|
+
require_relative "lib/kotoshu"
|
|
5
|
+
|
|
6
|
+
# Smoke script: calls the main object-oriented entry points in turn and prints
# what each call returns. Nothing is asserted; an exception raised by any call
# is the only failure signal.

# Test IndexedDictionary
puts "=== Testing IndexedDictionary ==="
dict = Kotoshu::Core::IndexedDictionary.new(%w[hello world help held heap])
exact_hit = dict.has_word?("hello")
puts "Has 'hello': #{exact_hit}"
folded_hit = dict.has_word_ignorecase?("HELLO")
puts "Has 'HELLO' (ignorecase): #{folded_hit}"
by_prefix = dict.find_by_prefix("he")
puts "Words starting with 'he': #{by_prefix.inspect}"
by_suffix = dict.find_by_suffix("ld")
puts "Words ending with 'ld': #{by_suffix.inspect}"
by_length = dict.find_by_length(5)
puts "Words with length 5: #{by_length.inspect}"
dict_stats = dict.statistics
puts "Statistics: #{dict_stats.inspect}"
puts

# Test Trie
puts "=== Testing Trie ==="
trie = Kotoshu::Core::Trie::Builder.from_array(%w[hello help held heap world])
trie_hit = trie.lookup("hello")
puts "Has 'hello': #{trie_hit}"
prefix_known = trie.has_prefix?("he")
puts "Has prefix 'he': #{prefix_known}"
prefixed_words = trie.words_with_prefix("he")
puts "Words with prefix 'he': #{prefixed_words.inspect}"
trie_suggestions = trie.suggestions("hel")
puts "Suggestions for 'hel': #{trie_suggestions.inspect}"
every_word = trie.all_words
puts "All words: #{every_word.inspect}"
puts

# Test Suggestion
puts "=== Testing Suggestion ==="
suggestion = Kotoshu::Suggestions::Suggestion.new(word: "hello", distance: 1, confidence: 0.9, source: :test)
confident = suggestion.high_confidence?
puts "High confidence: #{confident}"
score = suggestion.combined_score
puts "Combined score: #{score}"
same = suggestion.same_word?("HELLO")
puts "Same word as 'HELLO': #{same}"
puts

# Test SuggestionSet
puts "=== Testing SuggestionSet ==="
suggestion_set = Kotoshu::Suggestions::SuggestionSet.from_words(%w[hello help held], source: :test)
set_size = suggestion_set.size
puts "Size: #{set_size}"
head = suggestion_set.first
puts "First: #{head.inspect}"
contains_help = suggestion_set.has_word?("help")
puts "Has word 'help': #{contains_help}"
top_words = suggestion_set.top(2).map(&:word)
puts "Top 2: #{top_words.inspect}"
puts

# Test Context
puts "=== Testing Context ==="
context = Kotoshu::Suggestions::Context.new(word: "helo", dictionary: dict, max_results: 5)
puts "Word: #{context.word}"
puts "Max results: #{context.max_results}"
puts

# Test EditDistanceStrategy
puts "=== Testing EditDistanceStrategy ==="
edit_strategy = Kotoshu::Suggestions::Strategies::EditDistanceStrategy.new
edit_result = edit_strategy.generate(context)
puts "Suggestions for 'helo': #{edit_result.to_words.inspect}"
puts

# Test CompositeStrategy (Pipeline)
puts "=== Testing CompositeStrategy ==="
pipeline_strategy = Kotoshu::Suggestions::Strategies::EditDistanceStrategy.new
pipeline = Kotoshu.suggestion_pipeline(pipeline_strategy)
pipeline_result = pipeline.generate(context)
puts "Pipeline suggestions: #{pipeline_result.to_words.inspect}"
puts

puts "All tests passed!"
|