kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Example 7: Multi-Language Dictionaries
|
|
5
|
+
#
|
|
6
|
+
# This example demonstrates how to use Kotoshu with multiple languages
|
|
7
|
+
# by loading dictionaries from the kotoshu/dictionaries repository.
|
|
8
|
+
|
|
9
|
+
require_relative "../lib/kotoshu"
require "benchmark"

# Width of the horizontal rules printed around every example section.
RULE_WIDTH = 40

# Prints one example section: the title, a dashed rule, whatever the block
# prints, and finally the blank / "=" rule / blank separator that closes
# every section of this script.
#
# @param title [String] heading line of the section
# @yield the body of the section
# @return [void]
def section(title)
  puts title
  puts "-" * RULE_WIDTH
  yield
  puts
  puts "=" * RULE_WIDTH
  puts
end

puts "=== Example 7: Multi-Language Dictionaries ==="
puts

# The catalog is shared by every section below.
catalog = Kotoshu::Dictionaries::Catalog

# Example 1: Load a specific dictionary by code
section("1. Loading a Dictionary by Code") do
  # Find and load British English dictionary
  en_gb_entry = catalog.find("en-GB")
  if en_gb_entry
    puts "Found: #{en_gb_entry.description}"
    puts "Source: #{en_gb_entry.source}"
    puts "License: #{en_gb_entry.license}"
    puts "URL: #{en_gb_entry.dic_url}"
    puts

    # Loading can fail; report it the same way the later examples do
    # instead of aborting the whole demo.
    begin
      en_gb_dict = en_gb_entry.load
      puts "Loaded #{en_gb_dict.size} words"
      puts "Has 'colour': #{en_gb_dict.lookup?("colour")}"
      puts "Has 'color': #{en_gb_dict.lookup?("color")}"
    rescue StandardError => e
      puts "✗ Failed to load #{en_gb_entry.name}: #{e.message}"
    end
  else
    puts "Dictionary not found"
  end
end

# Example 2: List all dictionaries for a language
section("2. All English Dictionaries") do
  english_dicts = catalog.by_language("en")
  puts "Found #{english_dicts.size} English dictionaries:"
  english_dicts.each do |entry|
    puts " #{entry.code}: #{entry.name} (#{entry.word_count} words)"
  end
end

# Example 3: List all available languages
section("3. All Available Languages") do
  languages = catalog.languages
  puts "Supported languages (#{languages.size}):"
  puts languages.join(", ")
end

# Example 4: Create spellcheckers for different languages
section("4. Multi-Language Spellcheckers") do
  # Load dictionaries for multiple languages
  spellcheckers = {}
  %w[en de es fr].each do |lang|
    entry = catalog.find(lang)
    next unless entry

    begin
      dict = entry.load
      spellcheckers[lang] = Kotoshu::Spellchecker.new(dictionary: dict)
      puts "✓ Loaded #{entry.name}: #{dict.size} words"
    rescue StandardError => e
      puts "✗ Failed to load #{entry.name}: #{e.message}"
    end
  end

  puts
  puts "Testing multi-language spellchecking:"
  puts

  # Test words in different languages
  test_cases = {
    "en" => { correct: "hello", incorrect: "helo" },
    "de" => { correct: "hallo", incorrect: "hllo" },
    "es" => { correct: "hola", incorrect: "hla" },
    "fr" => { correct: "bonjour", incorrect: "bnjour" }
  }

  test_cases.each do |lang, words|
    checker = spellcheckers[lang]
    next unless checker # dictionary was missing or failed to load above

    correct_result = checker.correct?(words[:correct])
    incorrect_result = checker.check_word(words[:incorrect])

    status = correct_result ? "✓" : "✗"
    puts "#{status} #{lang.upcase} '#{words[:correct]}': #{correct_result}"

    next unless incorrect_result.has_suggestions?

    suggestions = incorrect_result.top_suggestions(3).join(", ")
    puts " Suggestions for '#{words[:incorrect]}': #{suggestions}"
  end
end

# Example 5: Hunspell vs Plain Text formats
section("5. Dictionary Formats") do
  hunspell_dicts = catalog.hunspell
  plain_text_dicts = catalog.plain_text

  puts "Hunspell dictionaries: #{hunspell_dicts.size}"
  puts "Plain text dictionaries: #{plain_text_dicts.size}"
  puts

  # Show some examples of each
  puts "Hunspell examples:"
  hunspell_dicts.first(5).each do |entry|
    puts " #{entry.code}: #{entry.description}"
  end

  puts
  puts "Plain text examples:"
  plain_text_dicts.each do |entry|
    puts " #{entry.code}: #{entry.description}"
  end
end

# Example 6: Filter by license
section("6. Dictionaries by License") do
  public_domain = catalog.by_license("Public Domain")
  gpl = catalog.by_license("GPL")

  puts "Public Domain dictionaries: #{public_domain.size}"
  public_domain.each do |entry|
    puts " #{entry.code}: #{entry.name}"
  end

  puts
  puts "GPL dictionaries: #{gpl.size}"
  gpl.first(5).each do |entry|
    puts " #{entry.code}: #{entry.name}"
  end
end

# Example 7: Catalog statistics
section("7. Catalog Statistics") do
  stats = catalog.statistics

  puts "Total dictionaries: #{stats[:total]}"
  puts " Hunspell: #{stats[:hunspell]}"
  puts " Plain text: #{stats[:plain_text]}"
  puts
  puts "Languages: #{stats[:languages]}"
  puts "Total words: #{stats[:total_words].round}"
  puts
  puts "By format:"
  stats[:formats].each do |format, count|
    puts " #{format}: #{count}"
  end

  puts
  puts "By license:"
  stats[:licenses].each do |license, count|
    puts " #{license}: #{count}"
  end
end

# Example 8: Create spellcheckers with regional variants
section("8. English Regional Variants") do
  %w[en en-GB en-CA en-AU en-ZA].each do |code|
    entry = catalog.find(code)
    next unless entry

    begin
      checker = Kotoshu::Spellchecker.new(dictionary: entry.load)

      # Test a word with different spellings
      colour_result = checker.correct?("colour")
      color_result = checker.correct?("color")

      puts "#{entry.name}:"
      puts " 'colour': #{colour_result ? "✓" : "✗"}"
      puts " 'color': #{color_result ? "✓" : "✗"}"
    rescue StandardError => e
      puts "#{entry.name}: ✗ Error - #{e.message}"
    end
  end
end

# Example 9: Loading large dictionaries with performance
section("9. Large Dictionary Performance") do
  %w[en de es fr ru].each do |lang|
    entry = catalog.find(lang)
    next unless entry

    begin
      # Time the full round trip: load, build a checker, run one lookup.
      load_time = Benchmark.realtime do
        checker = Kotoshu::Spellchecker.new(dictionary: entry.load)
        checker.correct?("hello")
      end

      puts "#{entry.name}: #{(load_time * 1000).round(1)}ms (load + check)"
    rescue StandardError => e
      puts "#{entry.name}: ✗ Error - #{e.message}"
    end
  end
end

# Example 10: Dictionary metadata
section("10. Dictionary Metadata") do
  entry = catalog.find("ru")
  if entry
    puts "Code: #{entry.code}"
    puts "Name: #{entry.name}"
    puts "Language: #{entry.language}"
    puts "Region: #{entry.region || "N/A"}"
    puts "Format: #{entry.format}"
    puts "Source: #{entry.source}"
    puts "License: #{entry.license}"
    puts "Word count: #{entry.word_count}"
    puts "Dictionary URL: #{entry.dic_url}"
    puts "Affix URL: #{entry.aff_url}" if entry.aff_url
    puts "Metadata: #{entry.metadata.inspect}"
  end
end

puts "For more information, see:"
puts " https://github.com/kotoshu/dictionaries"
puts
|
data/exe/kotoshu
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Algorithms
|
|
5
|
+
# Capitalization handling for different languages.
|
|
6
|
+
#
|
|
7
|
+
# Ported from Spylls (Python) capitalization.py
|
|
8
|
+
#
|
|
9
|
+
# This module provides capitalization detection and conversion for different
|
|
10
|
+
# language casing rules, including special handling for Turkic and German languages.
|
|
11
|
+
module Capitalization
  # Type of capitalization detected by Casing#guess.
  #
  # NO:: all lowercase ("foo")
  # INIT:: titlecase, only initial letter is capitalized ("Foo")
  # ALL:: all uppercase ("FOO")
  # HUH:: mixed capitalization ("fooBar")
  # HUHINIT:: mixed capitalization, first letter is capitalized ("FooBar")
  module Type
    NO = :no
    INIT = :init
    ALL = :all
    HUH = :huh
    HUHINIT = :huhinit
  end

  # Base class for casing-related algorithms specific for dictionary's language.
  #
  # This is a class (not a set of functions) because language-specific
  # subclasses (TurkicCasing, GermanCasing) override only some of the
  # methods and inherit the rest.
  class Casing
    # Guess word's capitalization. Redefined in GermanCasing.
    #
    # @param word [String] The word to analyze
    # @return [Symbol] One of the Type constants
    def guess(word)
      return Type::NO if word.downcase == word
      return Type::ALL if word.upcase == word

      # From here on the word is non-empty and has mixed case.
      head = word[0]
      tail = word[1..]
      initial_upper = head.upcase == head
      return Type::INIT if initial_upper && tail.downcase == tail

      initial_upper ? Type::HUHINIT : Type::HUH
    end

    # Lowercases the word. Returns a list of possible lowercasings so that
    # all casing classes behave consistently.
    #
    # In GermanCasing (and only there), lowercasing a word like "STRASSE"
    # produces two possibilities: "strasse" and "straße" (ß is most of the
    # time upcased to SS, so we can't decide which of the downcased words is
    # "right" and need to check both).
    #
    # Also redefined in TurkicCasing, because in Turkic languages lowercase
    # "i" is uppercased as "İ", and uppercase "I" is downcased as "ı".
    #
    # @param word [String, nil] The word to lowercase
    # @return [Array<String>] List of possible lowercasings; empty for a
    #   nil/empty word or a word starting with "İ"
    def lower(word)
      # Can't be properly lowercased in non-Turkic collation
      return [] if word.nil? || word.empty? || word[0] == 'İ'

      # Turkic "lowercase dot i" (i + U+0307 combining dot above) to
      # latinic "i", just in case
      [word.downcase.gsub("i\u0307", 'i')]
    end

    # Uppercase the word. Redefined in TurkicCasing, because in Turkic
    # languages lowercase "i" is uppercased as "İ", and uppercase "I"
    # is downcased as "ı".
    #
    # @param word [String] The word to uppercase
    # @return [String] Uppercased word
    def upper(word)
      word.upcase
    end

    # Capitalize (convert word to all lowercase and first letter uppercase).
    # Yields every result for the same reasons #lower returns a list.
    # A nil or empty word yields nothing.
    #
    # @param word [String, nil] The word to capitalize
    # @yieldparam variant [String] one capitalized variant
    # @return [Enumerator<String>] Enum of capitalized variants when no
    #   block is given
    def capitalize(word)
      return enum_for(:capitalize, word) unless block_given?
      return if word.nil? || word.empty?

      initial = upper(word[0])
      if word.length == 1
        # lower("") is empty, so a one-letter word is handled directly.
        yield initial
      else
        lower(word[1..]).each { |rest| yield initial + rest }
      end
    end

    # Just change the case of the first letter to lower.
    # Yields every result for the same reasons #lower returns a list.
    # A nil or empty word yields nothing.
    #
    # @param word [String, nil] The word to process
    # @yieldparam variant [String] one variant with lowercased first letter
    # @return [Enumerator<String>] Enum of variants when no block is given
    def lowerfirst(word)
      return enum_for(:lowerfirst, word) unless block_given?
      return if word.nil? || word.empty?

      tail = word[1..]
      lower(word[0]).each { |initial| yield initial + tail }
    end

    # Returns hypotheses of how the word might have been cased (in dictionary),
    # if we consider it is spelled correctly.
    #
    # Example: If word is "Kitten", hypotheses are "Kitten", "kitten".
    #
    # @param word [String] The word to analyze
    # @return [Array(Symbol, Array<String>)] Pair of [captype, variants]
    def variants(word)
      captype = guess(word)

      result = case captype
               when Type::NO, Type::HUH
                 [word]
               when Type::INIT
                 [word, *lower(word)]
               when Type::HUHINIT
                 [word, *lowerfirst(word)]
               when Type::ALL
                 [word, *lower(word), *capitalize(word)]
               end

      [captype, result]
    end

    # Returns hypotheses of how the word might have been cased if it is a
    # misspelling.
    #
    # Example: "DiCtionary" (HUHINIT capitalization) produces hypotheses
    # "DiCtionary", "diCtionary", "dictionary", "Dictionary".
    #
    # @param word [String] The word to analyze
    # @return [Array(Symbol, Array<String>)] Pair of [captype, variants]
    def corrections(word)
      captype = guess(word)

      result = case captype
               when Type::NO
                 [word]
               when Type::INIT, Type::HUH
                 [word, *lower(word)]
               when Type::HUHINIT
                 [word, *lowerfirst(word), *lower(word), *capitalize(word)]
               when Type::ALL
                 [word, *lower(word), *capitalize(word)]
               end

      [captype, result]
    end

    # Given a known (valid) suggestion and the initial word's
    # capitalization, produce the proper suggestion capitalization.
    #
    # Example: If misspelling was "Kiten" (INIT capitalization),
    # found suggestion "kitten", then this method makes it "Kitten".
    #
    # @param word [String] The valid suggestion word
    # @param cap [Symbol] Original word's capitalization type
    # @return [String] Properly capitalized suggestion (a nil or empty
    #   word is returned unchanged)
    def coerce(word, cap)
      return word if word.nil? || word.empty?

      case cap
      when Type::INIT, Type::HUHINIT
        upper(word[0]) + word[1..]
      when Type::ALL
        upper(word)
      else
        word
      end
    end
  end

  # Redefines upper and lower, because in Turkic languages lowercase "i"
  # is uppercased as "İ", and uppercase "I" is downcased as "ı".
  #
  # Example:
  #   turkic = Kotoshu::Algorithms::Capitalization::TurkicCasing.new
  #   turkic.lower('Izmir')  # => ['ızmir']
  #   turkic.upper('Izmir')  # => 'IZMİR'
  class TurkicCasing < Casing
    # Uppercase-to-lowercase overrides applied before the generic downcase.
    U2L = {
      'İ' => 'i',
      'I' => 'ı'
    }.freeze

    # Lowercase-to-uppercase overrides applied before the generic upcase.
    L2U = {
      'i' => 'İ',
      'ı' => 'I'
    }.freeze

    # Translate uppercase Turkic characters to lowercase, then lowercase
    # the rest with the generic rules.
    #
    # @param word [String, nil] The word to lowercase
    # @return [Array<String>] List of lowercased variants
    def lower(word)
      return [] if word.nil? # same contract as Casing#lower

      super(word.each_char.map { |ch| U2L.fetch(ch, ch) }.join)
    end

    # Translate lowercase Turkic characters to uppercase, then uppercase
    # the rest with the generic rules.
    #
    # @param word [String] The word to uppercase
    # @return [String] Uppercased word
    def upper(word)
      super(word.each_char.map { |ch| L2U.fetch(ch, ch) }.join)
    end
  end

  # Redefines lower because in German "SS" can be lowercased both as "ss" and "ß".
  #
  # Example:
  #   german = Kotoshu::Algorithms::Capitalization::GermanCasing.new
  #   german.lower('STRASSE')  # => ['straße', 'strasse']
  class GermanCasing < Casing
    # Generate sharp S (ß) variants for all "ss" occurrences.
    #
    # For the first "ss" found at or after +start+, emits the text with that
    # pair replaced by "ß", then recurses both on the replaced text (to
    # combine with later replacements) and on the untouched text past the
    # pair (to cover variants that keep this "ss").
    #
    # @param text [String] The text to process
    # @param start [Integer] Starting position for search
    # @return [Array<String>] All variants with ß replacements
    def sharp_s_variants(text, start = 0)
      pos = text.index('ss', start)
      return [] unless pos

      replaced = "#{text[0...pos]}ß#{text[(pos + 2)..]}"
      [
        replaced,
        *sharp_s_variants(replaced, pos + 1),
        *sharp_s_variants(text, pos + 2)
      ]
    end

    # Lowercase word, generating both "ss" and "ß" variants where applicable.
    #
    # @param word [String, nil] The word to lowercase
    # @return [Array<String>] List of lowercased variants; empty whenever
    #   the generic lowercasing yields nothing
    def lower(word)
      lowered = super.first
      # The base class returns [] for nil/empty/"İ"-initial words; pass that
      # through instead of leaking a nil into the result list.
      return [] if lowered.nil?
      return [lowered] unless word.include?('SS')

      [*sharp_s_variants(lowered), lowered]
    end

    # Guess word's capitalization, accounting for German ß handling.
    #
    # In German uppercased words, ß (which is lowercase, and usually uppercased
    # as SS) is allowed: "straße" => "STRAßE"
    #
    # @param word [String] The word to analyze
    # @return [Symbol] One of the Type constants
    def guess(word)
      result = super

      # A word that is all caps once every ß is dropped counts as ALL.
      if word.include?('ß')
        without_eszett = word.gsub('ß', '')
        return Type::ALL if super(without_eszett) == Type::ALL
      end

      result
    end
  end
end
|
|
275
|
+
end
|
|
276
|
+
end
|