kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "open-uri"
|
|
4
|
+
require_relative "base"
|
|
5
|
+
|
|
6
|
+
module Kotoshu
|
|
7
|
+
module Dictionary
|
|
8
|
+
# Plain text dictionary backend.
|
|
9
|
+
#
|
|
10
|
+
# This dictionary reads from simple plain text word lists,
|
|
11
|
+
# with support for comments and various formatting options.
|
|
12
|
+
#
|
|
13
|
+
# File format:
|
|
14
|
+
# - One word per line
|
|
15
|
+
# - Lines starting with # are comments
|
|
16
|
+
# - Empty lines are ignored
|
|
17
|
+
# - Supports multi-word phrases (e.g., "New York")
|
|
18
|
+
#
|
|
19
|
+
# @example Creating from a file
|
|
20
|
+
# dict = PlainText.new("words.txt", language_code: "en-US")
|
|
21
|
+
# dict.lookup?("hello") # => true
|
|
22
|
+
#
|
|
23
|
+
# @example Creating from a URL
|
|
24
|
+
# dict = PlainText.new("https://raw.githubusercontent.com/kotoshu/dictionaries/main/en_US/words.txt",
|
|
25
|
+
# language_code: "en-US")
|
|
26
|
+
#
|
|
27
|
+
# @example Creating from an array
|
|
28
|
+
# dict = PlainText.from_words(%w[hello world test], language_code: "en")
|
|
29
|
+
class PlainText < Base
|
|
30
|
+
# @return [String] The path to the dictionary file (or nil if created from array)
|
|
31
|
+
attr_reader :path
|
|
32
|
+
|
|
33
|
+
# @return [Boolean] Whether lookups are case-sensitive
|
|
34
|
+
attr_reader :case_sensitive
|
|
35
|
+
|
|
36
|
+
# @return [Regexp, nil] Pattern for word filtering
|
|
37
|
+
attr_reader :word_pattern
|
|
38
|
+
|
|
39
|
+
# Build a dictionary from a plain-text word list on disk or at a URL.
#
# The word list is read eagerly: by the time construction finishes every
# accepted word is held in memory.
#
# @param path [String] Local file path, or an http(s) URL to download
# @param language_code [String] The language code
# @param locale [String, nil] The locale (optional)
# @param case_sensitive [Boolean] When false, words are stored and
#   looked up in lowercase
# @param word_pattern [Regexp, nil] Only words matching this pattern are
#   kept (optional)
# @param metadata [Hash] Additional metadata (optional)
def initialize(path, language_code:, locale: nil, case_sensitive: false,
               word_pattern: nil, metadata: {})
  super(language_code, locale: locale, metadata: metadata)

  # Filtering options are set first because load_words reads both.
  @word_pattern = word_pattern
  @case_sensitive = case_sensitive

  # Keep what the caller passed as well as the resolved local path.
  @original_path = path
  @path = resolve_path(path)

  @words = load_words(@path)
  @word_set = build_word_set

  # Register the :plain_text type only if the registry lacks it.
  return if Dictionary.registry.key?(:plain_text)

  self.class.register_type(:plain_text)
end
|
|
61
|
+
|
|
62
|
+
# Report whether a word is present in the dictionary.
#
# When the dictionary is case-insensitive the word is lowercased before
# it is compared against the stored entries.
#
# @param word [String, nil] The word to look up
# @return [Boolean] True if the word exists; false for nil or ""
def lookup(word)
  if word.nil? || word.empty?
    false
  else
    @word_set.key?(@case_sensitive ? word : word.downcase)
  end
end
|
|
72
|
+
|
|
73
|
+
# Generate spelling suggestions.
#
# Candidates are dictionary words that share a prefix with the input;
# those within an edit distance of 1..2 are returned, closest first.
# Each word is suggested at most once, and words at the same distance
# are ordered alphabetically so the result is deterministic.
#
# @param word [String, nil] The misspelled word
# @param max_suggestions [Integer] Maximum suggestions
# @return [Array<String>] List of suggested words (empty for nil or "")
def suggest(word, max_suggestions: 10)
  return [] if word.nil? || word.empty?

  lookup_word = @case_sensitive ? word : word.downcase

  # NOTE(review): with `.max` the prefix is every character but the last
  # (and the whole word for inputs of up to three characters), so only
  # typos at the very end of a word can be corrected. `.min` may have
  # been intended; left unchanged until the intent is confirmed.
  prefix_len = [lookup_word.length - 1, 3].max
  prefix = lookup_word[0...prefix_len]

  @words
    .select { |dict_word| dict_word.start_with?(prefix) }
    .uniq # the word list may contain duplicates; suggest each word once
    .map { |dict_word| [dict_word, edit_distance(lookup_word, dict_word)] }
    .select { |_, dist| dist.positive? && dist <= 2 }
    .sort_by { |dict_word, dist| [dist, dict_word] } # sort_by is unstable; break ties by word
    .first(max_suggestions)
    .map(&:first)
end
|
|
99
|
+
|
|
100
|
+
# Append a word to the in-memory dictionary.
#
# The word is lowercased first unless the dictionary is case-sensitive.
# Nothing is written back to the source file.
#
# @param word [String, nil] The word to add
# @param flags [Array<String>] Accepted for interface compatibility;
#   not used by this method
# @return [Boolean] True if added; false for nil, "" or a word that is
#   already present
def add_word(word, flags: [])
  return false if word.nil? || word.empty?

  entry = @case_sensitive ? word : word.downcase
  return false if @word_set.key?(entry)

  # The new entry lands at the end of @words, i.e. at the current length.
  @word_set[entry] = @words.length
  @words.push(entry)
  true
end
|
|
116
|
+
|
|
117
|
+
# Remove a word from the in-memory dictionary.
#
# The word is lowercased first unless the dictionary is case-sensitive.
# Every occurrence of the word is dropped from the word list and the
# word-to-index map is rebuilt, so the indices of the remaining words
# stay correct across repeated removals.
#
# @param word [String, nil] The word to remove
# @return [Boolean] True if removed; false for nil, "" or an unknown word
def remove_word(word)
  return false if word.nil? || word.empty?

  lookup_word = @case_sensitive ? word : word.downcase
  return false unless @word_set.key?(lookup_word)

  # Array#delete removes every matching entry, so duplicates that came
  # from the source file disappear together with the indexed one.
  @words.delete(lookup_word)

  # Removing an element shifts everything after it; re-index so each
  # stored position matches @words again.
  @word_set = @words.each_with_index.to_h

  true
end
|
|
132
|
+
|
|
133
|
+
# List every word held by the dictionary.
#
# A fresh array is returned on each call, so callers may modify the
# result without affecting the dictionary's own word list.
#
# @return [Array<String>] All words
def words
  Array.new(@words)
end
|
|
139
|
+
|
|
140
|
+
# Build a dictionary directly from a list of words, without a file.
#
# The instance is created with +allocate+, so +initialize+ is not run;
# the instance variables below are assigned by hand instead.
#
# @param words [Array<String>] The words
# @param language_code [String] The language code
# @param locale [String, nil] The locale (optional)
# @param case_sensitive [Boolean] When false, words are stored lowercase
# @return [PlainText] New dictionary whose path is nil
#
# @example
#   dict = PlainText.from_words(%w[hello world test], language_code: "en")
def self.from_words(words, language_code:, locale: nil, case_sensitive: false)
  entries = words.dup.map { |entry| case_sensitive ? entry : entry.downcase }

  state = {
    :@language_code => language_code.dup.freeze,
    :@locale => locale&.dup&.freeze,
    :@path => nil,
    :@case_sensitive => case_sensitive,
    :@word_pattern => nil,
    :@words => entries,
    :@word_set => entries.each_with_index.to_h,
    :@metadata => {}.freeze
  }

  dict = allocate
  state.each { |ivar, value| dict.instance_variable_set(ivar, value) }

  # Register the :plain_text type only if the registry lacks it.
  register_type(:plain_text) unless Dictionary.registry.key?(:plain_text)

  dict
end
|
|
167
|
+
|
|
168
|
+
# Create a dictionary from a string.
#
# Each line is trimmed of surrounding whitespace. Lines that are blank
# after trimming, and lines whose first non-blank character is "#", are
# skipped; everything else becomes one word (or phrase).
#
# @param text [String] The text containing words (newline separated)
# @param language_code [String] The language code
# @param locale [String, nil] The locale (optional)
# @param case_sensitive [Boolean] Whether lookups are case-sensitive
# @return [PlainText] New dictionary
#
# @example
#   text = "hello\nworld\ntest"
#   dict = PlainText.from_string(text, language_code: "en")
def self.from_string(text, language_code:, locale: nil, case_sensitive: false)
  # Strip before filtering: a whitespace-only line is not `empty?` until
  # it has been stripped, and would otherwise be added as the word "".
  words = text.split("\n")
              .map(&:strip)
              .reject { |line| line.empty? || line.start_with?("#") }

  from_words(words, language_code: language_code, locale: locale,
                    case_sensitive: case_sensitive)
end
|
|
186
|
+
|
|
187
|
+
private
|
|
188
|
+
|
|
189
|
+
# Turn the constructor's path argument into a local file path.
#
# URLs are downloaded to a temporary file; anything else is expanded to
# an absolute path.
#
# @param path [String] File path or URL
# @return [String] Local file path
def resolve_path(path)
  url?(path) ? download_to_temp(path) : File.expand_path(path)
end
|
|
199
|
+
|
|
200
|
+
# Check if path is an http(s) URL.
#
# The scheme is matched case-insensitively, so "HTTPS://..." counts as a
# URL. The argument is converted with +to_s+ first, which lets
# path-like objects such as Pathname be tested without raising.
#
# @param path [String, #to_s] Path to check
# @return [Boolean] True if the value starts with http:// or https://
def url?(path)
  path.to_s.match?(%r{\Ahttps?://}i)
end
|
|
207
|
+
|
|
208
|
+
# Download URL to temporary file.
#
# The Tempfile object is kept in an instance variable: Ruby deletes a
# Tempfile's file once the object is garbage collected, so returning
# only the path would leave a file that can vanish before it is read.
#
# NOTE(review): the downloaded content is used as-is; no checksum or
# size limit is applied here.
#
# @param url [String] URL to download
# @return [String] Path to downloaded file (kept on disk for as long as
#   this dictionary object is alive)
# @raise [StandardError] Whatever URI.parse / URI.open raise on failure;
#   the partial temp file is removed before re-raising
def download_to_temp(url)
  require "tempfile"

  uri = URI.parse(url)
  filename = File.basename(uri.path)

  temp = Tempfile.new([filename, ".txt"], encoding: "UTF-8")
  temp.binmode

  begin
    URI.open(uri, "rb") do |remote_file|
      IO.copy_stream(remote_file, temp)
    end
  rescue StandardError
    # Do not leave an open handle or a half-written file behind.
    temp.close!
    raise
  end

  temp.close

  # Hold a reference so the finalizer does not unlink the file.
  @download_tempfile = temp
  temp.path
end
|
|
228
|
+
|
|
229
|
+
# Load words from dictionary file.
#
# Each line is trimmed of surrounding whitespace. Lines that are blank
# after trimming, and lines whose first non-blank character is "#", are
# skipped. Remaining words are kept only if they match @word_pattern
# (when one is set) and are lowercased unless the dictionary is
# case-sensitive.
#
# @param path [String] The file path
# @return [Array<String>] List of words, in file order
# @raise [DictionaryNotFoundError] If no file exists at path
def load_words(path)
  raise DictionaryNotFoundError, path unless File.exist?(path)

  File.foreach(path, chomp: true).each_with_object([]) do |line, words|
    # Strip before filtering: a whitespace-only line is not `empty?`
    # until stripped, and would otherwise be stored as the word "".
    word = line.strip
    next if word.empty? || word.start_with?("#")
    next unless @word_pattern.nil? || word.match?(@word_pattern)

    words << (@case_sensitive ? word : word.downcase)
  end
end
|
|
242
|
+
|
|
243
|
+
# Index the loaded words for constant-time membership checks.
#
# Each word maps to its position in @words; when a word occurs more than
# once, the position of its last occurrence is the one recorded.
#
# @return [Hash{String => Integer}] Word to index mapping
def build_word_set
  @words.each_with_index.each_with_object({}) do |(entry, position), index|
    index[entry] = position
  end
end
|
|
249
|
+
|
|
250
|
+
# Compute the Levenshtein distance between two strings: the minimum
# number of single-character insertions, deletions and substitutions
# needed to turn one into the other.
#
# A single row of the distance table is kept and updated in place, with
# the shorter string laid out along that row.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Integer] Edit distance
def edit_distance(str1, str2)
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  shorter, longer = str1.length > str2.length ? [str2, str1] : [str1, str2]
  short_chars = shorter.chars

  # row[k] starts as the cost of building the first k chars of `shorter`
  # from an empty string.
  row = (0..short_chars.length).to_a

  longer.each_char.with_index(1) do |long_char, row_number|
    diagonal = row[0] # value from the previous row, one column to the left
    row[0] = row_number

    short_chars.each_with_index do |short_char, col|
      above = row[col + 1] # value from the previous row, same column
      substitution = diagonal + (short_char == long_char ? 0 : 1)

      row[col + 1] = [row[col] + 1, above + 1, substitution].min
      diagonal = above
    end
  end

  row.last
end
|
|
280
|
+
end
|
|
281
|
+
end
|
|
282
|
+
end
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base"
|
|
4
|
+
|
|
5
|
+
module Kotoshu
|
|
6
|
+
module Dictionary
|
|
7
|
+
# Repository for managing multiple dictionary instances.
#
# This class provides a centralized registry for dictionaries,
# allowing them to be registered and retrieved by key. Keys are stored
# as Symbols: String keys are converted with +to_sym+ everywhere, so
# :en_US and "en_US" always name the same entry.
#
# @example Registering and retrieving dictionaries
#   repo = Repository.new
#   repo.register(:en_US, unix_dict)
#   repo.register(:custom, custom_dict)
#   repo.get(:en_US)  # => unix_dict
#
# @example Using the global repository
#   Repository.register(:en_US, dict)
#   Repository.get(:en_US)
class Repository
  # @return [Hash] The dictionary storage (the live Hash, not a copy)
  attr_reader :dictionaries

  # Create a new repository.
  #
  # The given hash is copied, and its keys are normalized to Symbols so
  # that entries supplied with String keys can be found by #get.
  #
  # @param dictionaries [Hash] Initial dictionaries (optional)
  def initialize(dictionaries = {})
    @dictionaries = symbolize_keys(dictionaries)
  end

  # Register a dictionary, replacing any entry already stored under key.
  #
  # @param key [Symbol, String] The key to register under
  # @param dictionary [Base] The dictionary instance
  # @return [self] Self for chaining
  #
  # @example
  #   repo.register(:en_US, unix_dict)
  def register(key, dictionary)
    @dictionaries[key.to_sym] = dictionary
    self
  end
  alias add register
  alias []= register

  # Get a dictionary by key.
  #
  # @param key [Symbol, String] The key
  # @return [Base, nil] The dictionary or nil if not found
  #
  # @example
  #   repo.get(:en_US)
  def get(key)
    @dictionaries[key.to_sym]
  end
  alias [] get

  # Check if a key is registered.
  #
  # @param key [Symbol, String] The key
  # @return [Boolean] True if the key exists
  #
  # @example
  #   repo.registered?(:en_US)  # => true
  def registered?(key)
    @dictionaries.key?(key.to_sym)
  end
  alias has_key? registered?
  alias key? registered?

  # Unregister a dictionary.
  #
  # @param key [Symbol, String] The key
  # @return [Base, nil] The removed dictionary or nil
  #
  # @example
  #   repo.unregister(:en_US)
  def unregister(key)
    @dictionaries.delete(key.to_sym)
  end
  alias remove unregister

  # Clear all dictionaries.
  #
  # @return [self] Self for chaining
  def clear
    @dictionaries.clear
    self
  end

  # Get all registered keys.
  #
  # @return [Array<Symbol>] All keys
  def keys
    @dictionaries.keys
  end

  # Iterate over registered keys.
  #
  # @yield [key] Block to execute for each key
  # @return [Enumerator] Enumerator if no block given
  def each_key(&block)
    return enum_for(:each_key) unless block_given?

    @dictionaries.each_key(&block)
  end

  # Get all dictionaries.
  #
  # @return [Array<Base>] All dictionaries
  def values
    @dictionaries.values
  end

  # Get the number of registered dictionaries.
  #
  # @return [Integer] Dictionary count
  def size
    @dictionaries.size
  end
  alias count size
  alias length size

  # Check if the repository is empty.
  #
  # @return [Boolean] True if empty
  def empty?
    @dictionaries.empty?
  end

  # Iterate over dictionaries.
  #
  # @yield [key, dictionary] Each key and dictionary
  # @return [Enumerator] Enumerator if no block given
  def each(&block)
    return enum_for(:each) unless block_given?

    @dictionaries.each(&block)
  end

  # Merge another repository into this one. Entries from +other+ replace
  # entries already stored under the same key.
  #
  # Keys of a plain Hash are normalized to Symbols first; without that,
  # an entry merged under "en" could never be reached through #get.
  #
  # @param other [Repository, Hash] The repository or hash to merge
  # @return [self] Self for chaining
  #
  # @example
  #   repo1.merge(repo2)
  def merge(other)
    dicts_to_merge = other.is_a?(Repository) ? other.dictionaries : other

    @dictionaries.merge!(symbolize_keys(dicts_to_merge))
    self
  end

  # Find dictionaries by language code, ignoring case.
  #
  # Both sides are converted with +to_s+ before comparing, so a Symbol
  # such as :"en-US" is accepted as well as a String.
  #
  # @param language_code [String, Symbol] The language code
  # @return [Array<Base>] Matching dictionaries
  #
  # @example
  #   repo.find_by_language("en-US")
  def find_by_language(language_code)
    wanted = language_code.to_s

    @dictionaries.values.select do |dict|
      dict.language_code.to_s.casecmp?(wanted)
    end
  end

  # Convert to hash.
  #
  # @return [Hash] A shallow copy of the key-to-dictionary mapping
  def to_h
    @dictionaries.dup
  end

  # String representation.
  #
  # @return [String] String representation
  def to_s
    "Repository(size: #{size})"
  end
  alias inspect to_s

  # Global repository instance, created on first use.
  #
  # @return [Repository] The global repository
  #
  # @example Using the global repository
  #   Repository.instance.register(:en_US, dict)
  def self.instance
    @instance ||= new
  end

  # Register a dictionary in the global repository.
  #
  # @param key [Symbol, String] The key
  # @param dictionary [Base] The dictionary
  # @return [Repository] The global repository
  #
  # @example
  #   Repository.register(:en_US, dict)
  def self.register(key, dictionary)
    instance.register(key, dictionary)
  end

  # Get a dictionary from the global repository.
  #
  # @param key [Symbol, String] The key
  # @return [Base, nil] The dictionary or nil
  #
  # @example
  #   Repository.get(:en_US)
  def self.get(key)
    instance.get(key)
  end

  # Unregister a dictionary from the global repository.
  #
  # @param key [Symbol, String] The key
  # @return [Base, nil] The removed dictionary or nil
  def self.unregister(key)
    instance.unregister(key)
  end

  # Clear the global repository.
  #
  # @return [Repository] The global repository
  def self.clear
    instance.clear
  end

  # Get all keys from the global repository.
  #
  # @return [Array<Symbol>] All keys
  def self.keys
    instance.keys
  end

  # Check if a key is registered in the global repository.
  #
  # @param key [Symbol, String] The key
  # @return [Boolean] True if the key exists
  def self.registered?(key)
    instance.registered?(key)
  end

  private

  # Copy a key-to-dictionary mapping into a new Hash with Symbol keys.
  # Keys that cannot be converted with +to_sym+ are kept unchanged.
  #
  # @param mapping [Hash, #each_with_object] Pairs of key and dictionary
  # @return [Hash] New hash; the argument is not modified
  def symbolize_keys(mapping)
    mapping.each_with_object({}) do |(key, dictionary), normalized|
      normalized[key.respond_to?(:to_sym) ? key.to_sym : key] = dictionary
    end
  end
end
|
|
247
|
+
end
|
|
248
|
+
end
|