kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "configuration"
|
|
4
|
+
require_relative "suggestions/generator"
|
|
5
|
+
require_relative "core/models/result/word_result"
|
|
6
|
+
require_relative "core/models/result/document_result"
|
|
7
|
+
|
|
8
|
+
module Kotoshu
|
|
9
|
+
# Main spellchecker class.
|
|
10
|
+
#
|
|
11
|
+
# This is the primary facade for spell checking operations,
|
|
12
|
+
# providing methods to check words, text, and files.
|
|
13
|
+
#
|
|
14
|
+
# @example Creating a spellchecker with a dictionary
|
|
15
|
+
# dict = Kotoshu::Dictionary::UnixWords.new("/usr/share/dict/words", language_code: "en-US")
|
|
16
|
+
# spellchecker = Spellchecker.new(dictionary: dict)
|
|
17
|
+
# spellchecker.correct?("hello") # => true
|
|
18
|
+
#
|
|
19
|
+
# @example Using configuration
|
|
20
|
+
# spellchecker = Spellchecker.new(
|
|
21
|
+
# dictionary_path: "/usr/share/dict/words",
|
|
22
|
+
# language: "en-US"
|
|
23
|
+
# )
|
|
24
|
+
class Spellchecker
|
|
25
|
+
# @return [Suggestions::Generator] The suggestion generator
|
|
26
|
+
attr_reader :generator
|
|
27
|
+
|
|
28
|
+
# @return [Configuration] The configuration
|
|
29
|
+
attr_reader :config
|
|
30
|
+
|
|
31
|
+
# @return [ResourceBundle, nil] The resource bundle if provided
|
|
32
|
+
attr_reader :resource_bundle
|
|
33
|
+
|
|
34
|
+
# Create a new spellchecker.
#
# Settings are resolved as follows:
# * a Configuration passed as +config+ is used as-is (+kwargs+ are then
#   not consulted);
# * otherwise a new Configuration is built from the +config+ Hash (when one
#   is given) overlaid with +kwargs+, so explicit keyword options win.
#
# When a +resource_bundle+ is given it supplies the dictionary and the
# +:language+ setting unless those were passed explicitly.
#
# @param dictionary [Dictionary::Base, nil] The dictionary (optional)
# @param config [Configuration, Hash, nil] Configuration object or settings hash
# @param resource_bundle [ResourceBundle, nil] Pre-resolved resource bundle
# @param kwargs [Hash] Additional configuration options
#
# @example With dictionary
#   spellchecker = Spellchecker.new(dictionary: dict)
#
# @example With resource bundle (0.2+)
#   bundle = Kotoshu::ResourceManager.resolve(language: "en")
#   spellchecker = Spellchecker.new(resource_bundle: bundle)
#   spellchecker.correct?("hello") # => true
#
# @example With configuration hash
#   spellchecker = Spellchecker.new(
#     dictionary_path: "/usr/share/dict/words",
#     language: "en-US"
#   )
#
# @example With Configuration object
#   config = Configuration.new(dictionary_path: "words.txt")
#   spellchecker = Spellchecker.new(config: config)
def initialize(dictionary: nil, config: nil, resource_bundle: nil, **kwargs)
  @resource_bundle = resource_bundle

  if resource_bundle
    dictionary ||= resource_bundle.dictionary
    kwargs[:language] = resource_bundle.language unless kwargs.key?(:language)
  end

  if config.is_a?(Configuration)
    @config = config
  else
    # A Hash given as `config:` provides the base settings; it used to be
    # dropped silently. Keyword options take precedence over it.
    settings = config.is_a?(Hash) ? config.merge(kwargs) : kwargs.dup
    settings[:dictionary_path] = dictionary.path if dictionary.respond_to?(:path)
    @config = Configuration.new(settings)
  end

  # An explicit (or bundle-provided) dictionary overrides the configured one.
  @config.dictionary = dictionary if dictionary

  @generator = Suggestions::Generator.new(
    @config.dictionary,
    max_suggestions: @config.max_suggestions,
    algorithms: @config.suggestion_algorithms
  )
end
|
|
85
|
+
|
|
86
|
+
# Tell whether a word is spelled correctly.
#
# Blank input (nil or an empty string) is never correct; any other word is
# handed to the suggestion generator, whose answer is returned unchanged.
#
# @param word [String, nil] The word to check
# @return [Boolean] True if the word is correct
#
# @example
#   spellchecker.correct?("hello") # => true
#   spellchecker.correct?("helo")  # => false
def correct?(word)
  blank = word.nil? || word.empty?
  blank ? false : @generator.correct?(word)
end
|
|
99
|
+
|
|
100
|
+
# Tell whether a word is misspelled.
#
# This is the exact negation of #correct?, so blank input counts as
# misspelled.
#
# @param word [String, nil] The word to check
# @return [Boolean] True if the word is misspelled
def incorrect?(word)
  spelled_right = correct?(word)
  !spelled_right
end
|
|
107
|
+
|
|
108
|
+
# Produce spelling suggestions for a word.
#
# Blank input yields an empty suggestion set; otherwise the request is
# forwarded to the suggestion generator.
#
# @param word [String, nil] The misspelled word
# @param max_suggestions [Integer, nil] Maximum suggestions; nil is passed
#   through to the generator unchanged
# @return [Suggestions::SuggestionSet] Generated suggestions
#
# @example
#   suggestions = spellchecker.suggest("helo")
#   suggestions.to_words # => ["hello", "help", "held", ...]
def suggest(word, max_suggestions: nil)
  if word.nil? || word.empty?
    Suggestions::SuggestionSet.empty
  else
    @generator.generate(word, max_suggestions: max_suggestions)
  end
end
|
|
122
|
+
|
|
123
|
+
# Check a single word and wrap the outcome in a result object.
#
# * Blank input (nil or "") produces an incorrect result for the empty
#   string with an empty suggestion set.
# * A correctly spelled word produces a "correct" result.
# * Any other word produces an "incorrect" result carrying the suggestions
#   returned by #suggest.
#
# @param word [String, nil] The word to check
# @return [Models::Result::WordResult] The check result
#
# @example
#   result = spellchecker.check_word("hello")
#   result.correct? # => true
#
# @example With misspelled word
#   result = spellchecker.check_word("helo")
#   result.correct?    # => false
#   result.suggestions # => SuggestionSet with suggestions
def check_word(word)
  result_class = Models::Result::WordResult

  if word.nil? || word.empty?
    result_class.new("", correct: false, suggestions: Suggestions::SuggestionSet.empty)
  elsif correct?(word)
    result_class.correct(word)
  else
    result_class.incorrect(word, suggestions: suggest(word))
  end
end
|
|
149
|
+
|
|
150
|
+
# Check text for spelling errors.
#
# The text is split with #tokenize and every token is run through
# #check_word. Each misspelled token is reported as a WordResult that
# carries the token's character position in +text+.
#
# @param text [String, nil] The text to check
# @return [Models::Result::DocumentResult] The check result; nil or empty
#   text yields DocumentResult.success
#
# @example
#   result = spellchecker.check("Hello wrold")
#   result.success?             # => false
#   result.errors.map(&:word)   # => ["wrold"]
def check(text)
  return Models::Result::DocumentResult.success if text.nil? || text.empty?

  words = tokenize(text)

  errors = words.each_with_object([]) do |(word, pos), found|
    result = check_word(word)
    next unless result.incorrect?

    # Rebuild the result so that it also records where the word occurred.
    found << Models::Result::WordResult.new(
      word,
      correct: false,
      suggestions: result.suggestions,
      position: pos
    )
  end

  Models::Result::DocumentResult.new(
    file: nil,
    errors: errors,
    word_count: words.size
  )
end
|
|
188
|
+
|
|
189
|
+
# Check a file for spelling errors.
#
# The file is read using the configured encoding, checked with #check, and
# the outcome is re-wrapped in a DocumentResult that records the file path.
#
# @param path [String] The file path
# @return [Models::Result::DocumentResult] The check result
# @raise [DictionaryNotFoundError] if nothing exists at +path+ (note that
#   this error class is used even though +path+ is a document, not a
#   dictionary)
#
# @example
#   result = spellchecker.check_file("README.md")
#   result.to_s # => "File 'README.md': 3 spelling error(s) found"
def check_file(path)
  unless File.exist?(path)
    raise DictionaryNotFoundError, path
  end

  contents = File.read(path, encoding: @config.encoding)
  text_result = check(contents)

  # Same errors and word count as the text check, now tagged with the path.
  Models::Result::DocumentResult.new(
    file: path,
    errors: text_result.errors,
    word_count: text_result.word_count
  )
end
|
|
210
|
+
|
|
211
|
+
# Check the files of a directory for spelling errors.
#
# The pattern is evaluated relative to +path+ (via Dir.glob's +base:+
# option), so glob metacharacters that happen to be part of the directory
# name itself ("[", "{", "*", ...) are taken literally. Entries that match
# the pattern but are not regular files (e.g. a sub-directory named
# "notes.txt") are skipped instead of being passed to #check_file.
#
# @param path [String] The directory path
# @param pattern [String] File pattern to match (default: "*.txt")
# @return [Array<Models::Result::DocumentResult>] Results for each file
# @raise [DictionaryNotFoundError] if +path+ is not an existing directory
#
# @example
#   results = spellchecker.check_directory("docs/")
#   results.select(&:failed?).map(&:file)
def check_directory(path, pattern: "*.txt")
  # File.directory? is false for missing paths, so no separate exist? check.
  raise DictionaryNotFoundError, path unless File.directory?(path)

  Dir.glob(pattern, base: path)
     .map { |relative| File.join(path, relative) }
     .select { |file| File.file?(file) }
     .map { |file| check_file(file) }
end
|
|
226
|
+
|
|
227
|
+
# Tokenize text into words.
#
# A word is a maximal run of characters accepted by #word_char?; everything
# else acts as a separator. Positions are zero-based character indices of
# the first character of each word within +text+.
#
# @param text [String, nil] The text to tokenize
# @return [Array<Array>] Array of [word, position] pairs; the word strings
#   are frozen
#
# @example
#   spellchecker.tokenize("Hello world!")
#   # => [["Hello", 0], ["world", 6]]
def tokenize(text)
  return [] if text.nil? || text.empty?

  tokens = []
  buffer = String.new
  start_index = 0

  text.each_char.with_index do |char, index|
    if word_char?(char)
      # An empty buffer means this character opens a new word.
      start_index = index if buffer.empty?
      buffer << char
    elsif !buffer.empty?
      tokens << [buffer.dup.freeze, start_index]
      buffer.clear
    end
  end

  # Flush the word that runs up to the end of the text, if any.
  tokens << [buffer.dup.freeze, start_index] unless buffer.empty?

  tokens
end
|
|
259
|
+
|
|
260
|
+
# Return the dictionary currently in use.
#
# The value is read from the suggestion generator, so it reflects the
# dictionary the generator was built with.
#
# @return [Dictionary::Base] The dictionary
def dictionary
  active_generator = @generator
  active_generator.dictionary
end
|
|
266
|
+
|
|
267
|
+
# Reload the dictionary.
#
# Asks the configuration to reset its dictionary, then builds a fresh
# suggestion generator from the configuration's current dictionary,
# suggestion limit and algorithms.
#
# @return [self] Self for chaining
def reload_dictionary
  @config.reset_dictionary

  fresh_dictionary = @config.dictionary
  limit = @config.max_suggestions
  algorithms = @config.suggestion_algorithms

  @generator = Suggestions::Generator.new(fresh_dictionary, max_suggestions: limit, algorithms: algorithms)

  self
end
|
|
282
|
+
|
|
283
|
+
private
|
|
284
|
+
|
|
285
|
+
# Check if a character is part of a word.
#
# A word character is a letter in any script, a combining mark (so that
# decomposed accents stay attached to their base letter), or an ASCII
# apostrophe. Digits, whitespace and punctuation are not word characters.
#
# ASCII characters take a fast path. Other characters are transcoded to
# UTF-8 before matching so that text read in a legacy encoding
# (e.g. ISO-8859-1) is classified correctly; characters that are invalid
# in their encoding, or cannot be transcoded, are treated as separators.
#
# @param char [String] A single character
# @return [Boolean] True if it's a word character
def word_char?(char)
  return char.match?(/\A[A-Za-z']\z/) if char.ascii_only?
  return false unless char.valid_encoding?

  char.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "")
      .match?(/\A[\p{L}\p{M}]\z/)
rescue EncodingError
  # No converter for this encoding: treat the character as a separator.
  false
end
|
|
297
|
+
end
|
|
298
|
+
end
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
# String similarity metrics for spell checking.
|
|
5
|
+
#
|
|
6
|
+
# Ported from Spylls (Python) string_metrics.py
|
|
7
|
+
#
|
|
8
|
+
# These metrics are used for:
|
|
9
|
+
# - Computing word similarity
|
|
10
|
+
# - Ranking suggestions
|
|
11
|
+
# - N-gram based scoring
|
|
12
|
+
module StringMetrics
|
|
13
|
+
# Count the positions at which both strings hold the same character.
#
# Only positions present in both strings are compared, i.e. the comparison
# stops at the end of the shorter string.
#
# @param s1 [String, nil] First string
# @param s2 [String, nil] Second string
# @return [Integer] Count of matching characters at same positions
#   (0 if either argument is nil)
#
# @example
#   Kotoshu::StringMetrics.commoncharacters("hello", "hallo") # => 4 ('h', 'l', 'l', 'o' match)
def self.commoncharacters(s1, s2)
  return 0 if s1.nil? || s2.nil?

  # zip pads with nil when s2 is shorter; a real character never equals nil.
  s1.each_char.zip(s2.each_char).count { |left, right| left == right }
end
|
|
29
|
+
|
|
30
|
+
# Length of the prefix shared by two strings.
#
# @param s1 [String, nil] First string
# @param s2 [String, nil] Second string
# @return [Integer] Length of common prefix (0 if either argument is nil)
#
# @example
#   Kotoshu::StringMetrics.leftcommonsubstring("foo", "bar")     # => 0
#   Kotoshu::StringMetrics.leftcommonsubstring("built", "build") # => 4
#   Kotoshu::StringMetrics.leftcommonsubstring("cat", "cats")    # => 3
def self.leftcommonsubstring(s1, s2)
  return 0 if s1.nil? || s2.nil?

  # Pair characters position by position and keep pairs while they agree;
  # a missing partner (nil) ends the run just like a differing character.
  s1.each_char.zip(s2.each_char).take_while { |left, right| left == right }.length
end
|
|
51
|
+
|
|
52
|
+
# Calculate n-gram similarity between two strings.
#
# For every n-gram size from 1 up to +max_ngram_size+, counts how many
# n-grams of +s1+ occur anywhere in +s2+ (the higher the total, the more
# similar the words). Unless +weighted+ is set, the scan stops after the
# first size that yields fewer than two matches; that size's matches are
# still included in the score.
#
# @param max_ngram_size [Integer] Maximum n-gram size to check
# @param s1 [String, nil] String whose n-grams are looked up
# @param s2 [String, nil] String the n-grams are searched in
# @param weighted [Boolean] Subtract 1 for every n-gram NOT contained in
#   +s2+, and 1 more if that n-gram sits at the start or end of +s1+
# @param any_mismatch [Boolean] Penalize any length difference
#   (penalty is |l2 - l1| - 2, applied only when positive)
# @param longer_worse [Boolean] Penalize a longer second string
#   (penalty is (l2 - l1) - 2, applied only when positive); takes
#   precedence over +any_mismatch+
# @return [Integer] N-gram similarity score (higher is more similar);
#   0 when either string is nil or +s2+ is empty
#
# @example
#   Kotoshu::StringMetrics.ngram(4, "hello", "help")         # => 7 (4 + 2 + 1)
#   Kotoshu::StringMetrics.ngram(4, "teachings", "teaching") # => 26
def self.ngram(max_ngram_size, s1, s2, weighted: false, any_mismatch: false, longer_worse: false)
  # Same nil contract as the other metrics in this module.
  return 0 if s1.nil? || s2.nil?

  l2 = s2.length
  return 0 if l2.zero?

  l1 = s1.length
  nscore = 0

  # For all sizes of ngram up to desired...
  (1..max_ngram_size).each do |ngram_size|
    ns = 0

    # Check every position in the first string. When s1 is shorter than
    # ngram_size the range is empty and nothing is counted.
    (0..(l1 - ngram_size)).each do |pos|
      # If the ngram is present in ANY place in second string, increase score
      if s2.include?(s1[pos, ngram_size])
        ns += 1
      elsif weighted
        # For "weighted" ngrams, decrease score if ngram is not found
        ns -= 1
        # Decrease once more if it was the beginning or end of first string
        ns -= 1 if pos.zero? || pos + ngram_size == l1
      end
    end

    nscore += ns

    # There is no need to check for 4-gram if there were only one 3-gram
    break if ns < 2 && !weighted
  end

  penalty = if longer_worse
              (l2 - l1) - 2
            elsif any_mismatch
              (l2 - l1).abs - 2
            else
              0
            end

  # Only a positive penalty lowers the score; it never raises it.
  penalty.positive? ? nscore - penalty : nscore
end
|
|
114
|
+
|
|
115
|
+
# Calculate LCS (Longest Common Subsequence) length.
#
# Classic dynamic programming algorithm. A subsequence, unlike a
# substring, does not have to be contiguous. Only the previous and the
# current row of the DP table are kept.
#
# @param s1 [String, nil] First string
# @param s2 [String, nil] Second string
# @return [Integer] Length of longest common subsequence (0 if either
#   string is nil or empty)
#
# @example
#   Kotoshu::StringMetrics.lcslen("AGGTAB", "GXTXAYB") # => 4 ("GTAB")
def self.lcslen(s1, s2)
  return 0 if s1.nil? || s2.nil? || s1.empty? || s2.empty?

  column_chars = s2.chars
  # previous[j] holds the LCS length of the processed part of s1 against
  # the first j characters of s2.
  previous = Array.new(column_chars.size + 1, 0)

  s1.each_char do |row_char|
    current = [0]
    column_chars.each_with_index do |column_char, j|
      current << if row_char == column_char
                   previous[j] + 1 # characters match - extend diagonal
                 else
                   [previous[j + 1], current[j]].max # best of top / left
                 end
    end
    previous = current
  end

  previous.last
end
|
|
152
|
+
end
|
|
153
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Suggestions
|
|
5
|
+
# Context object passed to suggestion strategies.
#
# Bundles everything a strategy needs for one suggestion request: the word
# being corrected, the dictionary to consult, the maximum number of results,
# and any extra keyword options supplied by the caller.
class Context
  attr_reader :word, :dictionary, :max_results, :options

  # @param word [String] The word to generate suggestions for
  # @param dictionary [Object] The dictionary to consult
  # @param max_results [Integer] Maximum number of results (default: 10)
  # @param options [Hash] Any further keyword options, kept as-is
  def initialize(word:, dictionary:, max_results: 10, **options)
    @word, @dictionary, @max_results, @options = word, dictionary, max_results, options
  end

  # Look up an option.
  #
  # @param key [Symbol] The option key
  # @param default [Object] Value returned when the key is absent
  # @return [Object] The stored value, or +default+ if the key is absent
  def option(key, default = nil)
    @options.fetch(key) { default }
  end

  # Tell whether an option was supplied (even with a nil/false value).
  #
  # @param key [Symbol] The option key
  # @return [Boolean] True if option exists
  def has_option?(key)
    @options.include?(key)
  end

  # Hash representation of the context.
  #
  # @return [Hash] Hash with :word, :dictionary, :max_results and :options
  def to_h
    { word: @word, dictionary: @dictionary, max_results: @max_results, options: @options }
  end

  # Short human-readable description (also used for #to_s).
  #
  # @return [String] Inspection string
  def inspect
    format("Context(word: '%s', max_results: %s)", @word, @max_results)
  end
  alias_method :to_s, :inspect
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "context"
|
|
4
|
+
require_relative "suggestion_set"
|
|
5
|
+
require_relative "strategies/base_strategy"
|
|
6
|
+
require_relative "strategies/composite_strategy"
|
|
7
|
+
require_relative "strategies/edit_distance_strategy"
|
|
8
|
+
require_relative "strategies/phonetic_strategy"
|
|
9
|
+
require_relative "strategies/keyboard_proximity_strategy"
|
|
10
|
+
require_relative "strategies/ngram_strategy"
|
|
11
|
+
require_relative "strategies/semantic_strategy"
|
|
12
|
+
|
|
13
|
+
module Kotoshu
|
|
14
|
+
module Suggestions
|
|
15
|
+
# Generator for spelling suggestions.
|
|
16
|
+
#
|
|
17
|
+
# This class orchestrates multiple suggestion algorithms to generate
|
|
18
|
+
# comprehensive spelling suggestions.
|
|
19
|
+
#
|
|
20
|
+
# @example Using default algorithms
|
|
21
|
+
# generator = Generator.new(dictionary)
|
|
22
|
+
# suggestions = generator.generate("helo")
|
|
23
|
+
#
|
|
24
|
+
# @example Using custom algorithms
|
|
25
|
+
# custom_strategy = MyStrategy.new
|
|
26
|
+
# generator = Generator.new(dictionary, algorithms: [custom_strategy])
|
|
27
|
+
class Generator
|
|
28
|
+
# Default suggestion algorithms.
|
|
29
|
+
DEFAULT_ALGORITHMS = [
|
|
30
|
+
Strategies::EditDistanceStrategy,
|
|
31
|
+
Strategies::PhoneticStrategy,
|
|
32
|
+
Strategies::KeyboardProximityStrategy,
|
|
33
|
+
Strategies::NgramStrategy
|
|
34
|
+
].freeze
|
|
35
|
+
|
|
36
|
+
# @return [Object] The dictionary (any dictionary backend)
|
|
37
|
+
attr_reader :dictionary
|
|
38
|
+
|
|
39
|
+
# @return [Strategies::CompositeStrategy] The composite strategy
|
|
40
|
+
attr_reader :strategy
|
|
41
|
+
|
|
42
|
+
# Create a new suggestion generator.
#
# The strategy is assembled by build_strategy from the given algorithms
# (or DEFAULT_ALGORITHMS when none are given) together with any extra
# keyword options.
#
# @param dictionary [Object] The dictionary instance
# @param algorithms [Array<Class, Strategies::BaseStrategy>, nil] Algorithm classes or instances
# @param max_suggestions [Integer] Maximum suggestions to return
# @param config [Hash] Configuration options handed to build_strategy
def initialize(dictionary, algorithms: nil, max_suggestions: 10, **config)
  @dictionary = dictionary
  @max_suggestions = max_suggestions
  @strategy = build_strategy(algorithms || DEFAULT_ALGORITHMS, config)
end
|
|
55
|
+
|
|
56
|
+
# Generate suggestions for a word.
|
|
57
|
+
#
|
|
58
|
+
# @param word [String] The misspelled word
|
|
59
|
+
# @param max_suggestions [Integer] Maximum suggestions (optional)
|
|
60
|
+
# @return [SuggestionSet] Generated suggestions
|
|
61
|
+
#
|
|
62
|
+
# @example
|
|
63
|
+
# generator.generate("helo")
|
|
64
|
+
# # => #<Kotoshu::Suggestions::SuggestionSet ...>
|
|
65
|
+
def generate(word, max_suggestions: nil)
|
|
66
|
+
return SuggestionSet.empty if word.nil? || word.empty?
|
|
67
|
+
|
|
68
|
+
context = Context.new(
|
|
69
|
+
word: word,
|
|
70
|
+
dictionary: @dictionary,
|
|
71
|
+
max_results: max_suggestions || @max_suggestions
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
@strategy.generate(context)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
# Alias for generate for API consistency.
|
|
78
|
+
#
|
|
79
|
+
# @param word [String] The misspelled word
|
|
80
|
+
# @param max_suggestions [Integer] Maximum suggestions (optional)
|
|
81
|
+
# @return [SuggestionSet] Generated suggestions
|
|
82
|
+
#
|
|
83
|
+
# @example
|
|
84
|
+
# generator.suggest("helo")
|
|
85
|
+
# # => #<Kotoshu::Suggestions::SuggestionSet ...>
|
|
86
|
+
alias suggest generate
|
|
87
|
+
|
|
88
|
+
# Check if a word is correct.
|
|
89
|
+
#
|
|
90
|
+
# @param word [String] The word to check
|
|
91
|
+
# @return [Boolean] True if the word is in the dictionary
|
|
92
|
+
#
|
|
93
|
+
# @example
|
|
94
|
+
# generator.correct?("hello") # => true
|
|
95
|
+
# generator.correct?("helo") # => false
|
|
96
|
+
def correct?(word)
|
|
97
|
+
return false if word.nil? || word.empty?
|
|
98
|
+
|
|
99
|
+
dictionary_lookup(word)
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
# Check if a word is incorrect.
|
|
103
|
+
#
|
|
104
|
+
# @param word [String] The word to check
|
|
105
|
+
# @return [Boolean] True if the word is not in the dictionary
|
|
106
|
+
def incorrect?(word)
|
|
107
|
+
!correct?(word)
|
|
108
|
+
end
|
|
109
|
+
alias misspelled? incorrect?
|
|
110
|
+
|
|
111
|
+
# Get the default algorithms.
|
|
112
|
+
#
|
|
113
|
+
# @return [Array<Class>] Default algorithm classes
|
|
114
|
+
#
|
|
115
|
+
# @example
|
|
116
|
+
# Generator.default_algorithms
|
|
117
|
+
def self.default_algorithms
|
|
118
|
+
DEFAULT_ALGORITHMS.dup
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# Set the default algorithms.
|
|
122
|
+
#
|
|
123
|
+
# @param algorithms [Array<Class>] Algorithm classes
|
|
124
|
+
#
|
|
125
|
+
# @example
|
|
126
|
+
# Generator.default_algorithms = [MyCustomStrategy]
|
|
127
|
+
class << self
|
|
128
|
+
attr_writer :default_algorithms
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
private
|
|
132
|
+
|
|
133
|
+
# Build the composite strategy from algorithm classes.
|
|
134
|
+
#
|
|
135
|
+
# @param algorithms [Array<Class, Strategies::BaseStrategy>] Algorithm classes or instances
|
|
136
|
+
# @param config [Hash] Configuration options
|
|
137
|
+
# @return [Strategies::CompositeStrategy] The composite strategy
|
|
138
|
+
def build_strategy(algorithms, config)
|
|
139
|
+
composite = Strategies::CompositeStrategy.new(name: :default, **config)
|
|
140
|
+
|
|
141
|
+
algorithms.each do |alg|
|
|
142
|
+
strategy = if alg.is_a?(Strategies::BaseStrategy)
|
|
143
|
+
alg
|
|
144
|
+
elsif alg.is_a?(Class) && alg < Strategies::BaseStrategy
|
|
145
|
+
alg.new(**config)
|
|
146
|
+
else
|
|
147
|
+
raise ArgumentError, "Invalid algorithm: #{alg.inspect}"
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
composite.add(strategy)
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
composite
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
# Look up a word in the dictionary.
|
|
157
|
+
#
|
|
158
|
+
# @param word [String] The word
|
|
159
|
+
# @return [Boolean] True if found
|
|
160
|
+
def dictionary_lookup(word)
|
|
161
|
+
if @dictionary.respond_to?(:lookup)
|
|
162
|
+
@dictionary.lookup(word)
|
|
163
|
+
elsif @dictionary.respond_to?(:include?)
|
|
164
|
+
@dictionary.include?(word)
|
|
165
|
+
elsif @dictionary.is_a?(Hash)
|
|
166
|
+
@dictionary.key?(word)
|
|
167
|
+
elsif @dictionary.is_a?(Array)
|
|
168
|
+
@dictionary.include?(word)
|
|
169
|
+
else
|
|
170
|
+
false
|
|
171
|
+
end
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
end
|
|
175
|
+
end
|