kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,575 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Algorithms
|
|
5
|
+
# Main suggestion orchestration for spell checking.
|
|
6
|
+
#
|
|
7
|
+
# Ported from Spylls (Python) suggest.py
|
|
8
|
+
#
|
|
9
|
+
# On a bird's-eye view level, suggest does:
|
|
10
|
+
# 1. Tries small word "edits" (remove letters, insert letters, swap letters)
|
|
11
|
+
# and checks (with the help of Lookup) if there are any valid ones
|
|
12
|
+
# 2. If no good suggestions found, tries "ngram-based" suggestions
|
|
13
|
+
# (calculating ngram-based distance to all dictionary words)
|
|
14
|
+
# 3. If possible, tries metaphone-based suggestions (phonetic)
|
|
15
|
+
#
|
|
16
|
+
# Note: Spylls's implementation takes one liberty vs Hunspell:
|
|
17
|
+
# In Hunspell, ngram suggestions and phonetic suggestions are done in the
|
|
18
|
+
# same cycle. Spylls does them in two separate cycles for clarity.
|
|
19
|
+
#
|
|
20
|
+
# To follow algorithm details, see the Suggester#suggestions method.
|
|
21
|
+
module Suggest
|
|
22
|
+
MAXPHONSUGS = 2
|
|
23
|
+
MAXSUGGESTIONS = 15
|
|
24
|
+
GOOD_EDITS = %w[spaceword uppercase replchars].freeze
|
|
25
|
+
|
|
26
|
+
# A single candidate correction for a misspelled word.
#
# Instances are built internally while suggestions are generated. Each one
# pairs the proposed text with the name of the edit that produced it, so the
# caller can judge how trustworthy the candidate is.
class Suggestion
  # text: the proposed replacement text.
  # kind: how the suggestion was produced (same as the edit method name).
  attr_reader :text, :kind

  # @param text [String] Proposed replacement text
  # @param kind [String] Name of the edit that produced the suggestion
  def initialize(text, kind)
    @text = text
    @kind = kind
  end

  # Build a new suggestion with some attributes overridden.
  #
  # Only the :text and :kind keys are consulted; any other key is ignored.
  #
  # @param changes [Hash] Attributes to override
  # @return [Suggestion] Fresh instance; the receiver is left untouched
  def replace(**changes)
    merged = { text: @text, kind: @kind }.merge(changes)
    self.class.new(merged[:text], merged[:kind])
  end

  # @return [String] The suggestion text itself
  def to_s
    text
  end

  # @return [String] Debug representation, e.g. Suggestion[badchar]("spell")
  def inspect
    quoted = @text.inspect
    "Suggestion[#{@kind}](#{quoted})"
  end
end
|
|
67
|
+
|
|
68
|
+
# A suggestion to split the misspelled word into several dictionary words.
#
# It keeps the individual words so that each can be validated separately
# before they are joined back into a plain Suggestion.
class MultiWordSuggestion
  # words:      the individual words making up the suggestion.
  # source:     how the suggestion was produced (same meaning as Suggestion#kind).
  # allow_dash: whether the words may also be joined with a dash.
  attr_reader :words, :source, :allow_dash

  # @param words [Array<String>] Words making up the suggestion
  # @param source [String] Name of the edit that produced the suggestion
  # @param allow_dash [Boolean] Whether joining by dash is acceptable
  def initialize(words, source, allow_dash: true)
    @allow_dash = allow_dash
    @source = source
    @words = words
  end

  # Collapse the words into a single-string suggestion.
  #
  # @param separator [String] String placed between the words
  # @return [Suggestion] Suggestion whose kind is this object's source
  def stringify(separator = ' ')
    joined = @words.join(separator)
    Suggestion.new(joined, @source)
  end

  # @return [String] Debug representation listing the separate words
  def inspect
    listed = @words.inspect
    "Suggestion[#{@source}](#{listed})"
  end
end
|
|
103
|
+
|
|
104
|
+
# Main suggestion class.
|
|
105
|
+
#
|
|
106
|
+
# Typically, you would not use this directly, but you might want to for
|
|
107
|
+
# experiments.
|
|
108
|
+
#
|
|
109
|
+
# Example:
|
|
110
|
+
# dictionary = Kotoshu::Dictionary.load('en_US')
|
|
111
|
+
# suggester = dictionary.suggester
|
|
112
|
+
#
|
|
113
|
+
# suggester.suggestions('spylls') do |suggestion|
|
|
114
|
+
# puts suggestion
|
|
115
|
+
# end
|
|
116
|
+
#
|
|
117
|
+
# # Output:
|
|
118
|
+
# # Suggestion[badchar](spell)
|
|
119
|
+
# # Suggestion[badchar](spill)
|
|
120
|
+
class Suggester
|
|
121
|
+
# @return [Object] Aff data structure (from aff file)
|
|
122
|
+
attr_reader :aff
|
|
123
|
+
|
|
124
|
+
# @return [Object] Dic data structure (from dic file)
|
|
125
|
+
attr_reader :dic
|
|
126
|
+
|
|
127
|
+
# @return [Object] Lookup object
|
|
128
|
+
attr_reader :lookup
|
|
129
|
+
|
|
130
|
+
# Store the collaborators and precompute the word list that is handed to
# the ngram and phonetic suggestion passes.
#
# @param aff [Object] Aff data, read with [] using directive symbols
# @param dic [Object] Dic data; dic[:words] is the list of entries
# @param lookup [Object] Lookup object used to validate candidates
def initialize(aff, dic, lookup)
  @aff = aff
  @dic = dic
  @lookup = lookup

  # Flags that disqualify an entry from dictionary-wide suggestions.
  # Directives that are not set (nil) are dropped from the list.
  excluded_flags = %i[FORBIDDENWORD NOSUGGEST ONLYINCOMPOUND]
                   .map { |directive| @aff[directive] }
                   .compact

  # Keep only entries carrying none of the excluded flags; an entry
  # without flags is treated as having an empty flag list.
  @words_for_ngram = @dic[:words].reject do |entry|
    entry_flags = entry[:flags] || []
    !(entry_flags & excluded_flags).empty?
  end
end
|
|
147
|
+
|
|
148
|
+
# Public entry point: yields every suggestion for +word+ as a plain string.
#
# Without a block an Enumerator over the same strings is returned, so
# callers can consume suggestions lazily.
#
# @param word [String] Word to find suggestions for
# @yield [String] Text of each suggestion
# @return [Enumerator<String>] When no block is given
def call(word)
  unless block_given?
    return enum_for(:call, word)
  end

  suggestions(word) { |found| yield found.text }
end
|
|
161
|
+
|
|
162
|
+
# Main suggestion search loop.
#
# In order, it:
# 1. Asks @aff[:casing] for the capitalization type of +word+ and the
#    case variants worth trying.
# 2. For each variant, generates edits via #edit_suggestions (plain words
#    first, then compounds) and passes every hit through #handle_found.
# 3. Stops early when a "good" edit (see GOOD_EDITS) was found.
# 4. Otherwise tries to repair a single misspelled chunk of a dashed word.
# 5. Yields at most @aff[:MAXNGRAMSUGS] ngram-based suggestions.
# 6. Yields at most MAXPHONSUGS phonetic suggestions when @aff[:PHONE] is set.
#
# @param word [String] Word to find suggestions for
# @yield [Suggestion] Each accepted suggestion, in priority order
# @return [Enumerator] When no block is given
def suggestions(word)
  return enum_for(:suggestions, word) unless block_given?

  # Final texts already yielded; shared with #handle_found for de-duplication.
  handled = Set.new

  # True when the exact word has at least one good form. good_forms is
  # queried directly (rather than @lookup.call) to avoid ICONV and
  # dash-breaking.
  is_good_suggestion = lambda do |w|
    @lookup.good_forms(w, capitalization: false, allow_nosuggest: false).any?
  end

  # True when the dictionary marks the word with the FORBIDDENWORD flag.
  is_forbidden = lambda do |w|
    next false unless @aff[:FORBIDDENWORD]

    @dic[:has_flag]&.call(w, @aff[:FORBIDDENWORD]) || false
  end

  captype, variants = @aff[:casing].corrections(word)

  # FORCEUCASE with an all-lowercase word: the first valid capitalized
  # form is the only suggestion.
  if @aff[:FORCEUCASE] && captype == Capitalization::Type::NO
    @aff[:casing].capitalize(word).each do |capitalized|
      if is_good_suggestion.call(capitalized)
        yield Suggestion.new(capitalized.capitalize, 'forceucase')
        return
      end
    end
  end

  good_edits_found = false

  variants.each_with_index do |variant, idx|
    # Variants after the first differ from the input; a valid one is
    # itself a suggestion.
    if idx.positive? && is_good_suggestion.call(variant)
      handle_found(
        Suggestion.new(variant, 'case'),
        captype: captype,
        is_forbidden: is_forbidden,
        handled: handled
      ) do |suggestion|
        yield suggestion
      end
    end

    # Non-compound edits first.
    nocompound = false

    edit_suggestions(variant, compounds: false, limit: MAXSUGGESTIONS) do |suggestion|
      handle_found(
        suggestion,
        captype: captype,
        is_forbidden: is_forbidden,
        handled: handled,
        check_inclusion: false
      ) do |handled_suggestion|
        yield handled_suggestion

        kind = handled_suggestion.kind
        good_edits_found = true if GOOD_EDITS.include?(kind)
        nocompound = true if %w[uppercase replchars mapchars].include?(kind)

        # A space-separated phrase that is in the dictionary as a whole
        # is the only suggestion needed.
        return if kind == 'spaceword'
      end
    end

    next if nocompound

    # Compound edits, unless one of the edits above ruled them out.
    limit = @aff[:MAXCPDSUGS] || MAXSUGGESTIONS
    edit_suggestions(variant, compounds: true, limit: limit) do |suggestion|
      handle_found(
        suggestion,
        captype: captype,
        is_forbidden: is_forbidden,
        handled: handled,
        check_inclusion: false
      ) do |handled_suggestion|
        yield handled_suggestion
        good_edits_found = true if GOOD_EDITS.include?(handled_suggestion.kind)
      end
    end
  end

  # Good edits make the slower ngram/phonetic passes unnecessary.
  return if good_edits_found

  # Dashed words: repair the first misspelled chunk only.
  if word.include?('-') && handled.none? { |s| s.include?('-') }
    chunks = word.split('-')
    chunks.each_with_index do |chunk, idx|
      next if is_good_suggestion.call(chunk)

      call(chunk).each do |sug|
        candidate = chunks[0...idx] + [sug] + chunks[(idx + 1)..]
        candidate_str = candidate.join('-')

        # Keep the replacement only if the whole dashed word is valid.
        if @lookup.call(candidate_str, capitalization: true, allow_nosuggest: true)
          yield Suggestion.new(candidate_str, 'dashes')
        end
      end

      # Only one misspelled chunk is tried.
      break
    end
  end

  # Ngram-based suggestions.
  max_ngrams = @aff[:MAXNGRAMSUGS]
  if max_ngrams&.positive?
    ngrams_seen = 0
    ngram_suggestions(word, handled: handled) do |sug|
      handle_found(
        Suggestion.new(sug, 'ngram'),
        captype: captype,
        is_forbidden: is_forbidden,
        handled: handled,
        check_inclusion: true
      ) do |suggestion|
        yield suggestion
        ngrams_seen += 1
      end

      # The break must sit in the outer block: inside the handle_found
      # block it would only end that call and the loop would continue.
      break if ngrams_seen >= max_ngrams
    end
  end

  # Phonetic suggestions.
  if @aff[:PHONE]
    phonet_seen = 0
    phonet_suggestions(word) do |sug|
      handle_found(
        Suggestion.new(sug, 'phonet'),
        captype: captype,
        is_forbidden: is_forbidden,
        handled: handled,
        check_inclusion: true
      ) do |suggestion|
        yield suggestion
        phonet_seen += 1
      end

      # Same placement rule as for the ngram limit above.
      break if phonet_seen >= MAXPHONSUGS
    end
  end
end
|
|
325
|
+
|
|
326
|
+
# Produce every candidate edit of +word+, unvalidated, in priority order.
#
# The order of the sections below is the order in which suggestions reach
# the user, so it must not be rearranged.
#
# @param word [String] Word to mutate
# @yield [Suggestion, MultiWordSuggestion] Each candidate edit
def edits(word)
  # html -> HTML
  yield Suggestion.new(@aff[:casing].upper(word), 'uppercase')

  # REP table; an Array result is a multi-word replacement and is offered
  # both joined by a space and as separate words.
  Permutations.replchars(word, @aff[:REP] || []) do |replaced|
    case replaced
    when Array
      yield Suggestion.new(replaced.join(' '), 'replchars')
      yield MultiWordSuggestion.new(replaced, 'replchars', allow_dash: false)
    else
      yield Suggestion.new(replaced, 'replchars')
    end
  end

  # Two-word phrase to be checked against the dictionary as a whole.
  Permutations.twowords(word) do |pair|
    yield Suggestion.new(pair.join(' '), 'spaceword')
    yield Suggestion.new(pair.join('-'), 'spaceword') if use_dash?
  end

  # MAP table: related characters.
  Permutations.mapchars(word, @aff[:MAP] || []) { |mapped| yield Suggestion.new(mapped, 'mapchars') }

  # Adjacent swap, then longer-distance swaps.
  Permutations.swapchar(word) { |swapped| yield Suggestion.new(swapped, 'swapchar') }
  Permutations.longswapchar(word) { |swapped| yield Suggestion.new(swapped, 'longswapchar') }

  # Keyboard neighbours taken from the KEY directive.
  Permutations.badcharkey(word, @aff[:KEY] || '') { |typed| yield Suggestion.new(typed, 'badcharkey') }

  # One character removed.
  Permutations.extrachar(word) { |shorter| yield Suggestion.new(shorter, 'extrachar') }

  # One character inserted, drawn from the TRY directive.
  alphabet = @aff[:TRY] || ''
  Permutations.forgotchar(word, alphabet) { |longer| yield Suggestion.new(longer, 'forgotchar') }

  # One character moved forward or backward.
  Permutations.movechar(word) { |moved| yield Suggestion.new(moved, 'movechar') }

  # One character replaced, again drawn from TRY.
  Permutations.badchar(word, alphabet) { |fixed| yield Suggestion.new(fixed, 'badchar') }

  # Two-character doubling.
  Permutations.doubletwochars(word) { |undoubled| yield Suggestion.new(undoubled, 'doubletwochars') }

  # Finally, split into two independently validated words, unless the
  # dictionary disables that with NOSPLITSUGS.
  return if @aff[:NOSPLITSUGS]

  Permutations.twowords(word) do |pair|
    yield MultiWordSuggestion.new(pair, 'twowords', allow_dash: use_dash?)
  end
end
|
|
409
|
+
|
|
410
|
+
# Run #edits for +word+ and pass on only the candidates that
# #filter_suggestion accepts.
#
# The count is checked before each candidate with a strict "greater than",
# so iteration stops once more than +limit+ suggestions have been yielded
# (i.e. up to limit + 1 suggestions can be produced).
#
# @param word [String] Word to generate edits for
# @param compounds [Boolean] Forwarded to #filter_suggestion
# @param limit [Integer] Threshold after which generation stops
# @yield [Suggestion] Each accepted edit
def edit_suggestions(word, compounds:, limit:)
  emitted = 0

  edits(word) do |candidate|
    break if emitted > limit

    accepted = filter_suggestion(candidate, compounds)
    if accepted
      yield accepted
      emitted += 1
    end
  end
end
|
|
430
|
+
|
|
431
|
+
# Generate ngram-based suggestions by delegating to NgramSuggest.suggest.
#
# Nothing is yielded unless @aff[:MAXNGRAMSUGS] is a positive number.
# The misspelled word and the already handled suggestions are passed
# downcased.
#
# @param word [String] Misspelled word
# @param handled [Set<String>] Suggestions already produced
# @yield [String] Each ngram suggestion
def ngram_suggestions(word, handled:)
  return unless @aff[:MAXNGRAMSUGS]&.positive?

  # An explicit `false` from the aff data must be honoured; the previous
  # `@aff[:ONLYMAXDIFF] || true` evaluated to true for every input. The
  # default of true is kept only when the directive is unset (nil).
  only_max_diff = @aff[:ONLYMAXDIFF]
  only_max_diff = true if only_max_diff.nil?

  NgramSuggest.suggest(
    word.downcase,
    dictionary_words: @words_for_ngram,
    prefixes: @aff[:PFX] || {},
    suffixes: @aff[:SFX] || {},
    known: handled.map(&:downcase).to_set,
    maxdiff: @aff[:MAXDIFF] || 2,
    onlymaxdiff: only_max_diff,
    has_phonetic: !@aff[:PHONE].nil?
  ) do |suggestion|
    yield suggestion
  end
end
|
|
454
|
+
|
|
455
|
+
# Generate phonetic suggestions by delegating to PhonetSuggest.suggest.
#
# Nothing is yielded when the aff data has no PHONE table.
#
# @param word [String] Misspelled word
# @yield [String] Each phonetic suggestion
def phonet_suggestions(word)
  phone_table = @aff[:PHONE]
  return unless phone_table

  PhonetSuggest.suggest(word, dictionary_words: @words_for_ngram, table: phone_table) do |similar|
    yield similar
  end
end
|
|
470
|
+
|
|
471
|
+
# Whether split words may also be joined with a dash.
#
# Following Hunspell's definition: either the TRY directive contains a
# dash, or it contains 'a' (taken as a sign of a Latin-script alphabet).
# A missing TRY directive counts as an empty string.
#
# @return [Boolean] Whether dashes are allowed
def use_dash?
  alphabet = @aff[:TRY] || ''
  %w[- a].any? { |marker| alphabet.include?(marker) }
end
|
|
481
|
+
|
|
482
|
+
private
|
|
483
|
+
|
|
484
|
+
# Post-process a found suggestion and yield it if it survives all checks.
#
# Steps, in order: coerce the text to the capitalization type of the
# original word (skipped for KEEPCASE suggestions), drop forbidden words,
# apply the OCONV conversion, drop exact duplicates and, when
# +check_inclusion+ is set, drop texts that contain an earlier suggestion.
# An accepted text is recorded in +handled+ before it is yielded.
#
# @param suggestion [Suggestion] Raw suggestion (must respond to #text and #replace)
# @param captype [Object] Capitalization type of the original word
# @param is_forbidden [Proc] Callable telling whether a word is forbidden
# @param handled [Set<String>] Texts already yielded; updated in place
# @param check_inclusion [Boolean] Whether to reject texts that contain a previous suggestion
# @yield [Suggestion] Copy of the suggestion carrying the final text
def handle_found(suggestion, captype:, is_forbidden:, handled:, check_inclusion: false)
  return unless block_given?

  text = suggestion.text

  unless @aff[:KEEPCASE] && suggestion_has_keepcase_flag?(suggestion)
    text = @aff[:casing].coerce(text, captype)

    # If the coerced form is forbidden, fall back to the original text.
    text = suggestion.text if text != suggestion.text && is_forbidden.call(text)

    # Intended to fix "aNew" -> "a New".
    # NOTE(review): pos is the index of the space, so text[pos] is always
    # ' ' and the inner condition can never hold as written. The comparison
    # presumably should use the character of the original misspelled word
    # at that index, which is not available in this method - confirm.
    if [Capitalization::Type::HUH, Capitalization::Type::HUHINIT].include?(captype) && text.include?(' ')
      pos = text.index(' ')
      if pos && text[pos + 1] != text[pos] && text[pos + 1]&.upcase == text[pos]
        text = text[0...pos + 1] + text[pos] + text[(pos + 2)..]
      end
    end
  end

  return if is_forbidden.call(text)

  # Output conversion, when the aff data defines one.
  text = @aff[:OCONV].call(text) if @aff[:OCONV]

  return if handled.include?(text)

  if check_inclusion
    # Case-insensitive containment test. Core String#include? is used
    # because String#in? exists only with Active Support loaded.
    lowered = text.downcase
    return if handled.any? { |prev| lowered.include?(prev.downcase) }
  end

  handled.add(text)
  yield suggestion.replace(text: text)
end
|
|
534
|
+
|
|
535
|
+
# Whether the suggestion should be treated as carrying the KEEPCASE flag.
#
# This is a simplified stand-in: the dictionary is not consulted. When the
# aff data defines KEEPCASE, any suggestion whose text contains 'ß' counts
# as flagged; without KEEPCASE nothing does.
#
# @param suggestion [Suggestion, MultiWordSuggestion]
# @return [Boolean]
def suggestion_has_keepcase_flag?(suggestion)
  @aff[:KEEPCASE] ? suggestion.text.include?('ß') : false
end
|
|
545
|
+
|
|
546
|
+
# Validate a candidate edit against the lookup.
#
# With +compounds+ set, good_forms is queried with affix forms disabled;
# otherwise with compound forms disabled. A MultiWordSuggestion passes only
# if every one of its words passes, and is then collapsed with #stringify.
#
# @param suggestion [Suggestion, MultiWordSuggestion]
# @param compounds [Boolean] Whether to check compound forms
# @return [Suggestion, nil] The usable suggestion, or nil when rejected
def filter_suggestion(suggestion, compounds)
  disabled_forms = compounds ? :affix_forms : :compound_forms
  lookup_options = { capitalization: false, allow_nosuggest: false, disabled_forms => false }

  valid = lambda do |candidate|
    @lookup.good_forms(candidate, **lookup_options).any?
  end

  case suggestion
  when MultiWordSuggestion
    suggestion.stringify if suggestion.words.all? { |part| valid.call(part) }
  else
    suggestion if valid.call(suggestion.text)
  end
end
|
|
572
|
+
end
|
|
573
|
+
end
|
|
574
|
+
end
|
|
575
|
+
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
# Algorithms namespace for spell checking algorithms.
|
|
5
|
+
#
|
|
6
|
+
# Contains the core algorithms ported from Spylls:
|
|
7
|
+
# - NgramSuggest: N-gram based suggestion algorithm
|
|
8
|
+
# - Lookup: Word correctness checking with affix support
|
|
9
|
+
# - Suggest: Main suggestion orchestration
|
|
10
|
+
#
|
|
11
|
+
# These are the core Hunspell algorithms that make spell checking work.
|
|
12
|
+
module Algorithms
|
|
13
|
+
end
|
|
14
|
+
end
|