kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module Algorithms
    # N-gram based suggestion algorithm.
    #
    # Ported from Spylls (Python) ngram_suggest.py
    #
    # Ranks dictionary words against a misspelling using n-gram similarity
    # and yields the best ones. The pipeline has four stages:
    # 1. root_score: quick 3-gram score + left common substring, used to
    #    pick at most MAX_ROOTS candidate roots
    # 2. rough_affix_score: n-gram score of each generated form, compared
    #    against a threshold derived from the misspelling itself
    # 3. precise_affix_score: full scoring with LCS, bigrams, etc.
    # 4. filter_guesses: bucket the scored guesses and yield the survivors
    module NgramSuggest
      # Maximum number of root words to consider in first pass
      MAX_ROOTS = 100

      # Maximum number of suggestions to generate
      MAX_GUESSES = 200

      class << self
        # Main entry point for n-gram based suggestions.
        #
        # This is a simplified version: forms_for currently returns only the
        # bare stem, so +prefixes+ and +suffixes+ are accepted but have no
        # effect yet.
        #
        # @param misspelling [String] The misspelled word
        # @param dictionary_words [Array<Hash, String>] Dictionary entries; either
        #   hashes with a :stem (and optional :flags) key or plain stem strings
        # @param prefixes [Hash] Prefix flags to prefix objects mapping
        # @param suffixes [Hash] Suffix flags to suffix objects mapping
        # @param known [Set<String>] Already suggested words (to avoid duplicates)
        # @param maxdiff [Integer] MAXDIFF value from aff file (0-10)
        # @param onlymaxdiff [Boolean] ONLYMAXDIFF flag
        # @param has_phonetic [Boolean] Whether PHONE table exists in aff file
        # @yield [String] Each suggestion
        # @return [Enumerator, nil] An enumerator over the suggestions when no
        #   block is given
        def suggest(misspelling,
                    dictionary_words:,
                    prefixes: {},
                    suffixes: {},
                    known: Set.new,
                    maxdiff: 2,
                    onlymaxdiff: true,
                    has_phonetic: false,
                    &block)
          unless block
            return enum_for(:suggest, misspelling,
                            dictionary_words: dictionary_words, prefixes: prefixes,
                            suffixes: suffixes, known: known, maxdiff: maxdiff,
                            onlymaxdiff: onlymaxdiff, has_phonetic: has_phonetic)
          end

          # Stage 1: find the best root candidates by n-gram score
          roots = best_roots(misspelling, dictionary_words)

          # Stage 2: generate forms for every root and keep those above threshold
          threshold = detect_threshold(misspelling)
          guess_scores = []

          roots.each do |root_entry|
            forms_for(root_entry, prefixes, suffixes, similar_to: misspelling).each do |form|
              text = form.to_s
              score = rough_affix_score(misspelling, text.downcase)
              guess_scores << [score, text, text] if score > threshold
            end
          end

          # Best MAX_GUESSES triples, highest first (ties ordered by the form text)
          guesses = guess_scores.max(MAX_GUESSES)

          # Stage 3: calculate precise scores
          fact = maxdiff >= 0 ? (10.0 - maxdiff) / 5.0 : 1.0

          ranked = guesses.map do |score, compared, real|
            precise = precise_affix_score(misspelling, compared.downcase, fact,
                                          base: score, has_phonetic: has_phonetic)
            [precise, real]
          end.sort.reverse

          # Stage 4: filter and yield suggestions
          filter_guesses(ranked, known: known, onlymaxdiff: onlymaxdiff, &block)
        end

        # Stage 1 scoring: 3-gram score + left common substring.
        #
        # @param word1 [String] Misspelled word
        # @param word2 [String] Possible suggestion (downcased before comparing)
        # @return [Float] Root score
        def root_score(word1, word2)
          candidate = word2.downcase

          StringMetrics.ngram(3, word1, candidate, longer_worse: true) +
            StringMetrics.leftcommonsubstring(word1, candidate).to_f
        end

        # Stage 2 scoring: n-gram score with n = word1.length + left common substring.
        #
        # @param word1 [String] Misspelled word
        # @param word2 [String] Possible suggestion (downcased before comparing)
        # @return [Float] Rough affix score
        def rough_affix_score(word1, word2)
          candidate = word2.downcase

          StringMetrics.ngram(word1.length, word1, candidate, any_mismatch: true) +
            StringMetrics.leftcommonsubstring(word1, candidate).to_f
        end

        # Stage 3 scoring: full precise scoring.
        #
        # Returns one of three "score groups":
        # - > 1000: Very good (same word, different casing)
        # - < -100: Questionable (too different)
        # - -100 to 1000: Normal suggestion
        #
        # @param word1 [String] Misspelled word
        # @param word2 [String] Possible suggestion
        # @param diff_factor [Float] Factor based on MAXDIFF (0-2)
        # @param base [Float] Base score from stage 2
        # @param has_phonetic [Boolean] Whether PHONE table exists
        # @return [Numeric] Precise affix score
        def precise_affix_score(word1, word2, diff_factor, base:, has_phonetic: false)
          # All comparisons are case-insensitive so case-only differences are caught
          left = word1.downcase
          right = word2.downcase

          lcs = StringMetrics.lcslen(left, right)

          # Same length and the LCS covers the whole word: "very good" suggestion
          return base + 2000 if word1.length == word2.length && word1.length == lcs

          # Weighted bigrams, measured in both directions
          bigrams =
            StringMetrics.ngram(2, left, right, any_mismatch: true, weighted: true) +
            StringMetrics.ngram(2, right, left, any_mismatch: true, weighted: true)

          # 2 * LCS - length difference ...
          result = 2 * lcs - (word1.length - word2.length).abs
          # ... plus common start substring length
          result += StringMetrics.leftcommonsubstring(left, right)
          # ... plus 1 if any characters match at the same positions
          result += 1 if StringMetrics.commoncharacters(left, right) > 0
          # ... plus regular 4-gram score
          result += StringMetrics.ngram(4, left, right, any_mismatch: true)
          result += bigrams

          # Too few shared bigrams pushes the guess into the "questionable" group
          questionable_limit =
            if has_phonetic
              word2.length * diff_factor
            else
              (word1.length + word2.length) * diff_factor
            end
          result -= 1000 if bigrams < questionable_limit

          result
        end

        # Calculate minimum threshold for passable suggestions.
        #
        # Mangles the word in 3 different ways (starting at offset 1, 2 and 3,
        # every 4th char is replaced with '*'), scores each mangled copy against
        # the original, and returns the average minus 1.
        #
        # @param word [String] The misspelled word
        # @return [Float] Minimum threshold score
        def detect_threshold(word)
          total = 0.0

          (1..3).each do |start_pos|
            mangled = word.each_char.with_index.map do |char, pos|
              pos >= start_pos && ((pos - start_pos) % 4).zero? ? "*" : char
            end.join

            total += StringMetrics.ngram(word.length, word, mangled, any_mismatch: true)
          end

          (total / 3.0) - 1
        end

        # Generate the candidate forms for a dictionary word.
        #
        # Simplified: only the base form is returned for now. Affix expansion is
        # not implemented, so the affix tables and +similar_to+ are unused.
        #
        # @param word_entry [Hash, String] Dictionary word (hash with :stem, or the stem itself)
        # @param all_prefixes [Hash] Available prefixes (currently unused)
        # @param all_suffixes [Hash] Available suffixes (currently unused)
        # @param similar_to [String] Original misspelling (currently unused)
        # @return [Array<String>] Generated forms
        def forms_for(word_entry, all_prefixes, all_suffixes, similar_to:)
          [stem_of(word_entry)]
        end

        # Filter guesses by score into quality buckets.
        #
        # Score buckets:
        # - > 1000: Very good (same word, different casing)
        # - 1000 to -100: Normal suggestions
        # - < -100: Questionable (too different)
        #
        # Stops when:
        # - A very good (or questionable) guess was seen and the next one is not very good
        # - A questionable guess is reached after something was already yielded,
        #   or while onlymaxdiff is set
        #
        # @param guesses [Array<Array>] Array of [score, value] pairs, best first
        # @param known [Set<String>] Already suggested words; a guess is skipped
        #   when it contains any of them as a substring
        # @param onlymaxdiff [Boolean] Whether to exclude questionable
        # @yield [String] Each filtered suggestion
        # @return [Enumerator, nil] An enumerator when no block is given
        def filter_guesses(guesses, known:, onlymaxdiff: true)
          return enum_for(:filter_guesses, guesses, known: known, onlymaxdiff: onlymaxdiff) unless block_given?

          seen = false
          found = 0

          guesses.each do |score, value|
            # After a very good guess only further very good ones are accepted
            return if seen && score <= 1000

            if score > 1000
              seen = true
            elsif score < -100
              # Questionable: only acceptable as the sole suggestion
              return if found.positive? || onlymaxdiff

              seen = true
            end

            next if known.any? { |known_word| value.include?(known_word) }

            found += 1
            yield value
          end
        end

        private

        # Stem text of a dictionary entry. Plain strings are their own stem;
        # String#[] with a Symbol raises, so they must not be indexed like a hash.
        def stem_of(entry)
          entry.is_a?(String) ? entry : (entry[:stem] || entry)
        end

        # Entries with the MAX_ROOTS highest positive root scores, best first.
        #
        # Candidates are ranked by score alone (entries may be hashes, which
        # are not comparable) and the buffer is pruned only once it has
        # doubled, instead of being re-sorted for every dictionary word.
        def best_roots(misspelling, dictionary_words)
          candidates = []

          dictionary_words.each do |entry|
            stem = stem_of(entry)

            # Skip words with length difference > 4
            next if (stem.length - misspelling.length).abs > 4

            score = root_score(misspelling, stem)
            next unless score > 0

            candidates << [score, entry]
            candidates = candidates.max_by(MAX_ROOTS, &:first) if candidates.size >= 2 * MAX_ROOTS
          end

          candidates.max_by(MAX_ROOTS, &:first).map { |_score, entry| entry }
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module Algorithms
    # Word permutation algorithms for generating spelling variations.
    #
    # Ported from Spylls (Python) permutations.py
    #
    # These functions generate various word edits that are used by the
    # suggestion system to find possible corrections for misspelled words.
    # Every generator yields its variants to the given block and returns an
    # Enumerator when called without one.
    #
    # Method names match Hunspell's suggest.cxx to maintain compatibility.
    module Permutations
      # Upper bound (exclusive) on how far apart two characters may be for
      # longswapchar / movechar to consider them.
      MAX_CHAR_DISTANCE = 4

      module_function

      # Uses REP table (typical misspellings) to replace patterns in word.
      #
      # Each non-overlapping match of each pattern produces one suggestion.
      # "_" in a replacement stands for a space; when the resulting suggestion
      # contains a space, two hypotheses are yielded:
      # 1. It was one (dictionary) word "foo bar" (yielded as a String)
      # 2. It was words ["foo", "bar"] (yielded as an Array, split at the first space)
      #
      # @param word [String] The word to process
      # @param reptable [Array<Hash>] Array of replacement pattern hashes with :regexp and :replacement
      # @yield [String, Array<String>] Each suggestion (string or array of words)
      # @return [Enumerator, nil] An enumerator when no block is given
      #
      # @example
      #   Kotoshu::Algorithms::Permutations.replchars("acces", [{regexp: /ac/, replacement: "ex"}]) do |sug|
      #     puts sug
      #   end
      def replchars(word, reptable)
        return enum_for(:replchars, word, reptable) unless block_given?
        return if word.length < 2 || reptable.nil? || reptable.empty?

        str = word.to_s

        reptable.each do |pattern|
          replacement = pattern[:replacement].gsub('_', ' ')
          pos = 0

          while (match_data = pattern[:regexp].match(str, pos))
            match_start = match_data.begin(0)
            match_end = match_data.end(0)
            suggestion = str[0...match_start] + replacement + str[match_end..]

            yield suggestion
            yield suggestion.split(' ', 2) if suggestion.include?(' ')

            # Continue after this match. A zero-width match consumes nothing,
            # so step one character forward to guarantee the scan terminates.
            pos = match_end > match_start ? match_end : match_end + 1
            break if pos >= str.length
          end
        end
      end

      # Uses MAP table (sets of potentially similar chars) and tries to replace them recursively.
      #
      # Example: Assuming MAP has entry "aáã", and we have misspelling "anarchia":
      # mapchars will produce: "ánarchia", "ánárchia", "ánárchiá", etc.
      #
      # @param word [String] The word to process
      # @param maptable [Array<Set<String>>] Array of character sets for mapping
      # @yield [String] Each variant with mapped characters
      # @return [Enumerator, nil] An enumerator when no block is given
      #
      # @example
      #   Kotoshu::Algorithms::Permutations.mapchars("anarchia", [Set.new(['a', 'á', 'ã'])]) do |variant|
      #     puts variant
      #   end
      def mapchars(word, maptable, &block)
        return enum_for(:mapchars, word, maptable) unless block
        return if word.length < 2 || maptable.nil? || maptable.empty?

        mapchars_internal(word, 0, maptable, &block)
      end

      # Produces permutations with adjacent chars swapped.
      #
      # For short (4 or 5 letters) words also produces double swaps: ahev -> have
      #
      # @param word [String] The word to process
      # @yield [String] Each swap variant
      # @return [Enumerator, nil] An enumerator when no block is given
      def swapchar(word)
        return enum_for(:swapchar, word) unless block_given?
        return if word.length < 2

        chars = word.chars
        (0...chars.length - 1).each do |i|
          swapped = chars.dup
          swapped[i], swapped[i + 1] = swapped[i + 1], swapped[i]
          yield swapped.join
        end

        # Double swaps for short words: ahev -> have, owudl -> would
        case word.length
        when 4
          yield word[1] + word[0] + word[-1] + word[-2]
        when 5
          yield word[1] + word[0] + word[2] + word[-1] + word[-2]
          yield word[0] + word[2] + word[1] + word[-1] + word[-2]
        end
      end

      # Produces permutations with non-adjacent chars swapped
      # (positions at least 2 and fewer than MAX_CHAR_DISTANCE apart).
      #
      # @param word [String] The word to process
      # @yield [String] Each long swap variant
      # @return [Enumerator, nil] An enumerator when no block is given
      def longswapchar(word)
        return enum_for(:longswapchar, word) unless block_given?

        chars = word.chars
        (0...chars.length - 2).each do |first|
          last_exclusive = [first + MAX_CHAR_DISTANCE, chars.length].min
          ((first + 2)...last_exclusive).each do |second|
            swapped = chars.dup
            swapped[first], swapped[second] = swapped[second], swapped[first]
            yield swapped.join
          end
        end
      end

      # Produces permutations with one char upcased (in case the shift key was
      # missed) or replaced by a char adjacent to it in the keyboard layout
      # string ("vat -> cat").
      #
      # "|" in the layout separates keyboard rows: neighbours across a "|" are
      # not considered adjacent.
      #
      # @param word [String] The word to process
      # @param layout [String, nil] Keyboard layout string (KEY from aff file)
      # @yield [String] Each variant with replaced chars
      # @return [Enumerator, nil] An enumerator when no block is given
      def badcharkey(word, layout)
        return enum_for(:badcharkey, word, layout) unless block_given?

        have_layout = !(layout.nil? || layout.empty?)

        word.chars.each_with_index do |c, i|
          before = word[0...i]
          after = word[(i + 1)..].to_s

          # Try uppercasing if not already uppercase
          yield before + c.upcase + after unless c == c.upcase

          next unless have_layout

          # Try both neighbours of every occurrence of the char in the layout
          pos = layout.index(c)
          while pos
            yield before + layout[pos - 1] + after if pos.positive? && layout[pos - 1] != '|'
            yield before + layout[pos + 1] + after if pos + 1 < layout.length && layout[pos + 1] != '|'
            pos = layout.index(c, pos + 1)
          end
        end
      end

      # Produces permutations with one char removed in all possible positions.
      #
      # @param word [String] The word to process
      # @yield [String] Each variant with one char removed
      # @return [Enumerator, nil] An enumerator when no block is given
      def extrachar(word)
        return enum_for(:extrachar, word) unless block_given?
        return if word.length < 2

        word.length.times do |i|
          yield word[0...i] + word[(i + 1)..]
        end
      end

      # Produces permutations with one char inserted in all possible positions
      # (including before the first and after the last char).
      #
      # List of chars is taken from TRY string -- if absent, tries nothing.
      # Chars are expected to be sorted in order of usage in language.
      #
      # @param word [String] The word to process
      # @param trystring [String] Characters to try inserting (from aff TRY directive)
      # @yield [String] Each variant with one char inserted
      # @return [Enumerator, nil] An enumerator when no block is given
      def forgotchar(word, trystring)
        return enum_for(:forgotchar, word, trystring) unless block_given?
        return if trystring.nil? || trystring.empty?

        trystring.each_char do |c|
          (0..word.length).each do |i|
            yield word[0...i] + c + word[i..]
          end
        end
      end

      # Produces permutations with one character moved forward or backward by
      # at least 2 and fewer than MAX_CHAR_DISTANCE places (not 1, because
      # adjacent swaps are already handled by swapchar).
      #
      # @param word [String] The word to process
      # @yield [String] Each variant with moved character
      # @return [Enumerator, nil] An enumerator when no block is given
      def movechar(word)
        return enum_for(:movechar, word) unless block_given?
        return if word.length < 2

        chars = word.chars

        # Move characters forward: the char is re-inserted just before +topos+
        chars.each_with_index do |char, frompos|
          ((frompos + 3)...[chars.length, frompos + MAX_CHAR_DISTANCE + 1].min).each do |topos|
            moved = chars[0...frompos] + chars[(frompos + 1)...topos] + [char] + chars[topos..]
            yield moved.join
          end
        end

        # Move characters backward: the char is re-inserted at +topos+, nearest
        # target first. Targets run from 2 places back down to
        # MAX_CHAR_DISTANCE - 1 places back (clamped to the word start),
        # mirroring the forward pass.
        (chars.length - 1).downto(0) do |frompos|
          farthest = [0, frompos - MAX_CHAR_DISTANCE + 1].max
          (frompos - 2).downto(farthest) do |topos|
            moved = chars[0...topos] + [chars[frompos]] + chars[topos...frompos] + chars[(frompos + 1)..]
            yield moved.join
          end
        end
      end

      # Produces permutations with chars replaced by chars in TRY set.
      #
      # @param word [String] The word to process
      # @param trystring [String] Characters to try replacing with (from aff TRY directive)
      # @yield [String] Each variant with replaced char
      # @return [Enumerator, nil] An enumerator when no block is given
      def badchar(word, trystring)
        return enum_for(:badchar, word, trystring) unless block_given?
        return if trystring.nil? || trystring.empty?

        trystring.each_char do |c|
          (word.length - 1).downto(0) do |i|
            next if word[i] == c

            yield word[0...i] + c + word[(i + 1)..]
          end
        end
      end

      # Produces permutations with accidental two-letter-doubling fixed.
      # Example: "vacacation" -> "vacation"
      #
      # The same fix may be yielded more than once when the doubled pair can
      # be removed at several positions.
      #
      # @param word [String] The word to process
      # @yield [String] Each variant with fixed doubling
      # @return [Enumerator, nil] An enumerator when no block is given
      def doubletwochars(word)
        return enum_for(:doubletwochars, word) unless block_given?
        return if word.length < 5

        # word[i-3..i] must read "ABAB". Start at 3 so that i - 3 never goes
        # negative and wraps around to the end of the word.
        (3...word.length).each do |i|
          next unless word[i - 2] == word[i] && word[i - 3] == word[i - 1]

          yield word[0...(i - 1)] + word[(i + 1)..]
        end
      end

      # Produces permutations of splitting word into two in all possible positions.
      #
      # @param word [String] The word to process
      # @yield [Array<String>] Each two-word split
      # @return [Enumerator, nil] An enumerator when no block is given
      def twowords(word)
        return enum_for(:twowords, word) unless block_given?

        (1...word.length).each do |i|
          yield [word[0...i], word[i..]]
        end
      end

      # Internal recursive method for mapchars.
      #
      # For every MAP group and every member of it found at or after +start+
      # (first occurrence only), yields the word with that member replaced by
      # each other member of the group, then recurses on the replaced word
      # from the following position.
      #
      # @param word [String] Current word state
      # @param start [Integer] Starting position for search
      # @param maptable [Array<Set<String>>] Character mapping table
      # @yield [String] Each variant
      def mapchars_internal(word, start, maptable, &block)
        return if start >= word.length

        maptable.each do |options|
          options.each do |option|
            pos = word.index(option, start)
            next unless pos

            head = word[0...pos]
            tail = word[(pos + option.length)..]

            options.each do |other|
              next if other == option

              replaced = head + other + tail
              yield replaced

              # Recursively continue from this position
              mapchars_internal(replaced, pos + 1, maptable, &block)
            end
          end
        end
      end

      private_class_method :mapchars_internal
    end
  end
end
|