kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Suggestions
|
|
5
|
+
module Strategies
|
|
6
|
+
# Keyboard proximity suggestion strategy.
|
|
7
|
+
#
|
|
8
|
+
# Generates suggestions by finding words that can be formed by
|
|
9
|
+
# substituting adjacent keys on a QWERTY keyboard.
|
|
10
|
+
#
|
|
11
|
+
# @example Creating a keyboard proximity strategy
|
|
12
|
+
# strategy = KeyboardProximityStrategy.new
|
|
13
|
+
# result = strategy.generate(context)
|
|
14
|
+
class KeyboardProximityStrategy < BaseStrategy
|
|
15
|
+
# Adjacency table for a US QWERTY keyboard.
#
# Each key is a single lowercase character; its value lists the keys that
# sit next to it. Some values name non-character keys ("tab", "caps",
# "shift", "enter") instead of a character.
KEYBOARD_LAYOUT = {
  # Number row
  "`" => ["1", "tab"],
  "1" => ["`", "2", "q"],
  "2" => ["1", "3", "w", "q"],
  "3" => ["2", "4", "e", "w"],
  "4" => ["3", "5", "r", "e"],
  "5" => ["4", "6", "t", "r"],
  "6" => ["5", "7", "y", "t"],
  "7" => ["6", "8", "u", "y"],
  "8" => ["7", "9", "i", "u"],
  "9" => ["8", "0", "o", "i"],
  "0" => ["9", "p", "o"],
  "-" => ["0", "="],
  "=" => ["-"],

  # Top letter row
  "q" => ["tab", "w", "a", "1"],
  "w" => ["q", "e", "a", "s", "2"],
  "e" => ["w", "r", "s", "d", "3"],
  "r" => ["e", "t", "d", "f", "4"],
  "t" => ["r", "y", "f", "g", "5"],
  "y" => ["t", "u", "g", "h", "6"],
  "u" => ["y", "i", "h", "j", "7"],
  "i" => ["u", "o", "j", "k", "8"],
  "o" => ["i", "p", "k", "l", "9"],
  "p" => ["o", "l", ";", "0"],
  "[" => ["p", "'"],
  "]" => ["enter", "\\"],
  "\\" => ["enter"],

  # Home row
  "a" => ["caps", "s", "z", "q"],
  "s" => ["a", "d", "z", "x", "w"],
  "d" => ["s", "f", "x", "c", "e"],
  "f" => ["d", "g", "c", "v", "r"],
  "g" => ["f", "h", "v", "b", "t"],
  "h" => ["g", "j", "b", "n", "y"],
  "j" => ["h", "k", "n", "m", "u"],
  "k" => ["j", "l", "m", ",", "i"],
  "l" => ["k", ";", ",", ".", "o"],
  ";" => ["l", "'", ".", "p"],
  "'" => [";"],

  # Bottom row
  "z" => ["shift", "s", "x", "a"],
  "x" => ["z", "c", "s", "d"],
  "c" => ["x", "v", "d", "f"],
  "v" => ["c", "b", "f", "g"],
  "b" => ["v", "n", "g", "h"],
  "n" => ["b", "m", "h", "j"],
  "m" => ["n", ",", "j", "k"],
  "," => ["m", ".", "k", "l"],
  "." => [",", "/", "l", ";"],
  "/" => [".", "shift"],

  # The space bar is listed with no neighbours.
  " " => []
}.freeze
|
|
68
|
+
|
|
69
|
+
# Build a keyboard proximity strategy.
#
# All arguments are handed to the parent initializer unchanged.
#
# @param name [String, Symbol] Strategy name (defaults to :keyboard_proximity)
# @param config [Hash] Configuration options
# @option config [Integer] max_distance Maximum keyboard distance
# @option config [Integer] max_results Maximum results to return
def initialize(name: :keyboard_proximity, **config)
  # Bare `super` forwards `name:` and `**config` exactly as received.
  super
end
|
|
78
|
+
|
|
79
|
+
# Generate suggestions based on keyboard proximity.
#
# Keyboard variants of the misspelled word are looked up in the
# dictionary; matches are kept when they are within `max_distance`
# edits of the original word and reach the `min_similarity` threshold.
#
# Reads the `:max_distance` (default 2) and `:min_similarity`
# (default 0.70) configuration values.
#
# @param context [Context] The suggestion context
# @return [SuggestionSet] Suggestions within keyboard distance
def generate(context)
  word = context.word
  max_dist = get_config(:max_distance, 2)
  min_similarity = get_config(:min_similarity, 0.70)

  # Index the dictionary by lowercase form once, instead of scanning the
  # whole word list for every variant. An entry that is already lowercase
  # wins over a differently-cased one; otherwise the first entry is kept.
  lookup = {}
  dictionary_words(context).each do |entry|
    key = entry.downcase
    lookup[key] = entry if entry == key || !lookup.key?(key)
  end

  distances = {}
  keyboard_variants(word, max_dist).each do |variant|
    next if variant.empty?

    dict_word = lookup[variant.downcase]
    next if dict_word.nil? || dict_word == word || distances.key?(dict_word)

    # The distance depends only on (word, dict_word), so one measurement
    # per dictionary word is enough.
    dist = edit_distance(word, dict_word)
    next if dist > max_dist
    next if calculate_ngram_similarity(word, dict_word) < min_similarity

    distances[dict_word] = dist
  end

  # Closest matches first.
  sorted_words = distances.sort_by { |_, dist| dist }.map(&:first)
  create_suggestion_set(sorted_words, distances: distances, original_word: word)
end
|
|
116
|
+
|
|
117
|
+
# Decide whether this strategy applies to the given context.
#
# A disabled strategy never applies; otherwise it applies when the
# dictionary lookup for the context's word fails.
#
# @param context [Context] The suggestion context
# @return [Boolean] True if the word needs correction
def handles?(context)
  if enabled?
    !dictionary_lookup(context, context.word)
  else
    false
  end
end
|
|
126
|
+
|
|
127
|
+
private
|
|
128
|
+
|
|
129
|
+
# Levenshtein distance between two strings.
#
# Counts the minimum number of single-character substitutions,
# insertions and deletions needed to turn +str1+ into +str2+.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Integer] Edit distance
def edit_distance(str1, str2)
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  target = str2.chars

  # Distances from the empty prefix of str1 to every prefix of str2.
  prev_row = (0..target.length).to_a

  str1.each_char.with_index(1) do |source_char, row_index|
    # Each row starts with the cost of deleting the whole str1 prefix.
    prev_row = target.each_with_index.each_with_object([row_index]) do |(target_char, col), row|
      penalty = source_char == target_char ? 0 : 1

      row << [
        prev_row[col + 1] + 1,  # deletion
        row[col] + 1,           # insertion
        prev_row[col] + penalty # substitution
      ].min
    end
  end

  prev_row.last
end
|
|
164
|
+
|
|
165
|
+
# Get neighbors for a key.
#
# @param char [String] The character
# @return [Array<String>] Neighbor keys from KEYBOARD_LAYOUT, or an
#   empty array when the key is not in the layout
def neighbors(char)
  KEYBOARD_LAYOUT[char.downcase] || []
end

# Generate keyboard variants of a word.
#
# Starting from the lowercased word, each round applies one edit to
# every variant found in the previous round:
#
# * delete the character at a position,
# * replace a character with one of its neighbouring keys,
# * insert a neighbouring key in front of a character.
#
# Variants from all rounds are accumulated, so the result holds
# everything reachable in at most +max_distance+ edits, including the
# lowercased word itself.
#
# @param word [String] The word
# @param max_distance [Integer] Maximum number of edit rounds
# @return [Array<String>] Keyboard variants
def keyboard_variants(word, max_distance)
  return [] if word.nil? || word.empty?

  seed = word.downcase
  seen = Set.new([seed])
  frontier = [seed]

  max_distance.times do
    next_frontier = []

    frontier.each do |variant|
      variant.each_char.with_index do |char, i|
        head = variant[0...i]
        tail = variant[(i + 1)..]

        # Deletion does not depend on the neighbours, so emit it once.
        candidates = [head + tail]

        neighbors(char).each do |key|
          # The layout also lists non-character keys ("tab", "shift", ...);
          # splicing their names into a word would only produce garbage.
          next unless key.length == 1

          candidates << (head + key + tail)        # substitution
          candidates << (head + key + char + tail) # insertion before char
        end

        # Set#add? returns nil for known variants, so each variant is
        # expanded at most once.
        candidates.each { |candidate| next_frontier << candidate if seen.add?(candidate) }
      end
    end

    break if next_frontier.empty?

    frontier = next_frontier
  end

  seen.to_a
end
|
|
206
|
+
|
|
207
|
+
# Find a word in the dictionary (case-insensitive).
#
# The lowercase form is tried first; if it is not present, the first
# entry whose lowercase form matches is returned. The result is always
# an entry of +all_words+, never the caller's own spelling.
#
# @param all_words [Array<String>] All dictionary words
# @param word [String] The word to find
# @return [String, nil] The dictionary word or nil
def find_word(all_words, word)
  return nil if word.nil? || word.empty?

  word_lower = word.downcase

  # Return the entry that is in the dictionary (the lowercase form),
  # not the caller's casing.
  return word_lower if all_words.include?(word_lower)

  all_words.find { |candidate| candidate.downcase == word_lower }
end
|
|
223
|
+
end
|
|
224
|
+
end
|
|
225
|
+
end
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
# Set is only autoloaded from Ruby 3.2 onward, so load it explicitly;
# requiring it again on newer rubies is a no-op.
require "set"
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Suggestions
|
|
5
|
+
module Strategies
|
|
6
|
+
# N-gram suggestion strategy.
|
|
7
|
+
#
|
|
8
|
+
# Generates suggestions by finding words with high n-gram similarity.
|
|
9
|
+
# N-grams are contiguous sequences of n characters.
|
|
10
|
+
#
|
|
11
|
+
# @example Creating an n-gram strategy
|
|
12
|
+
# strategy = NgramStrategy.new(n: 3)
|
|
13
|
+
# result = strategy.generate(context)
|
|
14
|
+
class NgramStrategy < BaseStrategy
|
|
15
|
+
# Build an n-gram strategy.
#
# All arguments are handed to the parent initializer unchanged.
#
# @param name [String, Symbol] Strategy name (defaults to :ngram)
# @param config [Hash] Configuration options
# @option config [Integer] n N-gram size (default: 3)
# @option config [Float] min_similarity Minimum similarity threshold (0-1)
# @option config [Integer] max_results Maximum results to return
def initialize(name: :ngram, **config)
  # Bare `super` forwards `name:` and `**config` exactly as received.
  super
end
|
|
25
|
+
|
|
26
|
+
# Generate suggestions based on n-gram similarity.
#
# Reads the `:n` (default 3), `:min_similarity` (default 0.3) and
# `:min_typo_similarity` (default 0.70) configuration values. Words
# shorter than `n` characters produce an empty suggestion set.
#
# @param context [Context] The suggestion context
# @return [SuggestionSet] Suggestions with high n-gram similarity
def generate(context)
  word = context.word
  n = get_config(:n, 3)
  min_sim = get_config(:min_similarity, 0.3)
  min_typo_similarity = get_config(:min_typo_similarity, 0.70)

  return create_suggestion_set([]) if word.length < n

  all_words = dictionary_words(context)
  word_ngrams = extract_ngrams(word, n)

  results = {}
  all_words.each do |dict_word|
    next if dict_word == word || dict_word.length < n

    similarity = ngram_similarity(word_ngrams, dict_word, n)
    next if similarity < min_sim

    # Second filter: typo correction similarity.
    next if calculate_ngram_similarity(word, dict_word) < min_typo_similarity

    # Map similarity onto a 1..10 distance (higher similarity = lower
    # distance). Candidates above 0.9 similarity would truncate to 0;
    # clamp them to 1 so the closest matches are kept, not discarded.
    dist = [((1 - similarity) * 10).to_i, 1].max

    results[dict_word] = [results.fetch(dict_word, dist), dist].min
  end

  # Most similar (smallest distance) first.
  sorted_words = results.sort_by { |_, dist| dist }.map(&:first)
  create_suggestion_set(sorted_words, distances: results, original_word: word)
end
|
|
68
|
+
|
|
69
|
+
# Decide whether this strategy applies to the given context.
#
# A disabled strategy never applies; otherwise it applies when the
# dictionary lookup for the context's word fails.
#
# @param context [Context] The suggestion context
# @return [Boolean] True if the word needs correction
def handles?(context)
  if enabled?
    !dictionary_lookup(context, context.word)
  else
    false
  end
end
|
|
78
|
+
|
|
79
|
+
private
|
|
80
|
+
|
|
81
|
+
# Count the n-grams of a word.
#
# Slides a window of +n+ characters over the word and tallies each
# substring. The returned hash answers 0 for n-grams it has not seen.
#
# @param word [String] The word
# @param n [Integer] N-gram size
# @return [Hash] N-gram to count mapping
def extract_ngrams(word, n)
  window_count = word.length - n + 1

  # An empty range (word shorter than n) leaves the hash empty.
  (0...window_count).each_with_object(Hash.new(0)) do |start, counts|
    counts[word[start...(start + n)]] += 1
  end
end
|
|
96
|
+
|
|
97
|
+
# Similarity between a precomputed n-gram profile and another word.
#
# Computes a Jaccard-style coefficient over n-gram counts:
#   similarity = |intersection| / |union|
# where the intersection sums the smaller count of each shared n-gram
# and the union sums the larger count of every n-gram in either word.
#
# @param word_ngrams [Hash] N-grams for the first word
# @param other_word [String] The second word
# @param n [Integer] N-gram size
# @return [Float] Similarity score (0-1)
def ngram_similarity(word_ngrams, other_word, n)
  other_ngrams = extract_ngrams(other_word, n)

  shared = word_ngrams.sum do |gram, count|
    other_count = other_ngrams[gram]
    other_count ? [count, other_count].min : 0
  end

  combined = (word_ngrams.keys | other_ngrams.keys).sum do |gram|
    [word_ngrams[gram] || 0, other_ngrams[gram] || 0].max
  end

  # Neither word produced any n-gram.
  return 0.0 if combined.zero?

  shared.to_f / combined
end
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
end
|
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Suggestions
|
|
5
|
+
module Strategies
|
|
6
|
+
# Phonetic suggestion strategy.
|
|
7
|
+
#
|
|
8
|
+
# Generates suggestions by finding words with similar phonetic codes
|
|
9
|
+
# using algorithms like Soundex and Metaphone.
|
|
10
|
+
#
|
|
11
|
+
# @example Creating a phonetic strategy
|
|
12
|
+
# strategy = PhoneticStrategy.new(algorithm: :soundex)
|
|
13
|
+
# result = strategy.generate(context)
|
|
14
|
+
class PhoneticStrategy < BaseStrategy
|
|
15
|
+
# Phonetic algorithms this strategy knows about.
ALGORITHMS = [:soundex, :metaphone].freeze
|
|
17
|
+
|
|
18
|
+
# Build a phonetic strategy.
#
# All arguments are handed to the parent initializer unchanged.
#
# @param name [String, Symbol] Strategy name (defaults to :phonetic)
# @param config [Hash] Configuration options
# @option config [Symbol] algorithm The algorithm to use (:soundex or :metaphone)
# @option config [Integer] max_results Maximum results to return
def initialize(name: :phonetic, **config)
  # Bare `super` forwards `name:` and `**config` exactly as received.
  super
end
|
|
27
|
+
|
|
28
|
+
# Generate suggestions based on phonetic similarity.
#
# A dictionary word is suggested when it has the same phonetic code as
# the input word and is within `max_distance` edits of it.
#
# Reads the `:algorithm` (default :soundex) and `:max_distance`
# (default 2) configuration values.
#
# @param context [Context] The suggestion context
# @return [SuggestionSet] Suggestions with same phonetic code
def generate(context)
  word = context.word
  algorithm = get_config(:algorithm, :soundex)
  max_dist = get_config(:max_distance, 2)

  all_words = dictionary_words(context)
  word_code = phonetic_code(word, algorithm)
  word_length = word.to_s.length

  distances = {}
  all_words.each do |dict_word|
    next if dict_word == word

    # The edit distance is at least the length difference, so words that
    # are too long or too short can be skipped before encoding them.
    next if (dict_word.length - word_length).abs > max_dist
    next unless phonetic_code(dict_word, algorithm) == word_code

    dist = edit_distance(word, dict_word)
    next if dist > max_dist || dist.zero?

    distances[dict_word] = dist
  end

  # Closest matches first; hand the distances over as the other
  # strategies do.
  sorted_words = distances.sort_by { |_, dist| dist }.map(&:first)
  create_suggestion_set(sorted_words, distances: distances, original_word: word)
end
|
|
60
|
+
|
|
61
|
+
# Decide whether this strategy applies to the given context.
#
# A disabled strategy never applies; otherwise it applies when the
# dictionary lookup for the context's word fails.
#
# @param context [Context] The suggestion context
# @return [Boolean] True if the word needs correction
def handles?(context)
  if enabled?
    !dictionary_lookup(context, context.word)
  else
    false
  end
end
|
|
70
|
+
|
|
71
|
+
private
|
|
72
|
+
|
|
73
|
+
# Encode a word with the selected phonetic algorithm.
#
# Only :metaphone selects Metaphone; :soundex and any other value use
# Soundex.
#
# @param word [String] The word
# @param algorithm [Symbol] The algorithm to use
# @return [String] The phonetic code
def phonetic_code(word, algorithm = :soundex)
  return metaphone_code(word) if algorithm == :metaphone

  soundex_code(word)
end
|
|
88
|
+
|
|
89
|
+
# Calculate Soundex code for a word.
#
# Soundex is a phonetic algorithm developed by Robert C. Russell
# and Margaret King Odell in the early 1900s.
#
# Characters outside A-Z are dropped before encoding. Adjacent letters
# with the same digit are coded once; H and W do not break such a run,
# while a vowel (or Y) between them does.
#
# @param word [String] The word
# @return [String] The Soundex code (letter + 3 digits), or "" when the
#   word is nil or contains no A-Z letters
#
# @example
#   soundex_code("Robert")   # => "R163"
#   soundex_code("Rupert")   # => "R163"
#   soundex_code("Ashcraft") # => "A261"
#   soundex_code("Tymczak")  # => "T522"
def soundex_code(word)
  return "" if word.nil? || word.empty?

  letters = word.upcase.gsub(/[^A-Z]/, "")
  return "" if letters.empty?

  # The first letter is kept verbatim, but its digit still suppresses an
  # immediately following letter with the same code (e.g. "Pf" in Pfister).
  code = letters[0]
  previous = soundex_encode(code)

  letters[1..].each_char do |char|
    break if code.length == 4

    digit = soundex_encode(char)

    if digit == "0"
      # H and W are transparent; any other uncoded letter (a vowel or Y)
      # separates equal digits so the second one is coded again.
      previous = "0" unless char == "H" || char == "W"
      next
    end

    code << digit unless digit == previous
    previous = digit
  end

  # Pad with zeros if needed
  code.ljust(4, "0")
end

# Soundex encoding table.
#
# @param char [String] The character (either case)
# @return [String] The encoded digit or "0" for no code
def soundex_encode(char)
  case char.upcase
  when "B", "P", "F", "V"
    "1"
  when "C", "S", "K", "G", "J", "Q", "X", "Z"
    "2"
  when "D", "T"
    "3"
  when "L"
    "4"
  when "M", "N"
    "5"
  when "R"
    "6"
  else
    "0"
  end
end
|
|
154
|
+
|
|
155
|
+
# Calculate Metaphone code for a word.
#
# Metaphone is an improved phonetic algorithm developed by
# Lawrence Philips in 1990. This is a simplified rule set: characters
# outside A-Z are dropped, repeated letters are not collapsed, and the
# code is cut off at four characters (shorter codes are not padded).
#
# @param word [String] The word
# @return [String] The Metaphone code, or "" when the word is nil or
#   contains no A-Z letters
#
# @example
#   metaphone_code("Smith")   # => "SM0"
#   metaphone_code("Schmidt") # => "SKMT"
#   metaphone_code("Knight")  # => "NT"
def metaphone_code(word)
  return "" if word.nil? || word.empty?

  letters = word.upcase.gsub(/[^A-Z]/, "")
  return "" if letters.empty?

  code = +""
  i = 0

  while i < letters.length && code.length < 4
    char = letters[i]
    following = letters[i + 1] # nil past the end of the word
    after = letters[i + 2]     # nil past the end of the word
    skip = 0                   # extra letters consumed by a multi-letter rule

    case char
    when "A", "E", "I", "O", "U", "H", "W", "Y"
      # Vowels, H and the semi-vowels are only encoded at the beginning.
      code << char if i.zero?
    when "B", "F", "J", "L", "M", "N", "R"
      code << char
    when "C"
      if following == "H" && %w[A E I O U].include?(after)
        # "CH" followed by vowel => "X"
        code << "X"
        skip = 1
      elsif following == "I" && after == "A"
        # "CIA" => "X"
        code << "X"
        skip = 2
      elsif %w[S G].include?(following)
        # "CS", "CG" => "X"
        code << "X"
        skip = 1
      else
        code << "K"
      end
    when "D"
      if following == "G" && %w[I E Y].include?(after)
        # "DG" followed by I, E, Y => "J"
        code << "J"
        skip = 1
      else
        code << "T"
      end
    when "G"
      if following == "H"
        # "GH" => "K" at the beginning, silent elsewhere
        if i.zero?
          code << "K"
          skip = 1
        end
      elsif following != "N"
        code << "K"
      end
      # "GN": the G is silent; the N is encoded on the next pass.
    when "K"
      # Initial "KN" => "N": the K is silent and the N is encoded on the
      # next pass.
      code << "K" unless i.zero? && following == "N"
    when "P"
      if following == "H"
        # "PH" => "F"
        code << "F"
        skip = 1
      else
        code << "P"
      end
    when "Q"
      code << "K"
    when "S"
      if following == "H"
        # "SH" => "X"
        code << "X"
        skip = 1
      elsif following == "I" && %w[O A].include?(after)
        # "SIO" or "SIA" => "X"
        code << "X"
        skip = 2
      else
        code << "S"
      end
    when "T"
      if following == "I" && %w[O A].include?(after)
        # "TIO" or "TIA" => "X"
        code << "X"
        skip = 2
      elsif following == "H"
        # "TH" => "0"
        code << "0"
        skip = 1
      else
        code << "T"
      end
    when "V"
      code << "F"
    when "X"
      code << "KS"
    when "Z"
      code << "S"
    end

    i += 1 + skip
  end

  code[0...4] # "KS" can push the code past four characters
end
|
|
295
|
+
|
|
296
|
+
# Levenshtein edit distance between two strings.
#
# Works on a single row of the distance table, sized by the shorter
# string and updated in place for each character of the longer one.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Integer] Edit distance
def edit_distance(str1, str2)
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  # Keep the row as short as possible.
  shorter, longer = str1.length > str2.length ? [str2, str1] : [str1, str2]
  shorter_chars = shorter.chars

  row = (0..shorter_chars.length).to_a

  longer.each_char.with_index(1) do |long_char, row_number|
    diagonal = row[0] # value up-and-left of the cell being filled
    row[0] = row_number

    shorter_chars.each_with_index do |short_char, col|
      above = row[col + 1] # still holds the previous row's value

      row[col + 1] = [
        row[col] + 1,                                # insertion
        above + 1,                                   # deletion
        diagonal + (short_char == long_char ? 0 : 1) # substitution
      ].min

      diagonal = above
    end
  end

  row.last
end
|
|
326
|
+
end
|
|
327
|
+
end
|
|
328
|
+
end
|
|
329
|
+
end
|