kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'base_strategy'
|
|
4
|
+
require_relative '../suggestion'
|
|
5
|
+
require_relative '../suggestion_set'
|
|
6
|
+
require_relative '../../embeddings'
|
|
7
|
+
|
|
8
|
+
module Kotoshu
|
|
9
|
+
module Suggestions
|
|
10
|
+
module Strategies
|
|
11
|
+
# Semantic strategy using FastText ONNX embeddings.
|
|
12
|
+
#
|
|
13
|
+
# Provides embedding-based spell correction for:
|
|
14
|
+
# - Typos: Re-ranks edit-distance candidates by semantic similarity
|
|
15
|
+
# - Real-word errors: Detects when valid words are used incorrectly in context
|
|
16
|
+
#
|
|
17
|
+
# This strategy works alongside other strategies (EditDistance, Phonetic, etc.)
|
|
18
|
+
# to provide comprehensive spell checking with semantic awareness.
|
|
19
|
+
#
|
|
20
|
+
# @example Basic usage
|
|
21
|
+
# strategy = SemanticStrategy.new(language_code: 'en')
|
|
22
|
+
# suggestions = strategy.generate(context)
|
|
23
|
+
#
|
|
24
|
+
# @example With preloaded embeddings (faster)
|
|
25
|
+
# strategy = SemanticStrategy.new(
|
|
26
|
+
# language_code: 'en',
|
|
27
|
+
# preload_embeddings: true
|
|
28
|
+
# )
|
|
29
|
+
# suggestions = strategy.generate(context)
|
|
30
|
+
class SemanticStrategy < BaseStrategy
|
|
31
|
+
# @return [String] Language code (ISO 639-1)
|
|
32
|
+
attr_reader :language_code
|
|
33
|
+
|
|
34
|
+
# @return [Embeddings::Vocabulary] The vocabulary
|
|
35
|
+
attr_reader :vocabulary
|
|
36
|
+
|
|
37
|
+
# @return [Embeddings::OnnxRuntimeModel] The ONNX model
|
|
38
|
+
attr_reader :model
|
|
39
|
+
|
|
40
|
+
# @return [Embeddings::SimilaritySearch] The similarity search
|
|
41
|
+
attr_reader :search
|
|
42
|
+
|
|
43
|
+
# Build a semantic strategy for a single language.
#
# Registers the strategy with the base class under the name +:semantic+,
# stores the tuning values and then attempts to attach the embedding
# search for +language_code+.
#
# @param language_code [String] ISO 639-1 language code
# @param cache [Cache::ModelCache, nil] Optional cache handed to the embedding loader
# @param preload_embeddings [Boolean] Forwarded to the embedding loader as +preload+
# @param max_context_window [Integer] Words to consider for context
# @param min_semantic_similarity [Float] Similarity floor used by nearest-neighbour lookups
# @param semantic_boost_weight [Float] Weight for semantic similarity in re-ranking
# @param config [Hash] Remaining options, forwarded untouched to the base class
def initialize(language_code:, cache: nil, preload_embeddings: false,
               max_context_window: 5, min_semantic_similarity: 0.5,
               semantic_boost_weight: 0.3, **config)
  super(name: :semantic, **config)

  # Tuning values supplied by the caller.
  @language_code = language_code
  @max_context_window = max_context_window
  @min_semantic_similarity = min_semantic_similarity
  @semantic_boost_weight = semantic_boost_weight

  # Runs last because the loader reads @language_code.
  initialize_embeddings(cache, preload_embeddings)
end
|
|
64
|
+
|
|
65
|
+
# Generate suggestions using semantic similarity.
#
# Dispatches on vocabulary membership:
# 1. Word not in vocabulary (typo): delegates to +generate_for_typo+
# 2. Word in vocabulary (possible real-word error): delegates to
#    +generate_for_real_word_error+
#
# Returns an empty set when no similarity search was loaded.
#
# @param context [Context] The suggestion context
# @return [SuggestionSet] Generated suggestions
def generate(context)
  # Without a loaded similarity search there is nothing to rank against.
  return SuggestionSet.empty unless @search

  # The result limit is resolved inside the two helpers, so no local copy
  # is kept here.
  if @vocabulary.include?(context.word)
    generate_for_real_word_error(context)
  else
    generate_for_typo(context)
  end
end
|
|
89
|
+
|
|
90
|
+
# Tell whether this strategy can take part in suggestion generation.
#
# The answer does not depend on the word: any context is accepted as long
# as the strategy is enabled and both the similarity search and its
# vocabulary are present. Per-word filtering happens in +generate+.
#
# @param context [Context] The suggestion context (not inspected)
# @return [Boolean] True if the strategy should handle this context
def handles?(context)
  ready = enabled? && @search && @vocabulary
  ready ? true : false
end
|
|
105
|
+
|
|
106
|
+
# Look up the embedding vector of a word.
#
# The lookup is dispatched with +send+, which also reaches non-public
# methods. NOTE(review): presumably +get_embedding+ is private on the
# search object — confirm, and prefer a public accessor if one exists.
#
# @param word [String] The word
# @return [Array<Float>, nil] Embedding vector, or nil when no search is loaded
def embedding_for(word)
  @search ? @search.send(:get_embedding, word) : nil
end
|
|
115
|
+
|
|
116
|
+
# Similarity between two words as reported by the similarity search.
#
# @param word1 [String] First word
# @param word2 [String] Second word
# @return [Float, nil] Whatever the search returns for the pair, or nil
#   when no search is loaded
def semantic_similarity(word1, word2)
  @search ? @search.similarity(word1, word2) : nil
end
|
|
126
|
+
|
|
127
|
+
# Nearest neighbours of a word in embedding space.
#
# The query is issued with +exclude_self: false+, so the search is not
# asked to drop the query word itself from the results.
#
# @param word [String] The query word
# @param k [Integer] Number of neighbors
# @return [Array<Hash>] Array of {word, similarity} hashes; empty when no
#   search is loaded
def find_similar_words(word, k: 10)
  @search ? @search.find_nearest(word, k: k, exclude_self: false) : []
end
|
|
137
|
+
|
|
138
|
+
# String representation.
#
# Reports the language, the vocabulary size (0 when no vocabulary is
# attached) and whether a similarity search is loaded.
#
# @return [String] String representation
def to_s
  # Coerce to a real boolean: interpolating nil would render an empty
  # string ("loaded: ") rather than "loaded: false".
  loaded = @search ? true : false
  "SemanticStrategy(language: #{@language_code}, vocab_size: #{@vocabulary&.size || 0}, loaded: #{loaded})"
end
|
|
144
|
+
alias inspect to_s
|
|
145
|
+
|
|
146
|
+
private
|
|
147
|
+
|
|
148
|
+
# Attach the embedding search for the configured language.
#
# Asks +Embeddings::SimilaritySearch.from_cache+ for a search object. On
# success the vocabulary and model are read from it; otherwise all three
# references are left nil and, in verbose mode only, a warning is printed.
#
# @param cache [Cache::ModelCache, nil] Cache instance
# @param preload [Boolean] Whether to preload embeddings
def initialize_embeddings(cache, preload)
  @search = Embeddings::SimilaritySearch.from_cache(@language_code, cache: cache, preload: preload)

  unless @search
    # Nothing could be loaded: the strategy stays inert.
    @vocabulary = nil
    @model = nil
    warn "Warning: Could not load ONNX model for language '#{@language_code}'. Semantic strategy will be disabled." if $VERBOSE
    return
  end

  # The search owns both collaborators; expose them through the readers.
  @vocabulary = @search.vocabulary
  @model = @search.model
end
|
|
171
|
+
|
|
172
|
+
# Generate suggestions for a typo (word not in vocabulary).
#
# Queries the similarity search for neighbours of the word and turns each
# neighbour into a suggestion whose confidence and pseudo-distance are
# derived from its similarity.
#
# @param context [Context] The suggestion context
# @return [SuggestionSet] Suggestions, capped at the resolved result limit
def generate_for_typo(context)
  # The local must not be called `max_results`: `x = a || x` reads the
  # just-declared (nil) local on the right-hand side, never the method.
  # NOTE(review): `max_results` is presumably the strategy-level default
  # provided by the base class — confirm.
  limit = context.max_results || max_results

  neighbors = @search.find_nearest(
    context.word,
    k: limit * 2, # Over-fetch so the set can still be filled after filtering
    exclude_self: true,
    min_similarity: @min_semantic_similarity
  )
  return SuggestionSet.empty if neighbors.empty?

  suggestions = neighbors.map do |neighbor|
    similarity = neighbor[:similarity]

    create_suggestion(
      neighbor[:word],
      distance: similarity_to_distance(similarity), # high similarity = low distance
      confidence: normalize_similarity(similarity),
      semantic_similarity: similarity
    )
  end

  SuggestionSet.new(suggestions, max_size: limit)
end
|
|
214
|
+
|
|
215
|
+
# Generate suggestions for a real-word error.
#
# Fetches neighbours of the word, scores each one as
# 0.7 * semantic similarity + 0.3 * context fit, and returns them ordered
# by that combined score, best first.
#
# @param context [Context] The suggestion context
# @return [SuggestionSet] Alternative suggestions, capped at the resolved limit
def generate_for_real_word_error(context)
  # The local must not be called `max_results`: `x = a || x` reads the
  # just-declared (nil) local on the right-hand side, never the method.
  # NOTE(review): `max_results` is presumably the strategy-level default
  # provided by the base class — confirm.
  limit = context.max_results || max_results
  context_words = get_context_words(context)

  neighbors = @search.find_nearest(
    context.word,
    k: limit * 3,
    exclude_self: true,
    min_similarity: @min_semantic_similarity
  )
  return SuggestionSet.empty if neighbors.empty?

  scored = neighbors.map do |neighbor|
    candidate_word = neighbor[:word]
    similarity = neighbor[:similarity]
    context_score = compute_context_fit(candidate_word, context_words)
    combined_score = (similarity * 0.7) + (context_score * 0.3)

    suggestion = create_suggestion(
      candidate_word,
      distance: similarity_to_distance(combined_score),
      confidence: normalize_similarity(combined_score),
      semantic_similarity: similarity,
      context_score: context_score
    )
    [combined_score, suggestion]
  end

  # Rank on the combined score itself (not on context fit alone). The
  # original position breaks ties because sort_by is not stable.
  ranked = scored.each_with_index
                 .sort_by { |(score, _suggestion), position| [-score, position] }
                 .map { |(_score, suggestion), _position| suggestion }

  SuggestionSet.new(ranked, max_size: limit)
end
|
|
264
|
+
|
|
265
|
+
# Collect the words surrounding the checked word.
#
# Placeholder: the context is not inspected yet and an empty list is
# always returned, so callers fall back to their neutral context score.
#
# @param _context [Context] The suggestion context (currently unused)
# @return [Array<String>] Context words (always empty for now)
def get_context_words(_context)
  []
end
|
|
274
|
+
|
|
275
|
+
# Score how well a candidate fits among the given context words.
#
# The score is the mean of the candidate's similarity to every context
# word for which the search returns a value. When there is no context, or
# no pair yields a similarity, the neutral score 0.5 is returned.
#
# @param candidate [String] Candidate word
# @param context_words [Array<String>] Context words
# @return [Float] Context fit score
def compute_context_fit(candidate, context_words)
  neutral = 0.5
  return neutral if context_words.empty?

  scores = context_words.map { |neighbor| @search.similarity(candidate, neighbor) }
  scores.compact! # drop pairs the search could not score

  scores.empty? ? neutral : scores.sum / scores.size
end
|
|
292
|
+
|
|
293
|
+
# Turn a cosine similarity into a confidence value.
#
# Linearly rescales [-1, 1] onto [0, 1] and clamps anything outside that
# range to the nearest bound.
#
# @param similarity [Float] Cosine similarity (-1.0 to 1.0)
# @return [Float] Normalized confidence (0.0 to 1.0)
def normalize_similarity(similarity)
  rescaled = (similarity + 1) / 2.0
  rescaled.clamp(0.0, 1.0)
end
|
|
301
|
+
|
|
302
|
+
# Express a similarity as an integer pseudo-distance for ranking.
#
# Computes (1 - similarity) * 2, truncates toward zero and clamps the
# result to 0..5. Examples: 1.0 -> 0, 0.5 -> 1, 0.0 -> 2, -1.0 -> 4.
#
# @param similarity [Float] Cosine similarity (-1.0 to 1.0)
# @return [Integer] Pseudo-distance (lower = better)
def similarity_to_distance(similarity)
  dissimilarity = (1.0 - similarity) * 2
  dissimilarity.to_i.clamp(0, 5)
end
|
|
313
|
+
end
|
|
314
|
+
end
|
|
315
|
+
end
|
|
316
|
+
end
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Suggestions
|
|
5
|
+
module Strategies
|
|
6
|
+
# SymSpell suggestion strategy.
|
|
7
|
+
#
|
|
8
|
+
# Uses deletion distance algorithm for fast approximate string matching.
|
|
9
|
+
# Pre-computes deletion variants for all dictionary words, enabling O(1)
|
|
10
|
+
# lookup for common misspellings.
|
|
11
|
+
#
|
|
12
|
+
# This is 10-100x faster than EditDistanceStrategy for large dictionaries.
|
|
13
|
+
#
|
|
14
|
+
# The algorithm works by:
|
|
15
|
+
# 1. Pre-computing single deletion variants for each dictionary word
|
|
16
|
+
# 2. Looking up input word's deletion variants in the pre-computed map
|
|
17
|
+
# 3. Distance is inferred from the deletion level
|
|
18
|
+
#
|
|
19
|
+
# @see https://github.com/wolfgarbe/SymSpell Original SymSpell implementation
|
|
20
|
+
class SymSpellStrategy < BaseStrategy
|
|
21
|
+
# Maximum deletion distance to consider
|
|
22
|
+
DEFAULT_MAX_DELETION_DISTANCE = 2
|
|
23
|
+
# Maximum dictionary words to process (increased for better coverage)
|
|
24
|
+
DEFAULT_MAX_DICTIONARY_SIZE = 500_000
|
|
25
|
+
# Enable transposition handling (slower pre-computation, better accuracy)
|
|
26
|
+
DEFAULT_HANDLE_TRANSPOSITIONS = true
|
|
27
|
+
|
|
28
|
+
# Set up a SymSpell strategy and build its lookup index immediately.
#
# @param dictionary [Object] Dictionary to use for suggestions
# @param name [String, Symbol] Strategy name
# @param config [Hash] Configuration options (also forwarded to the base class)
# @option config [Integer] max_deletion_distance Maximum deletion distance (default: 2)
# @option config [Integer] max_results Maximum results to return (default: 10)
# @option config [Integer] max_dictionary_size Maximum words to process (default: 500_000)
# @option config [Boolean] handle_transpositions Generate transposition variants (default: true)
def initialize(dictionary:, name: :symspell, **config)
  super(name: name, **config)

  @dictionary = dictionary

  # Explicit options win; otherwise fall back to the class defaults.
  @max_deletion_distance = config.fetch(:max_deletion_distance) { DEFAULT_MAX_DELETION_DISTANCE }
  @max_dictionary_size = config.fetch(:max_dictionary_size) { DEFAULT_MAX_DICTIONARY_SIZE }
  @handle_transpositions = config.fetch(:handle_transpositions) { DEFAULT_HANDLE_TRANSPOSITIONS }

  # variant -> [dictionary words that produce it]; buckets are created on demand.
  @deletes = Hash.new { |index, variant| index[variant] = [] }
  @words = Set.new

  precompute!
end
|
|
47
|
+
|
|
48
|
+
# Generate suggestions using deletion distance.
#
# Matching is case-insensitive (the input is downcased). A word that is
# already in the index yields an empty set. Otherwise candidates are
# collected, each with the smallest distance at which it was reached:
# * distance 1: the input itself is an indexed variant of a dictionary word
# * distance 2: an adjacent transposition of the input is an indexed variant
# * distance d: a d-fold deletion of the input is a dictionary word
# * distance d + 1: a d-fold deletion of the input is an indexed variant
#
# The index is only read here; lookups never add buckets to it.
#
# @param context [Context] The suggestion context
# @return [SuggestionSet] Generated suggestions
def generate(context)
  word_lower = context.word.downcase
  max_dist = get_config(:max_deletion_distance, @max_deletion_distance)

  return SuggestionSet.empty if @words.include?(word_lower)

  # @deletes creates a bucket on every `[]` miss, so reading it with `[]`
  # would grow the index on each lookup. Hash#fetch bypasses the default.
  none = [].freeze
  candidates = {}
  checked = Set.new([word_lower])

  @deletes.fetch(word_lower, none).each { |dict_word| candidates[dict_word] ||= 1 }

  if @handle_transpositions
    generate_transpositions(word_lower).each do |transposed|
      # One swap on the input plus one edit on the dictionary side.
      @deletes.fetch(transposed, none).each { |dict_word| candidates[dict_word] ||= 2 }
    end
  end

  max_dist.times do |dist|
    generate_deletions_from_set(checked).each do |variant|
      next unless checked.add?(variant) # nil when the variant was already processed

      # The shortened input is itself a dictionary word.
      candidates[variant] = dist + 1 if @words.include?(variant)

      # Input and dictionary word both reduce to the same variant.
      @deletes.fetch(variant, none).each { |dict_word| candidates[dict_word] ||= dist + 2 }
    end
  end

  sorted_words = candidates.sort_by { |_, distance| distance }.map(&:first)
  create_suggestion_set(sorted_words, distances: candidates, original_word: context.word)
end
|
|
103
|
+
|
|
104
|
+
# Pre-compute deletion variants for all dictionary words.
#
# Called during initialization to build the index. For each of the first
# +@max_dictionary_size+ dictionary entries the downcased word is recorded
# in +@words+, and the word is filed in +@deletes+ under each of its
# single-deletion variants (and adjacent transpositions, when enabled).
#
# Each word is filed at most once per variant: words already indexed are
# skipped and repeated variants are collapsed, so calling this method
# again does not duplicate entries.
def precompute!
  dictionary_words(@dictionary).first(@max_dictionary_size).each do |word|
    next if word.nil? || word.empty?

    word_lower = word.downcase
    # Set#add? returns nil when the word is already indexed (for example
    # "Apple" after "apple", or a second precompute! run).
    next unless @words.add?(word_lower)

    # Only single deletions are stored; deeper deletions are generated from
    # the input at lookup time.
    variants = generate_single_deletions(word_lower)
    variants += generate_transpositions(word_lower) if @handle_transpositions

    # Repeated letters yield the same deletion twice ("apple" -> "aple").
    variants.uniq.each { |variant| @deletes[variant] << word_lower }
  end
end
|
|
130
|
+
|
|
131
|
+
# List every distinct word obtained by swapping two neighbouring characters.
#
# Swaps of two equal characters are left out because they reproduce the
# input. For example, "world" → ["owrld", "wrold", "wolrd", "wordl"].
#
# @param word [String] The word
# @return [Array<String>] Array of variants with adjacent characters swapped
def generate_transpositions(word)
  letters = word.chars

  (0...letters.length - 1).each_with_object([]) do |left, swaps|
    right = left + 1
    next if letters[left] == letters[right] # swap would not change the word

    swapped = letters.dup
    swapped[left], swapped[right] = swapped[right], swapped[left]
    swaps << swapped.join
  end
end
|
|
148
|
+
|
|
149
|
+
# Number of character deletions needed to make two words equal.
#
# When one word is a subsequence of the other the answer is simply the
# length difference; otherwise it is derived from the longest common
# subsequence: len(str1) + len(str2) - 2 * LCS.
#
# @param str1 [String] First word
# @param str2 [String] Second word
# @return [Integer] Deletion distance
def deletion_distance(str1, str2)
  return str2.length if str1.empty?
  return str1.length if str2.empty?
  return 0 if str1 == str2

  # A subsequence is never longer than its host, so the absolute length
  # gap is the right answer in either direction.
  if is_subsequence?(str1, str2) || is_subsequence?(str2, str1)
    return (str1.length - str2.length).abs
  end

  shared = longest_common_subsequence_length(str1, str2)
  str1.length + str2.length - 2 * shared
end
|
|
175
|
+
|
|
176
|
+
private
|
|
177
|
+
|
|
178
|
+
# List the strings obtained by removing exactly one character.
#
# One entry is produced per character position, in order, so repeated
# letters give repeated entries. The empty string is never included, which
# means a one-character word has no variants.
#
# @param word [String] The word
# @return [Array<String>] Array of variants with one character deleted
def generate_single_deletions(word)
  (0...word.length).each_with_object([]) do |position, shorter|
    trimmed = word.dup
    trimmed.slice!(position) # drop the character at this position
    shorter << trimmed unless trimmed.empty?
  end
end
|
|
190
|
+
|
|
191
|
+
# Union of the single-deletion variants of every word in a collection.
#
# @param words_set [Set<String>] Set of words to process
# @return [Set<String>] New set with all single deletions
def generate_deletions_from_set(words_set)
  words_set.each_with_object(Set.new) do |entry, collected|
    collected.merge(generate_single_deletions(entry))
  end
end
|
|
204
|
+
|
|
205
|
+
# Tell whether the characters of str1 occur in str2 in the same order
# (not necessarily next to each other).
#
# @param str1 [String] Potential subsequence
# @param str2 [String] String to check against
# @return [Boolean] True if str1 is subsequence of str2
def is_subsequence?(str1, str2)
  return true if str1.empty?
  return false if str1.length > str2.length

  # Greedy scan: each character must be found at or after the position
  # just past the previous match.
  cursor = 0
  str1.each_char.all? do |wanted|
    found = str2.index(wanted, cursor)
    found && (cursor = found + 1)
  end
end
|
|
221
|
+
|
|
222
|
+
# Length of the longest common subsequence of two strings.
#
# Dynamic programming over a single row sized by the shorter string; the
# row is updated in place while the previous row's diagonal cell is
# carried in a local.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Integer] LCS length
def longest_common_subsequence_length(str1, str2)
  return 0 if str1.empty? || str2.empty?

  shorter, longer = str1.length <= str2.length ? [str1, str2] : [str2, str1]
  row = Array.new(shorter.length + 1, 0)

  longer.each_char do |outer|
    diagonal = 0 # previous row, previous column
    shorter.each_char.with_index(1) do |inner, column|
      above = row[column] # previous row, same column
      row[column] = inner == outer ? diagonal + 1 : [row[column - 1], above].max
      diagonal = above
    end
  end

  row.last
end
|
|
254
|
+
|
|
255
|
+
# Extract the word list from whatever kind of dictionary was supplied.
#
# Checked in this order: an object responding to +words+, an Array (used
# as is), a Hash (its keys), an object responding to +all_words+. Anything
# else yields an empty list.
#
# @param dictionary [Object] Dictionary object
# @return [Array<String>] All words
def dictionary_words(dictionary)
  return dictionary.words if dictionary.respond_to?(:words)

  case dictionary
  when Array then dictionary
  when Hash then dictionary.keys
  else
    dictionary.respond_to?(:all_words) ? dictionary.all_words : []
  end
end
|
|
272
|
+
end
|
|
273
|
+
end
|
|
274
|
+
end
|
|
275
|
+
end
|