kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,671 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require_relative "../suggestion"
|
|
5
|
+
require_relative "../suggestion_set"
|
|
6
|
+
require_relative "base_strategy"
|
|
7
|
+
require_relative "../../data/common_words_loader"
|
|
8
|
+
|
|
9
|
+
module Kotoshu
|
|
10
|
+
module Suggestions
|
|
11
|
+
module Strategies
|
|
12
|
+
# Edit distance suggestion strategy with enhanced ranking.
|
|
13
|
+
# Generates suggestions by finding words with small edit distance,
|
|
14
|
+
# ranked by word frequency, keyboard proximity, and common typo patterns.
|
|
15
|
+
#
|
|
16
|
+
# Multi-language support:
|
|
17
|
+
# - Automatically selects keyboard layout based on language_code
|
|
18
|
+
# - Loads frequency data from YAML files (Phase 1) or GitHub (Phase 2)
|
|
19
|
+
# - Supports language-specific typo patterns
|
|
20
|
+
#
|
|
21
|
+
# This is MORE OOP than Spylls which uses standalone functions
|
|
22
|
+
# for edit distance operations.
|
|
23
|
+
#
|
|
24
|
+
# Follows Open-Closed Principle: Extend by adding YAML files,
|
|
25
|
+
# NOT by modifying this class.
|
|
26
|
+
class EditDistanceStrategy < BaseStrategy
|
|
27
|
+
attr_reader :language_code, :keyboard_layout
|
|
28
|
+
|
|
29
|
+
# @param name [String, Symbol] Name of the strategy
|
|
30
|
+
# @param config [Hash] Configuration options
|
|
31
|
+
# @option config [String] :language_code Language code for keyboard layout (default: 'en')
|
|
32
|
+
# @option config [Keyboard::Layout] :keyboard_layout Custom keyboard layout (optional)
|
|
33
|
+
# @option config [Hash] :frequency_tiers Custom frequency tiers (optional)
|
|
34
|
+
# @option config [Integer] :max_distance Maximum edit distance (default: 2)
|
|
35
|
+
# @option config [Integer] :max_results Maximum results to return (default: 10)
|
|
36
|
+
def initialize(name: :edit_distance, language_code: 'en', keyboard_layout: nil,
               frequency_tiers: nil, **config)
  super(name: name, **config)
  @language_code = language_code

  # Resolved after @language_code is assigned: the resolver consults
  # @language_code when no explicit layout is supplied.
  @keyboard_layout = resolve_keyboard_layout(keyboard_layout)

  unless frequency_tiers
    # No caller-supplied tiers: load_frequency_data assigns @frequency_tiers itself.
    load_frequency_data(language_code)
    return
  end

  # Caller-supplied tiers are used as-is.
  @frequency_tiers = frequency_tiers
  @common_words = Set.new
end
|
|
54
|
+
|
|
55
|
+
# Public method to get current keyboard being used
|
|
56
|
+
#
|
|
57
|
+
# @return [Keyboard::Layout] The keyboard layout instance
|
|
58
|
+
def keyboard
  # Plain accessor: hands back the layout object stored at construction time.
  layout = @keyboard_layout
  layout
end
|
|
61
|
+
|
|
62
|
+
# Public method to get keyboard name
|
|
63
|
+
#
|
|
64
|
+
# @return [String] Keyboard layout name
|
|
65
|
+
def keyboard_name
  # Delegates to the layout object; whatever its #name returns is passed through.
  layout = @keyboard_layout
  layout.name
end
|
|
68
|
+
|
|
69
|
+
# Check if a substitution is a keyboard-adjacent typo
|
|
70
|
+
#
|
|
71
|
+
# @param char1 [String] First character
|
|
72
|
+
# @param char2 [String] Second character
|
|
73
|
+
# @return [Boolean] True if keys are adjacent
|
|
74
|
+
def adjacent_key_typo?(char1, char2)
  # The lookup is one-directional: char2 must appear among char1's neighbours.
  neighbours = @keyboard_layout.adjacent_keys(char1)
  neighbours.include?(char2)
end
|
|
77
|
+
|
|
78
|
+
# Get adjacent keys for a given key
|
|
79
|
+
#
|
|
80
|
+
# @param key [String] The key to find adjacent keys for
|
|
81
|
+
# @return [Array<String>] List of adjacent key characters
|
|
82
|
+
def adjacent_keys(key)
  # Thin delegation to the active keyboard layout.
  layout = @keyboard_layout
  layout.adjacent_keys(key)
end
|
|
85
|
+
|
|
86
|
+
# Get frequency bonus for a word
|
|
87
|
+
#
|
|
88
|
+
# @param word [String] The word to check
|
|
89
|
+
# @return [Integer] Frequency bonus (0-200)
|
|
90
|
+
def frequency_bonus(word)
  tiers = @frequency_tiers
  return 0 unless tiers

  needle = word.downcase

  # Tiers are checked from most to least frequent, so the first hit yields
  # the largest applicable bonus. A missing tier is tolerated via &.
  [[:top_50, 200], [:top_200, 100], [:top_1000, 50]].each do |tier, bonus|
    return bonus if tiers[tier]&.include?(needle)
  end

  # Not listed in any tier: no bonus.
  0
end
|
|
107
|
+
|
|
108
|
+
# Generate suggestions based on enhanced edit distance scoring.
|
|
109
|
+
#
|
|
110
|
+
# Scoring factors:
|
|
111
|
+
# - Edit distance (primary factor)
|
|
112
|
+
# - Word frequency (common words rank higher)
|
|
113
|
+
# - Keyboard proximity (adjacent key typos rank higher)
|
|
114
|
+
# - Common typo patterns (missing double letters, etc.)
|
|
115
|
+
#
|
|
116
|
+
# @param context [Context] The suggestion context
|
|
117
|
+
# @return [SuggestionSet] Suggestions within max_distance
|
|
118
|
+
def generate(context)
  word = context.word
  max_dist = get_config(:max_distance, 2)
  min_confidence = get_config(:min_confidence, 0.75)
  min_similarity = get_config(:min_jaro_similarity, 0.70)
  min_results = get_config(:min_results, 3)

  # Score every dictionary word within the allowed edit distance as a
  # [candidate, distance, enhanced_score] triple.
  scored = dictionary_words(context).each_with_object([]) do |dict_word, acc|
    next if dict_word == word

    dist = edit_distance(word, dict_word)
    next if dist > max_dist || dist <= 0

    acc << [dict_word, dist, calculate_enhanced_score(word, dict_word, dist)]
  end

  # Best (lowest) enhanced score first.
  ranked = scored.sort_by { |entry| entry[2] }
  return SuggestionSet.empty if ranked.empty?

  min_score, max_score = ranked.map { |entry| entry[2].to_f }.minmax
  score_range = (max_score - min_score).abs

  # NOTE(review): max_results and calculate_ngram_similarity are not defined
  # in this part of the file - presumably provided by BaseStrategy; confirm.
  suggestions = []
  ranked.each do |candidate, dist, score|
    # Min-max normalise the score and invert it so the best candidate gets
    # confidence 1.0; when all scores are equal every candidate gets 1.0.
    confidence = if score_range > 0
                   1.0 - ((score.to_f - min_score) / score_range)
                 else
                   1.0
                 end

    # NOTE(review): the config key and Suggestion comment call this
    # Jaro-Winkler, but the value comes from calculate_ngram_similarity.
    similarity = calculate_ngram_similarity(word, candidate)

    # Weak candidates are dropped only once min_results suggestions exist,
    # so the first few ranked candidates are kept regardless of quality.
    weak = confidence < min_confidence || similarity < min_similarity
    next if weak && suggestions.size >= min_results

    suggestions << Suggestion.new(
      word: candidate,
      distance: dist,
      confidence: confidence,
      source: @name,
      original_length: word.length,
      ngram_score: similarity,
      enhanced_score: score
    )

    break if suggestions.size >= max_results
  end

  SuggestionSet.new(suggestions, max_size: max_results)
end
|
|
189
|
+
|
|
190
|
+
# Check if this strategy should handle the context.
|
|
191
|
+
#
|
|
192
|
+
# @param context [Context] The suggestion context
|
|
193
|
+
# @return [Boolean] True if the word needs correction
|
|
194
|
+
def handles?(context)
  if enabled?
    # Only words the dictionary does not recognise need suggestions.
    known = dictionary_lookup(context, context.word)
    !known
  else
    false
  end
end
|
|
200
|
+
|
|
201
|
+
private
|
|
202
|
+
|
|
203
|
+
# Get all words from the dictionary.
|
|
204
|
+
#
|
|
205
|
+
# @param context [Context] The suggestion context
|
|
206
|
+
# @return [Array<String>] All dictionary words
|
|
207
|
+
def dictionary_words(context)
  source = context.dictionary

  # IndexedDictionary is only consulted when the Core module has been loaded.
  indexed = defined?(::Kotoshu::Core::IndexedDictionary) &&
            source.is_a?(::Kotoshu::Core::IndexedDictionary)
  return source.all_words if indexed

  # Dictionary backends exposing #words take precedence over plain containers.
  return source.words if source.respond_to?(:words)

  case source
  when Hash then source.keys
  when Set then source.to_a
  when Array then source
  else
    # Last resort: coerce to an array and flatten one level via #to_a.
    Array(source).flat_map(&:to_a)
  end
end
|
|
226
|
+
|
|
227
|
+
# Check if a word exists in the dictionary.
|
|
228
|
+
#
|
|
229
|
+
# @param context [Context] The suggestion context
|
|
230
|
+
# @param word [String] The word to check
|
|
231
|
+
# @return [Boolean] True if word exists
|
|
232
|
+
def dictionary_lookup(context, word)
  store = context.dictionary

  # Dictionary backends with their own #lookup win over every other check.
  return store.lookup(word) if store.respond_to?(:lookup)

  if defined?(::Kotoshu::Core::IndexedDictionary) && store.is_a?(::Kotoshu::Core::IndexedDictionary)
    return store.has_word?(word)
  end

  # Set, Array and Hash all answer #include? (for Hash it is a key lookup),
  # so one membership test covers every plain container.
  return store.include?(word) if store.respond_to?(:include?)

  false
end
|
|
250
|
+
|
|
251
|
+
# Calculate Damerau-Levenshtein edit distance between two strings.
|
|
252
|
+
# This extends Levenshtein by treating transposition of adjacent characters as 1 operation.
|
|
253
|
+
#
|
|
254
|
+
# Examples:
|
|
255
|
+
# "wrold" → "world" = 1 (transposition of 'r' and 'o')
|
|
256
|
+
# "hello" → "hell" = 1 (deletion)
|
|
257
|
+
# "cat" → "cut" = 1 (substitution)
|
|
258
|
+
#
|
|
259
|
+
# @param str1 [String] First string
|
|
260
|
+
# @param str2 [String] Second string
|
|
261
|
+
# @return [Integer] Edit distance
|
|
262
|
+
def edit_distance(str1, str2)
  # An empty side costs one insertion/deletion per remaining character.
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  rows = str1.length
  cols = str2.length

  # table[i][j] = distance between the first i chars of str1 and the first
  # j chars of str2. Column 0 and row 0 are seeded with the prefix lengths.
  table = Array.new(rows + 1) { |i| [i] + Array.new(cols, 0) }
  table[0] = (0..cols).to_a

  1.upto(rows) do |i|
    1.upto(cols) do |j|
      substitution = table[i - 1][j - 1] + (str1[i - 1] == str2[j - 1] ? 0 : 1)
      best = [
        table[i - 1][j] + 1, # deletion
        table[i][j - 1] + 1, # insertion
        substitution
      ].min

      # Adjacent transposition counts as a single edit. This is the
      # restricted form: the swap is only taken from table[i - 2][j - 2],
      # so "ca" -> "abc" costs 3.
      swapped = i > 1 && j > 1 &&
                str1[i - 1] == str2[j - 2] &&
                str1[i - 2] == str2[j - 1]
      best = [best, table[i - 2][j - 2] + 1].min if swapped

      table[i][j] = best
    end
  end

  table[rows][cols]
end
|
|
299
|
+
|
|
300
|
+
# Optimized edit distance with early termination.
|
|
301
|
+
# Returns early if distance exceeds threshold.
|
|
302
|
+
#
|
|
303
|
+
# @param str1 [String] First string
|
|
304
|
+
# @param str2 [String] Second string
|
|
305
|
+
# @param threshold [Integer] Maximum distance to calculate
|
|
306
|
+
# @return [Integer, nil] Distance or nil if exceeds threshold
|
|
307
|
+
def edit_distance_with_threshold(str1, str2, threshold)
  # No early termination yet: the full distance is always computed and
  # then compared against the threshold.
  distance = edit_distance(str1, str2)
  return distance if distance <= threshold

  nil
end
|
|
313
|
+
|
|
314
|
+
# Calculate enhanced score combining multiple factors.
|
|
315
|
+
#
|
|
316
|
+
# Lower score = better suggestion
|
|
317
|
+
#
|
|
318
|
+
# @param original [String] The original misspelled word
|
|
319
|
+
# @param suggestion [String] The suggested word
|
|
320
|
+
# @param distance [Integer] Edit distance
|
|
321
|
+
# @return [Float] Enhanced score (lower is better)
|
|
322
|
+
def calculate_enhanced_score(original, suggestion, distance)
  # Lower is better. Edit distance dominates (1000 per edit). Frequency,
  # transposition and typo-pattern bonuses pull the score down; keyboard
  # penalty and length difference (50 per character) push it up.
  distance * 1000.0 -
    frequency_bonus(suggestion) +
    keyboard_penalty(original, suggestion) -
    transposition_bonus(original, suggestion) -
    typo_pattern_bonus(original, suggestion) +
    (original.length - suggestion.length).abs * 50
end
|
|
345
|
+
|
|
346
|
+
# Calculate bonus for transposition (swap adjacent characters).
|
|
347
|
+
# This is the MOST common typing error, so it gets the highest bonus.
|
|
348
|
+
#
|
|
349
|
+
# @param original [String] The original word
|
|
350
|
+
# @param suggestion [String] The suggested word
|
|
351
|
+
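# @return [Integer] Transposition bonus: 200 when exactly one swap is detected, otherwise 100 per detected swap (0 when none or when lengths differ)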
|
|
352
|
+
def transposition_bonus(original, suggestion)
  # A swap cannot change the length, so differing lengths earn nothing.
  return 0 if original.length != suggestion.length

  typed = original.downcase
  target = suggestion.downcase

  # Count mismatching positions whose typed character reappears later in
  # the target, either immediately after or at a position whose typed
  # character equals the target character here (i.e. the two are exchanged).
  swaps = typed.each_char.with_index.count do |char, idx|
    next false if char == target[idx]

    found_at = target.index(char, idx + 1)
    next false unless found_at

    found_at == idx + 1 || target[idx] == typed[found_at]
  end

  # One swap earns 200; any other count earns 100 per swap (0 for none).
  swaps == 1 ? 200 : swaps * 100
end
|
|
374
|
+
|
|
375
|
+
# Calculate keyboard proximity penalty.
|
|
376
|
+
#
|
|
377
|
+
# Substitutions between adjacent keys get lower penalty.
|
|
378
|
+
# Uses OOP keyboard layout for language-aware distance calculations.
|
|
379
|
+
#
|
|
380
|
+
# @param original [String] The original word
|
|
381
|
+
# @param suggestion [String] The suggested word
|
|
382
|
+
# @return [Float] Keyboard penalty (0-200)
|
|
383
|
+
def keyboard_penalty(original, suggestion)
  typed = original.chars
  wanted = suggestion.chars

  # Only position-by-position substitutions are scored; words of different
  # length incur no keyboard penalty at all.
  return 0 unless typed.length == wanted.length

  typed.zip(wanted).sum do |from, to|
    next 0 if from == to

    gap = @keyboard_layout.distance(from, to)
    if gap == Float::INFINITY
      50 # layout reports no finite distance for this pair
    elsif gap == 1
      10 # adjacent keys
    elsif gap == 2
      30 # two keys apart
    else
      100 # far apart
    end
  end
end
|
|
414
|
+
|
|
415
|
+
# Calculate bonus for common typo patterns.
|
|
416
|
+
#
|
|
417
|
+
# @param original [String] The original word
|
|
418
|
+
# @param suggestion [String] The suggested word
|
|
419
|
+
# @return [Integer] Pattern bonus (0-330; the 30-point prefix bonus can stack with the 300-point missing-double-letter bonus)
|
|
420
|
+
def typo_pattern_bonus(original, suggestion)
  bonus = 0

  # Pattern 1: missing double letter (helo -> hello). Strongest bonus.
  if suggestion.length == original.length + 1 && doubled_letter_removed?(suggestion, original)
    bonus += 300
  end

  # Pattern 2: extra double letter (helllo -> hello).
  # The previous implementation rebuilt the original unchanged
  # (original[0...i + 1] + original[i + 1..-1]), so this bonus never fired.
  if original.length == suggestion.length + 1 && doubled_letter_removed?(original, suggestion)
    bonus += 100
  end

  # Pattern 3: the suggestion is longer and shares the original's first
  # three characters. This can stack with Pattern 1 (maximum 330).
  if suggestion.length > original.length && original.start_with?(suggestion[0...3])
    bonus += 30
  end

  bonus
end

# True when deleting the second character of some doubled pair in +longer+
# yields exactly +shorter+.
#
# @param longer [String] The word containing the candidate doubled letter
# @param shorter [String] The word expected after removing one of the pair
# @return [Boolean]
def doubled_letter_removed?(longer, shorter)
  (0...longer.length - 1).any? do |i|
    longer[i] == longer[i + 1] && longer[0..i] + longer[i + 2..-1] == shorter
  end
end
|
|
462
|
+
|
|
463
|
+
private
|
|
464
|
+
|
|
465
|
+
# Resolve keyboard layout using OOP registry pattern
|
|
466
|
+
#
|
|
467
|
+
# @param keyboard_layout [Keyboard::Layout, String, nil] Layout override
|
|
468
|
+
# @return [Keyboard::Layout] Resolved layout
|
|
469
|
+
def resolve_keyboard_layout(keyboard_layout)
  # The registry is required at call time rather than at file load.
  require_relative '../../../kotoshu/keyboard/registry'

  # Precedence: explicit Layout object, layout name given as a String,
  # layout registered for the language code, then QWERTY.
  return keyboard_layout if keyboard_layout.is_a?(Keyboard::Layout)
  return Keyboard::Registry.layout_by_name(keyboard_layout) if keyboard_layout.is_a?(String)
  return Keyboard::Registry.layout_for(@language_code) if @language_code

  Keyboard::Registry.layout_by_name('QWERTY')
end
|
|
482
|
+
|
|
483
|
+
# Load frequency data for the language.
|
|
484
|
+
#
|
|
485
|
+
# Uses a tiered approach:
|
|
486
|
+
# 1. First tries to load from local Kelly JSON files (frequency-list-kelly/data/)
|
|
487
|
+
# 2. Then tries to load from GitHub frequency.json (Phase 2)
|
|
488
|
+
# 3. Falls back to local YAML files (Phase 1)
|
|
489
|
+
# 4. Falls back to empty set if no data available
|
|
490
|
+
#
|
|
491
|
+
# This follows the Open-Closed Principle: new languages are added
|
|
492
|
+
# by creating new JSON/YAML files, not by modifying this class.
|
|
493
|
+
#
|
|
494
|
+
# @param language_code [String] ISO 639-1 language code
|
|
495
|
+
# @return [Hash{Symbol => Set}] Hash with :tiers and :metadata
|
|
496
|
+
# Load frequency data for the language.
|
|
497
|
+
#
|
|
498
|
+
# Uses a tiered approach following OOP cache pattern:
|
|
499
|
+
# 1. First tries FrequencyCache (Kelly Project from GitHub with caching)
|
|
500
|
+
# 2. Falls back to local YAML files (legacy)
|
|
501
|
+
# 3. Falls back to empty set if no data available
|
|
502
|
+
#
|
|
503
|
+
# This follows the Open-Closed Principle: new languages are added
|
|
504
|
+
# by creating new JSON files, not by modifying this class.
|
|
505
|
+
#
|
|
506
|
+
# @param language_code [String] ISO 639-1 language code
|
|
507
|
+
# @return [Hash{Symbol => Set}] The selected frequency tiers (keys :top_50, :top_200, :top_1000), also stored in @frequency_tiers
|
|
508
|
+
def load_frequency_data(language_code)
  # Stores the selected tier hash in @frequency_tiers and returns it.
  #
  # Extracts the tier hash from a loader result, but only when it carries a
  # non-empty :top_1000 tier. A nil result, a missing :tiers entry or a missing
  # :top_1000 tier all count as "no data" instead of raising NoMethodError, so
  # the next source in the chain still gets its turn.
  usable_tiers = lambda do |loaded|
    tiers = loaded && loaded[:tiers]
    tiers if tiers && tiers[:top_1000]&.any?
  end

  # Sources are tried in order; `||` short-circuits, so the YAML loader only
  # runs when the cache produced nothing usable:
  #   1. Kelly FrequencyCache
  #   2. local YAML files (legacy)
  #   3. empty tiers when no data is available for this language
  @frequency_tiers =
    usable_tiers.call(try_load_from_frequency_cache(language_code)) ||
    usable_tiers.call(Data::CommonWordsLoader.load(language_code)) ||
    { top_50: Set.new, top_200: Set.new, top_1000: Set.new }
end
|
|
532
|
+
|
|
533
|
+
private
|
|
534
|
+
|
|
535
|
+
# Try to load frequency data from FrequencyCache (OOP cache pattern).
|
|
536
|
+
#
|
|
537
|
+
# Uses FrequencyCache to download Kelly frequency lists from GitHub
|
|
538
|
+
# with automatic caching in $XDG_CACHE_HOME/kotoshu/frequency-lists/
|
|
539
|
+
#
|
|
540
|
+
# @param language_code [String] ISO 639-1 language code
|
|
541
|
+
# @return [Hash, nil] Frequency data or nil if not available
|
|
542
|
+
def try_load_from_frequency_cache(language_code)
  # Required lazily so the cache backend is only loaded when frequency data is requested.
  require_relative '../../../kotoshu/cache/frequency_cache'

  cache = Cache::FrequencyCache.new

  # Languages the cache does not list are reported as "not available".
  return nil unless cache.available_languages.include?(language_code)

  # A falsy lookup result is normalised to nil.
  cache.get(language_code) || nil
rescue StandardError => e
  # Best effort: the rescue covers constructing and querying the cache as well
  # as the lookup itself, so any failure lets the caller fall back to its next
  # data source instead of aborting. LoadError from the require above is not a
  # StandardError and still propagates.
  warn "Warning: Failed to load frequency cache for #{language_code}: #{e.message}" if $VERBOSE
  nil
end
|
|
560
|
+
|
|
561
|
+
# Deprecated: Use FrequencyCache instead.
|
|
562
|
+
# Kept for backwards compatibility during migration.
|
|
563
|
+
# No-op placeholder that always returns nil (see the deprecation note above).
# try_load_from_kelly, try_load_kelly_local and try_load_kelly_from_github get
# no placeholder here: their full definitions follow later in this class, and
# an empty stub would only be overridden and trigger "method redefined" warnings.
def try_load_from_github(language_code); end
|
|
567
|
+
# Kelly Project frequency lists are stored in:
|
|
568
|
+
# frequency-list-kelly/data/{language_code}.json
|
|
569
|
+
#
|
|
570
|
+
# @param language_code [String] ISO 639-1 language code
|
|
571
|
+
# @return [Hash, nil] Frequency data or nil if not available
|
|
572
|
+
def try_load_from_kelly(language_code)
  # Local files win; the GitHub download is only attempted when the local
  # lookup yields nothing.
  # NOTE(review): the local path returns the loader's whole result while the
  # GitHub path returns only its :tiers entry — confirm which shape callers expect.
  try_load_kelly_local(language_code) || try_load_kelly_from_github(language_code)
end
|
|
580
|
+
|
|
581
|
+
# Try to load Kelly data from local file paths.
|
|
582
|
+
#
|
|
583
|
+
# @param language_code [String] ISO 639-1 language code
|
|
584
|
+
# @return [Hash, nil] Frequency data or nil if not available
|
|
585
|
+
def try_load_kelly_local(language_code)
  # Returns whatever Data::CommonWordsLoader.load_from_frequency_file yields for
  # the first candidate file that exists and loads, or nil when none does.
  #
  # An override has to be consulted first to override anything; a blank value is
  # ignored so it cannot turn into a lookup at the filesystem root.
  override_dir = ENV['KELLY_DATA_PATH']
  override_dir = nil if override_dir && override_dir.empty?

  # `~` expansion raises ArgumentError when no home directory can be resolved;
  # in that case simply skip this candidate.
  home_clone_dir =
    begin
      File.expand_path('~/src/kotoshu/frequency-list-kelly/data')
    rescue ArgumentError
      nil
    end

  search_dirs = [
    override_dir,
    # Candidate checkout locations relative to this file, tried in this order.
    File.expand_path('../../../../frequency-list-kelly/data', __dir__),
    File.expand_path('../../frequency-list-kelly/data', __dir__),
    File.expand_path('../../../frequency-list-kelly/data', __dir__),
    # User's local kotoshu clone
    home_clone_dir
  ].compact.uniq

  search_dirs.each do |dir|
    candidate = File.join(dir, "#{language_code}.json")
    next unless File.exist?(candidate)

    begin
      return Data::CommonWordsLoader.load_from_frequency_file(candidate)
    rescue StandardError => e
      # An unreadable or malformed file must not hide a good copy further down the list.
      warn "Warning: Failed to load local Kelly data for #{language_code}: #{e.message}" if $VERBOSE
    end
  end

  nil
end
|
|
612
|
+
|
|
613
|
+
# Try to download Kelly data from GitHub.
|
|
614
|
+
#
|
|
615
|
+
# Kelly data is cached in $XDG_CACHE_HOME/kotoshu/frequency-lists/
|
|
616
|
+
#
|
|
617
|
+
# @param language_code [String] ISO 639-1 language code
|
|
618
|
+
# @return [Hash, nil] Frequency data or nil if not available
|
|
619
|
+
def try_load_kelly_from_github(language_code)
  # Returns the :tiers entry of the loader result, or nil when the language is
  # not listed below or neither the cache nor the download yields data.
  require 'net/http'
  require 'fileutils'
  # JSON.parse validates the download below; without this require an unloaded
  # JSON constant would raise NameError, which the rescue would swallow as an
  # ordinary download failure.
  require 'json'

  # Languages this method will try to fetch; anything else is "not available".
  kelly_languages = %w[ar zh en el it no ru sv]
  return nil unless kelly_languages.include?(language_code)

  # Cache in $XDG_CACHE_HOME/kotoshu/frequency-lists/ (same pattern as dictionaries)
  cache_dir = File.join(Kotoshu::Paths.cache_path, 'frequency-lists')
  cached_file = File.join(cache_dir, "#{language_code}.json")
  cache_ttl = 604_800 # 7 days

  # Serve from the cached copy while it is younger than the TTL. A copy that
  # fails to load falls through to a fresh download.
  if File.exist?(cached_file) && (Time.now - File.mtime(cached_file)) < cache_ttl
    begin
      return Data::CommonWordsLoader.load_from_frequency_file(cached_file)[:tiers]
    rescue StandardError => e
      warn "Warning: Failed to load cached Kelly data for #{language_code}: #{e.message}" if $VERBOSE
    end
  end

  # Download from GitHub (kotoshu/frequency-list-kelly repository)
  url = "https://raw.githubusercontent.com/kotoshu/frequency-list-kelly/main/data/#{language_code}.json"

  begin
    warn "Downloading Kelly frequency data for #{language_code} from GitHub..." if $VERBOSE

    # Created inside the rescue so an unwritable cache location yields nil
    # instead of raising out of a best-effort loader.
    FileUtils.mkdir_p(cache_dir)

    response = Net::HTTP.get_response(URI(url))
    unless response.is_a?(Net::HTTPSuccess)
      # Error pages (404, rate limiting, ...) are rejected by status, not by JSON validity.
      warn "Warning: Failed to download Kelly data for #{language_code}: HTTP #{response.code}" if $VERBOSE
      return nil
    end

    payload = response.body
    JSON.parse(payload) # reject anything that is not valid JSON before it reaches the cache
    File.write(cached_file, payload)

    Data::CommonWordsLoader.load_from_frequency_file(cached_file)[:tiers]
  rescue StandardError => e
    warn "Warning: Failed to download Kelly data for #{language_code}: #{e.message}" if $VERBOSE
    nil
  end
end
|
|
668
|
+
end
|
|
669
|
+
end
|
|
670
|
+
end
|
|
671
|
+
end
|