kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module Algorithms
    # Phonetic suggestion algorithm: provides suggestions based on phonetic
    # (pronunciation) similarity.
    #
    # Ported from Spylls (Python) phonet_suggest.py
    #
    # Requires the .aff file to define a PHONE table (extremely rare in known
    # dictionaries).
    #
    # Internally:
    # 1. Selects words from the dictionary similarly to ngram_suggest
    #    (and reuses its root_score)
    # 2. Scores their phonetic representations (calculated with metaphone)
    #    against the phonetic representation of the misspelling
    # 3. Orders the most similar ones with final_score
    module PhonetSuggest
      # Upper bound on the number of phonetically scored candidates that are
      # kept while scanning the dictionary.
      MAX_ROOTS = 100

      class << self
        # Main entry point for phonetic suggestions.
        #
        # Note that both this method and NgramSuggest.suggest iterate through
        # the whole dictionary. Hunspell optimizes by doing it all in one
        # loop. Spylls (and Kotoshu) splits them for clarity.
        #
        # @param misspelling [String] The misspelled word
        # @param dictionary_words [Enumerable<Hash, String>] Dictionary entries.
        #   Hash-like entries are read through +[:stem]+ and
        #   +[:alt_spellings]+; a plain String is treated as the stem itself.
        # @param table [Hash, nil] Phone table with a :rules hash mapping the
        #   first character to a rule list (see {#metaphone})
        # @yield [String] Each suggestion, best first
        # @return [Array, Enumerator] The ranked +[score, stem]+ pairs when a
        #   block is given; an Enumerator over the suggestions otherwise
        #
        # The table structure should have:
        # - :rules => Hash mapping first character to array of rule hashes
        #   Each rule has: :search (Regexp), :replacement (String),
        #   :start (Boolean), :end (Boolean)
        def suggest(misspelling, dictionary_words:, table:, &block)
          unless block
            # Without a block, hand back a lazy stream instead of failing on yield.
            return Enumerator.new do |yielder|
              suggest(misspelling, dictionary_words: dictionary_words, table: table) do |suggestion|
                yielder << suggestion
              end
            end
          end

          misspelling_lower = misspelling.downcase
          misspelling_ph = metaphone(table, misspelling_lower)
          best = []

          # First, select words from the dictionary whose stems are similar to
          # the misspelling (same selection as the first cycle of ngram_suggest).
          dictionary_words.each do |entry|
            stem, variants = entry_parts(entry)

            # Skip words with length difference > 3
            next if (stem.length - misspelling.length).abs > 3

            # "Regular" similarity score, taking the best over alternative spellings
            nscore = NgramSuggest.root_score(misspelling_lower, stem)
            variants&.each do |variant|
              variant_score = NgramSuggest.root_score(misspelling_lower, variant)
              nscore = variant_score if variant_score > nscore
            end
            next if nscore <= 2

            # Phonetic score: trigram similarity of the two phonetic codes
            word_ph = metaphone(table, stem.downcase)
            score = 2 * StringMetrics.ngram(3, misspelling_ph, word_ph, longer_worse: true)

            keep_best(best, [score, stem])
          end

          # Finally, rank by phonetic score plus a simplistic string similarity
          # metric, best first (ties are ordered by stem, descending).
          ranked = best.map do |score, word|
            [score + final_score(misspelling_lower, word.downcase), word]
          end
          ranked.sort!.reverse!

          ranked.each { |_, suggestion| block.call(suggestion) }
        end

        # Calculate score of suggestion against misspelling.
        #
        # The score is twice the value of StringMetrics.lcslen, minus the
        # absolute length difference, plus the value of
        # StringMetrics.leftcommonsubstring.
        #
        # @param word1 [String] Misspelling
        # @param word2 [String] Candidate suggestion
        # @return [Numeric] Final score
        def final_score(word1, word2)
          length_penalty = (word1.length - word2.length).abs

          2 * StringMetrics.lcslen(word1, word2) -
            length_penalty +
            StringMetrics.leftcommonsubstring(word1, word2)
        end

        # Metaphone calculation.
        #
        # Production in Kotoshu is currently implemented naively as just
        # "search and replace" for rules. To see what potentially should be done,
        # look at aspell's original description:
        # http://aspell.net/man-html/Phonetic-Code.html
        #
        # The word is upcased and scanned left to right. At each position the
        # rules registered for the current character are tried in order; the
        # first one that matches appends its replacement and consumes the
        # matched characters. Characters with no matching rule are dropped.
        #
        # @param table [Hash, nil] Phone table with :rules hash
        # @param word [String] Word to calculate metaphone for
        # @return [String] Metaphone representation; +word+ itself when the
        #   table is nil or empty
        def metaphone(table, word)
          return word if table.nil? || table.empty?

          rules = table[:rules] || {}
          word_upper = word.upcase
          result = +''
          pos = 0

          while pos < word_upper.length
            consumed = nil

            (rules[word_upper[pos]] || []).each do |rule|
              consumed = match_rule(rule, word_upper, pos)
              next unless consumed

              # Append in place; += would allocate a new string on every step.
              result << rule[:replacement].to_s
              break
            end

            # Always move forward: a zero-length match (0 is truthy in Ruby)
            # would otherwise leave pos unchanged and loop forever.
            pos += consumed ? [consumed, 1].max : 1
          end

          result
        end

        # Check if a rule matches at the given position.
        #
        # The match must begin exactly at +pos+. Regexp#match(str, pos) only
        # sets where the search starts, so a hit further to the right is
        # rejected here. Rules flagged :end must additionally match all the way
        # to the end of the word.
        #
        # @param rule [Hash] Rule hash with :search (Regexp), :start, :end
        # @param word [String] The word to match against
        # @param pos [Integer] Position in word
        # @return [Integer, nil] Length of match, or nil if no match
        def match_rule(rule, word, pos)
          # Rules flagged :start apply only at the beginning of the word
          return nil if rule[:start] && pos.positive?

          pattern = rule[:search]

          if rule[:end]
            # Full match: from pos up to the end of the word
            match_data = full_match_pattern(pattern).match(word, pos)
            return nil unless match_data
          else
            # The leftmost match found from pos must start at pos itself
            match_data = pattern.match(word, pos)
            return nil unless match_data && match_data.begin(0) == pos
          end

          match_data[0].length
        end

        private

        # Split a dictionary entry into its stem and alternative spellings.
        # A plain String is its own stem and has no alternative spellings.
        def entry_parts(entry)
          return [entry, nil] if entry.is_a?(String)

          [entry[:stem] || entry, entry[:alt_spellings]]
        end

        # Add +candidate+ (a [score, stem] pair) to +best+, keeping at most
        # MAX_ROOTS pairs: once full, the true worst pair is replaced only
        # when the candidate beats it.
        def keep_best(best, candidate)
          if best.size < MAX_ROOTS
            best << candidate
            return
          end

          worst_index = best.each_index.min_by { |index| best[index] }
          best[worst_index] = candidate if (best[worst_index] <=> candidate) == -1
        end

        # Wrap a rule pattern so that it must match from the search start
        # position (\G) through the end of the string (\z). Compiled patterns
        # are memoized because the same rule is tried many times.
        def full_match_pattern(pattern)
          @full_match_patterns ||= {}
          @full_match_patterns[pattern] ||= /\G(?:#{pattern})\z/
        end
      end
    end
  end
end
|