kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'zip'
|
|
4
|
+
require_relative 'readers/file_reader'
|
|
5
|
+
require_relative 'readers/aff_reader'
|
|
6
|
+
require_relative 'readers/dic_reader'
|
|
7
|
+
require_relative 'readers/lookup_builder'
|
|
8
|
+
require_relative 'algorithms/lookup'
|
|
9
|
+
require_relative 'algorithms/suggest'
|
|
10
|
+
|
|
11
|
+
module Kotoshu
|
|
12
|
+
# Unified Dictionary interface matching Spylls API.
|
|
13
|
+
#
|
|
14
|
+
# This class provides the main interface to Hunspell dictionaries,
|
|
15
|
+
# supporting loading from files, zip archives, and system paths.
|
|
16
|
+
#
|
|
17
|
+
# @example Loading from files
|
|
18
|
+
# dictionary = Dictionary.from_files('/path/to/dictionary/en_US')
|
|
19
|
+
# dictionary.lookup('spells') # => true
|
|
20
|
+
#
|
|
21
|
+
# @example Loading from zip archive
|
|
22
|
+
# dictionary = Dictionary.from_zip('/path/to/dictionary/en_US.oxt')
|
|
23
|
+
#
|
|
24
|
+
# @example Loading from system
|
|
25
|
+
# dictionary = Dictionary.from_system('en_US')
|
|
26
|
+
#
|
|
27
|
+
# @example Getting suggestions
|
|
28
|
+
# dictionary.suggest('spylls').to_a # => ["spells", "spills", ...]
|
|
29
|
+
#
|
|
30
|
+
# @example Accessing algorithms for experimentation
|
|
31
|
+
# dictionary.lookuper.good_forms('building') do |form|
|
|
32
|
+
# puts form
|
|
33
|
+
# end
|
|
34
|
+
class Dictionary
|
|
35
|
+
# Directories probed, in order, when a Hunspell dictionary is requested by
# name. The constant keeps its historical spelling because callers reference it.
PATHES = [
  *%w[
    /usr/share/hunspell
    /usr/share/myspell
    /usr/share/myspell/dicts
    /Library/Spelling
    /opt/openoffice.org/basis3.0/share/dict/ooo
    /usr/lib/openoffice.org/basis3.0/share/dict/ooo
  ],
  # OpenOffice 2.x releases, newest first; each contributes an /opt and a /usr/lib location.
  *%w[2.4 2.3 2.2 2.1 2.0].flat_map { |release|
    %W[
      /opt/openoffice.org#{release}/share/dict/ooo
      /usr/lib/openoffice.org#{release}/share/dict/ooo
    ]
  }
].each(&:freeze).freeze

# Dictionaries shipped for testing: dictionary name => data sub-directory.
DISTRIBUTED = { 'en_US' => 'en', 'ru' => 'ru', 'sv_SE' => 'sv' }.freeze
|
|
61
|
+
|
|
62
|
+
# @return [Hash] Parsed affix (.aff) data, exactly as handed to the constructor
|
|
63
|
+
attr_reader :aff
|
|
64
|
+
|
|
65
|
+
# @return [Array<Readers::Word>] Dictionary (.dic) word entries, exactly as handed to the constructor
|
|
66
|
+
attr_reader :dic_words
|
|
67
|
+
|
|
68
|
+
# @return [Algorithms::Lookup::Lookuper] Lookuper built in the constructor; exposed for experimentation
|
|
69
|
+
attr_reader :lookuper
|
|
70
|
+
|
|
71
|
+
# @return [Algorithms::Suggest::Suggester] Suggester built in the constructor; exposed for experimentation
|
|
72
|
+
attr_reader :suggester
|
|
73
|
+
|
|
74
|
+
# Build a Dictionary from already-parsed affix and word data.
#
# Besides storing both inputs, this wires up the two collaborators exposed
# through `lookuper` and `suggester`; the suggester shares the lookuper
# instance created here.
#
# @param aff [Hash] Aff data structure
# @param dic_words [Array<Readers::Word>] Dictionary word entries
def initialize(aff, dic_words)
  @aff, @dic_words = aff, dic_words

  @lookuper = Readers::LookupBuilder.from_data(@aff, @dic_words).build

  suggester_dic = build_dic_structure(@dic_words)
  @suggester = Algorithms::Suggest::Suggester.new(aff: @aff, dic: suggester_dic, lookuper: @lookuper)
end
|
|
90
|
+
|
|
91
|
+
# Load a dictionary from a pair of files sharing one base path.
#
# `path` is the base name without extension: 'en_US' loads 'en_US.aff' and
# 'en_US.dic'. When `path` is one of the DISTRIBUTED names and no such .aff
# exists at that location, the copy under this library's data directory is
# used instead, provided it is present.
#
# @param path [String] Base path to dictionary files (without extension)
# @return [Dictionary] The loaded dictionary
# @raise [ArgumentError] If the .aff or the .dic file does not exist
#
# @example
#   Dictionary.from_files('en_US')
def self.from_files(path)
  base = path
  if DISTRIBUTED.key?(path) && !File.exist?("#{path}.aff")
    bundled = File.join(File.dirname(__FILE__), '../../data', DISTRIBUTED[path], path)
    base = bundled if File.exist?("#{bundled}.aff")
  end

  aff_path, dic_path = %w[aff dic].map { |extension| "#{base}.#{extension}" }
  [aff_path, dic_path].each do |required|
    raise ArgumentError, "Dictionary file not found: #{required}" unless File.exist?(required)
  end

  aff_data = Readers::AffReader.new(aff_path).read

  # The .dic parser needs the flag settings declared by the .aff file.
  dic_words = Readers::DicReader.new(
    dic_path,
    flag_format: aff_data['FLAG'] || 'short',
    flag_synonyms: aff_data['AF'] || {}
  ).read

  new(aff_data, dic_words)
end
|
|
128
|
+
|
|
129
|
+
# Load dictionary from zip archive.
#
# Intended for OpenOffice/LibreOffice dictionary extensions (.oxt) and
# Firefox/Thunderbird dictionary extensions (.xpi). The archive must contain
# exactly one entry ending in '.aff' and exactly one ending in '.dic'.
#
# Both entries are unpacked into a temporary directory and loaded through
# `from_files`, so archive contents are parsed by the same code path as
# on-disk dictionaries. (Previously the entry *names* were handed to the
# file readers, which then looked for those names on the local filesystem
# instead of inside the archive.)
#
# @param zip_path [String] Path to zip archive
# @return [Dictionary] The loaded dictionary
# @raise [ArgumentError] If the archive holds no, or more than one, .aff or .dic entry
#
# @example
#   Dictionary.from_zip('en_US.oxt')
def self.from_zip(zip_path)
  # Loaded lazily: only archive loading needs a scratch directory.
  require 'tmpdir'

  Zip::File.open(zip_path) do |zipfile|
    aff_entry = nil
    dic_entry = nil

    zipfile.each do |entry|
      if entry.name.end_with?('.aff')
        raise ArgumentError, "Multiple .aff files found in zip" if aff_entry

        aff_entry = entry
      elsif entry.name.end_with?('.dic')
        raise ArgumentError, "Multiple .dic files found in zip" if dic_entry

        dic_entry = entry
      end
    end

    raise ArgumentError, "No .aff file found in zip" unless aff_entry
    raise ArgumentError, "No .dic file found in zip" unless dic_entry

    Dir.mktmpdir('kotoshu-dict') do |dir|
      # A fixed base name keeps the pair together whatever the entries are called.
      base = File.join(dir, 'dictionary')
      [[aff_entry, 'aff'], [dic_entry, 'dic']].each do |entry, extension|
        # Binary write: the dictionary's own encoding is left untouched.
        File.binwrite("#{base}.#{extension}", entry.get_input_stream(&:read))
      end

      from_files(base)
    end
  end
end
|
|
174
|
+
|
|
175
|
+
# Load a dictionary installed in one of the standard system locations.
#
# PATHES is walked in order; the first directory containing "<name>.aff"
# wins and the dictionary is loaded from there via `from_files`.
#
# @param name [String] Dictionary name (e.g., 'en_US', 'ru_RU')
# @return [Dictionary] The loaded dictionary
# @raise [ArgumentError] If dictionary not found in system paths
#
# @example
#   Dictionary.from_system('en_US')
def self.from_system(name)
  aff_file = "#{name}.aff"
  located = PATHES.lazy
                  .map { |directory| File.join(directory, aff_file) }
                  .find { |candidate| File.exist?(candidate) }

  # from_files expects the base path, i.e. the location without its extension.
  return from_files(located.sub(/\.aff$/, '')) if located

  raise ArgumentError, "#{name}.aff not found in system paths: #{PATHES.inspect}"
end
|
|
196
|
+
|
|
197
|
+
# Spell-check a single word by delegating to the lookuper built in the
# constructor.
#
# @param word [String] Word to check
# @return [Boolean] True if the word exists in the dictionary
#
# @example
#   dictionary.lookup('spells') # => true
#   dictionary.lookup('spylls') # => false
def lookup(word)
  @lookuper.(word)
end
|
|
208
|
+
|
|
209
|
+
# Produce correction candidates for a misspelled word.
#
# Suggestions are passed to the block one at a time, in the order the
# suggester emits them (best first). Without a block an Enumerator over the
# same sequence is returned, so `suggest(word).to_a` collects them.
#
# @param word [String] The misspelled word
# @yield [String] Each suggestion
# @return [Enumerator] If no block given
#
# @example
#   dictionary.suggest('spylls').to_a # => ["spells", "spills", ...]
def suggest(word)
  if block_given?
    @suggester.suggestions(word) { |candidate| yield candidate }
  else
    enum_for(:suggest, word)
  end
end
|
|
227
|
+
|
|
228
|
+
private
|
|
229
|
+
|
|
230
|
+
# Build the dictionary view handed to the suggester.
#
# The result is a Hash of two callables backed by an index of entries
# grouped by stem:
#
# * `:homonyms` - `->(word)`: every entry (`{ stem:, flags: }`) whose stem
#   equals `word`; an empty Array when there is none.
# * `:has_flag` - `->(word, flag, for_all: false)`: whether any entry for
#   `word` (or, with `for_all: true`, every entry) carries `flag`. Always
#   false when the word has no entries.
#
# Flags are compared per entry as whole values. (The earlier version
# flattened all flag lists and then called `include?` on each single flag,
# which for String flags is a substring test, and for `for_all` answered
# per flag rather than per entry.)
#
# @param dic_words [Array<Readers::Word>] Dictionary word entries
# @return [Hash] Dic structure
def build_dic_structure(dic_words)
  # Plain Hash on purpose: a default block would insert an empty bucket for
  # every unknown word that is ever queried.
  word_index = {}

  dic_words.each do |word|
    (word_index[word.stem] ||= []) << {
      stem: word.stem,
      flags: word.flags.to_a
    }
  end

  homonyms = ->(w) { word_index.fetch(w) { [] } }

  has_flag = lambda do |w, flag, for_all: false|
    entries = homonyms.call(w)
    if entries.empty?
      false
    elsif for_all
      entries.all? { |entry| entry[:flags].include?(flag) }
    else
      entries.any? { |entry| entry[:flags].include?(flag) }
    end
  end

  { homonyms: homonyms, has_flag: has_flag }
end
|
|
259
|
+
end
|
|
260
|
+
end
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base"
|
|
4
|
+
|
|
5
|
+
module Kotoshu
|
|
6
|
+
module Dictionary
|
|
7
|
+
# Unix system dictionary backend.
|
|
8
|
+
#
|
|
9
|
+
# This dictionary reads from Unix-style system dictionary files,
|
|
10
|
+
# typically located at `/usr/share/dict/words` or symlinks to
|
|
11
|
+
# dictionaries like `web2` (Webster's Second International).
|
|
12
|
+
#
|
|
13
|
+
# @example Using system dictionary
|
|
14
|
+
# dict = UnixWords.new("/usr/share/dict/words", language_code: "en-US")
|
|
15
|
+
# dict.lookup?("hello") # => true
|
|
16
|
+
# dict.suggest("helo") # => ["hello", "help", "held", ...]
|
|
17
|
+
#
|
|
18
|
+
# @example Auto-detecting system dictionary
|
|
19
|
+
# dict = UnixWords.detect(language_code: "en-US")
|
|
20
|
+
class UnixWords < Base
|
|
21
|
+
# Word-list locations probed, in this order, when auto-detecting a
# system dictionary.
SYSTEM_PATHS = [
  *%w[words web2 american-english british-english].map { |file| "/usr/share/dict/#{file}" },
  "/usr/dict/words",
  # macOS
  "/System/Library/Assets/com_apple_MobileAsset_DictionaryServices_dictionaryOS/Dictionary/words"
].each(&:freeze).freeze
|
|
30
|
+
|
|
31
|
+
# @return [String] Absolute path to the dictionary file (the constructor expands the given path)
|
|
32
|
+
attr_reader :path
|
|
33
|
+
|
|
34
|
+
# @return [Boolean] Whether lookups are case-sensitive; when false, words are downcased on load and on lookup
|
|
35
|
+
attr_reader :case_sensitive
|
|
36
|
+
|
|
37
|
+
# Load a word-list file into memory.
#
# The file is read eagerly; the case-sensitivity flag is stored before
# loading because it decides whether words are downcased as they are read.
# The `:unix_words` type is registered when the registry does not list it yet.
#
# @param path [String] Path to the dictionary file
# @param language_code [String] The language code
# @param locale [String, nil] The locale (optional)
# @param case_sensitive [Boolean] Whether lookups are case-sensitive
# @param metadata [Hash] Additional metadata (optional)
def initialize(path, language_code:, locale: nil, case_sensitive: false, metadata: {})
  super(language_code, locale: locale, metadata: metadata)

  @path, @case_sensitive = File.expand_path(path), case_sensitive
  @words = load_words(@path)
  @word_set = build_word_set

  return if Dictionary.registry.key?(:unix_words)

  self.class.register_type(:unix_words)
end
|
|
55
|
+
|
|
56
|
+
# Test whether the dictionary contains a word.
#
# Unless the dictionary is case-sensitive, the word is downcased before it
# is checked. nil and the empty string are never found.
#
# @param word [String] The word to look up
# @return [Boolean] True if the word exists
def lookup(word)
  if word.nil? || word.empty?
    false
  else
    needle = word
    needle = word.downcase unless @case_sensitive
    @word_set.include?(needle)
  end
end
|
|
66
|
+
|
|
67
|
+
# Generate spelling suggestions.
#
# Candidates are dictionary words that share a short prefix with the input
# and lie within an edit distance of 1 or 2 of it; an exact match is never
# suggested. Results are ordered by distance, then alphabetically, and each
# word appears once.
#
# The shared prefix is at most 3 characters. (It used to be everything but
# the last character for words longer than four letters, so an error
# anywhere before the final letter yielded no suggestions at all.)
#
# @param word [String] The misspelled word
# @param max_suggestions [Integer] Maximum suggestions
# @return [Array<String>] List of suggested words
def suggest(word, max_suggestions: 10)
  return [] if word.nil? || word.empty?

  lookup_word = @case_sensitive ? word : word.downcase

  # Cheap pre-filter before the edit-distance pass: 1 to 3 leading characters.
  prefix = lookup_word[0, (lookup_word.length - 1).clamp(1, 3)]
  # uniq: the loaded list can hold the same word twice (e.g. after downcasing).
  candidates = @words.select { |dict_word| dict_word.start_with?(prefix) }.uniq

  scored = candidates.map { |dict_word| [edit_distance(lookup_word, dict_word), dict_word] }
  close = scored.select { |distance, _| distance.between?(1, 2) }

  # Sorting the [distance, word] pairs makes the order of equal-distance words deterministic.
  close.sort.first(max_suggestions).map(&:last)
end
|
|
95
|
+
|
|
96
|
+
# Add a word to the in-memory dictionary.
#
# The word is downcased first unless the dictionary is case-sensitive.
# Nothing is added for nil, an empty string, or a word already present.
#
# @param word [String] The word to add
# @param flags [Array<String>] Flags (ignored for UnixWords)
# @return [Boolean] True if added
def add_word(word, flags: [])
  return false if word.nil? || word.empty?

  entry = word
  entry = word.downcase unless @case_sensitive
  return false if @word_set.include?(entry)

  # The new word lands at the end of the list, i.e. at the current size.
  @word_set[entry] = @words.size
  @words.push(entry)

  true
end
|
|
112
|
+
|
|
113
|
+
# Remove a word from the in-memory dictionary.
#
# The word is downcased first unless the dictionary is case-sensitive.
# Every occurrence is removed from the word list, and the word-to-position
# index is renumbered afterwards. (Previously only one list slot was deleted
# and the positions of all later words went stale, so a following removal
# could delete a different word.)
#
# @param word [String] The word to remove
# @return [Boolean] True if removed
def remove_word(word)
  return false if word.nil? || word.empty?

  lookup_word = @case_sensitive ? word : word.downcase
  return false unless @word_set.key?(lookup_word)

  @word_set.delete(lookup_word)
  # Array#delete drops all equal elements, so a word listed twice cannot linger.
  @words.delete(lookup_word)
  # Entries after the removed one moved up; bring the stored positions back in line.
  @words.each_with_index { |entry, position| @word_set[entry] = position }

  true
end
|
|
128
|
+
|
|
129
|
+
# List every word currently held.
#
# A copy is returned, so changing the result does not change the dictionary.
#
# @return [Array<String>] All words
def words
  Array.new(@words)
end
|
|
135
|
+
|
|
136
|
+
# Locate a system word list.
#
# SYSTEM_PATHS is checked in order and the first entry that exists on disk
# is returned.
#
# @return [String, nil] The detected path or nil
#
# @example
#   UnixWords.detect_system_dictionary # => "/usr/share/dict/words"
def self.detect_system_dictionary
  SYSTEM_PATHS.each do |candidate|
    return candidate if File.exist?(candidate)
  end

  nil
end
|
|
147
|
+
|
|
148
|
+
# Create a dictionary by auto-detecting system dictionary.
#
# Uses the first path reported by `detect_system_dictionary` and passes the
# remaining options straight to the constructor. `metadata` is accepted here
# as well, so dictionaries built through detection can carry the same
# metadata as ones built with `new`; it defaults to an empty Hash, which is
# also the constructor's own default.
#
# @param language_code [String] The language code
# @param locale [String, nil] The locale (optional)
# @param case_sensitive [Boolean] Whether lookups are case-sensitive
# @param metadata [Hash] Additional metadata (optional)
# @return [UnixWords, nil] The dictionary or nil if not found
#
# @example
#   dict = UnixWords.detect(language_code: "en-US")
def self.detect(language_code:, locale: nil, case_sensitive: false, metadata: {})
  path = detect_system_dictionary
  return nil unless path

  new(path, language_code: language_code, locale: locale,
            case_sensitive: case_sensitive, metadata: metadata)
end
|
|
164
|
+
|
|
165
|
+
private
|
|
166
|
+
|
|
167
|
+
# Read the word list from disk, one word per line.
#
# Empty lines and lines starting with "#" are skipped. Words are downcased
# unless the dictionary is case-sensitive.
#
# @param path [String] The file path
# @return [Array<String>] List of words
def load_words(path)
  raise DictionaryNotFoundError, path unless File.exist?(path)

  File.foreach(path, chomp: true).each_with_object([]) do |line, collected|
    next if line.empty? || line.start_with?("#")

    collected << (@case_sensitive ? line : line.downcase)
  end
end
|
|
178
|
+
|
|
179
|
+
# Index the loaded words for constant-time membership checks.
#
# Each word maps to its position in the word list; for a word that occurs
# more than once the last position is kept.
#
# @return [Hash] Word to index mapping
def build_word_set
  index = {}
  @words.each_with_index { |entry, position| index[entry] = position }
  index
end
|
|
185
|
+
|
|
186
|
+
# Levenshtein distance between two strings: the minimum number of
# single-character insertions, deletions and substitutions turning one into
# the other.
#
# Only two table rows are kept at a time, and the shorter string is the one
# walked in the inner loop so those rows stay as short as possible.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Integer] Edit distance
def edit_distance(str1, str2)
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  shorter, longer = str1.length > str2.length ? [str2, str1] : [str1, str2]
  short_chars = shorter.chars

  # Distances from the empty prefix of `longer` to each prefix of `shorter`.
  row = (0..short_chars.length).to_a

  longer.each_char.with_index(1) do |outer, row_number|
    next_row = [row_number]

    short_chars.each_with_index do |inner, column|
      mismatch = inner == outer ? 0 : 1
      next_row << [
        next_row[column] + 1,   # insertion
        row[column + 1] + 1,    # deletion
        row[column] + mismatch  # substitution (free when characters match)
      ].min
    end

    row = next_row
  end

  row.last
end
|
|
216
|
+
end
|
|
217
|
+
end
|
|
218
|
+
end
|