kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,526 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "open-uri"
|
|
4
|
+
require_relative "base"
|
|
5
|
+
require_relative "../readers/lookup_builder"
|
|
6
|
+
require_relative "../readers/aff_reader"
|
|
7
|
+
require_relative "../readers/dic_reader"
|
|
8
|
+
|
|
9
|
+
module Kotoshu
|
|
10
|
+
module Dictionary
|
|
11
|
+
# Hunspell dictionary backend.
|
|
12
|
+
#
|
|
13
|
+
# This dictionary reads Hunspell-formatted dictionary files (.dic and .aff).
|
|
14
|
+
# Hunspell is the spell checker used by LibreOffice, Firefox, Chrome, and many
|
|
15
|
+
# other applications.
|
|
16
|
+
#
|
|
17
|
+
# File format:
|
|
18
|
+
# - .dic: Dictionary file with word count on first line, words with optional flags
|
|
19
|
+
# - .aff: Affix file with prefix/suffix rules and configuration
|
|
20
|
+
#
|
|
21
|
+
# @example Creating a Hunspell dictionary
|
|
22
|
+
# dict = Hunspell.new(
|
|
23
|
+
# dic_path: "en_US.dic",
|
|
24
|
+
# aff_path: "en_US.aff",
|
|
25
|
+
# language_code: "en-US"
|
|
26
|
+
# )
|
|
27
|
+
# dict.lookup?("hello") # => true
|
|
28
|
+
#
|
|
29
|
+
# @example Creating from GitHub cache
|
|
30
|
+
# dict = Hunspell.from_github("de")
|
|
31
|
+
# dict.lookup?("über") # => true
|
|
32
|
+
#
|
|
33
|
+
# @see https://hunspell.github.io/ Hunspell documentation
|
|
34
|
+
class Hunspell < Base
|
|
35
|
+
# @return [String] Path to the .dic file
|
|
36
|
+
attr_reader :dic_path
|
|
37
|
+
|
|
38
|
+
# @return [String] Path to the .aff file
|
|
39
|
+
attr_reader :aff_path
|
|
40
|
+
|
|
41
|
+
# @return [Hash] Affix rules (flag => array of rules)
|
|
42
|
+
attr_reader :affix_rules
|
|
43
|
+
|
|
44
|
+
# @return [Hash] Configuration options from affix file
|
|
45
|
+
attr_reader :aff_config
|
|
46
|
+
|
|
47
|
+
# @return [Hash] Raw aff data from AffReader (cached for Lookuper)
|
|
48
|
+
attr_reader :aff_data
|
|
49
|
+
|
|
50
|
+
# @return [Array] Raw words from DicReader (cached for Lookuper)
|
|
51
|
+
attr_reader :dic_words
|
|
52
|
+
|
|
53
|
+
# @return [Algorithms::Lookup::Lookuper] The lookup algorithm instance
|
|
54
|
+
def lookuper
  # Built on first use from the cached aff/dic data, then reused on later calls.
  return @lookuper if @lookuper

  @lookuper = Readers::LookupBuilder.from_data(@aff_data, @dic_words).build
end
|
|
57
|
+
|
|
58
|
+
class << self
|
|
59
|
+
# Load Hunspell dictionary from GitHub cache, downloading if necessary.
|
|
60
|
+
#
|
|
61
|
+
# This class method provides automatic dictionary management by:
|
|
62
|
+
# 1. Checking the local cache for existing dictionaries
|
|
63
|
+
# 2. Downloading from GitHub if not cached or expired
|
|
64
|
+
# 3. Managing cache metadata and TTL
|
|
65
|
+
#
|
|
66
|
+
# @example Load English dictionary
|
|
67
|
+
# dict = Hunspell.from_github("en")
|
|
68
|
+
# dict.lookup?("hello") # => true
|
|
69
|
+
#
|
|
70
|
+
# @example Load German dictionary
|
|
71
|
+
# dict = Hunspell.from_github("de")
|
|
72
|
+
# dict.lookup?("über") # => true
|
|
73
|
+
#
|
|
74
|
+
# @example Force re-download
|
|
75
|
+
# dict = Hunspell.from_github("fr", force_download: true)
|
|
76
|
+
#
|
|
77
|
+
# @param language_code [String] ISO 639-1 language code (e.g., 'en', 'de', 'fr')
|
|
78
|
+
# @param cache [Cache::LanguageCache, nil] Custom cache instance (optional)
|
|
79
|
+
# @param force_download [Boolean] Force re-download even if cached
|
|
80
|
+
# @return [Hunspell] Configured Hunspell dictionary instance
|
|
81
|
+
# @raise [ArgumentError] If language_code is not supported
|
|
82
|
+
def from_github(language_code, cache: nil, force_download: false)
  # The cache layer is only required when GitHub loading is actually used.
  require_relative '../cache/language_cache'

  language_cache = cache || Cache::LanguageCache.new
  entry = language_cache.get_dictionary(language_code, force_download: force_download)
  cache_meta = entry[:metadata]

  # Record where the files came from alongside the dictionary itself.
  provenance = {
    source: 'github',
    github_url: cache_meta['url'],
    checksum: cache_meta['checksum'],
    downloaded_at: cache_meta['downloaded_at']
  }

  new(
    dic_path: entry[:dic_path],
    aff_path: entry[:aff_path],
    language_code: language_code,
    metadata: provenance
  )
end
|
|
100
|
+
|
|
101
|
+
# Check if a language is available on GitHub.
|
|
102
|
+
#
|
|
103
|
+
# @param language_code [String] ISO 639-1 language code
|
|
104
|
+
# @param cache [Cache::LanguageCache, nil] Custom cache instance (optional)
|
|
105
|
+
# @return [Boolean] True if language is supported
|
|
106
|
+
def available_on_github?(language_code, cache: nil)
  require_relative '../cache/language_cache'

  # Fall back to a default cache when the caller did not supply one.
  supported = (cache || Cache::LanguageCache.new).available_languages
  supported.include?(language_code)
end
|
|
112
|
+
|
|
113
|
+
# Get list of available languages on GitHub.
|
|
114
|
+
#
|
|
115
|
+
# @param cache [Cache::LanguageCache, nil] Custom cache instance (optional)
|
|
116
|
+
# @return [Array<String>] List of supported language codes
|
|
117
|
+
def available_github_languages(cache: nil)
  require_relative '../cache/language_cache'

  # Fall back to a default cache when the caller did not supply one.
  language_cache = cache || Cache::LanguageCache.new
  language_cache.available_languages
end
|
|
123
|
+
|
|
124
|
+
# Get information about a language from GitHub.
|
|
125
|
+
#
|
|
126
|
+
# @param language_code [String] ISO 639-1 language code
|
|
127
|
+
# @param cache [Cache::LanguageCache, nil] Custom cache instance (optional)
|
|
128
|
+
# @return [Hash] Language information
|
|
129
|
+
def language_info(language_code, cache: nil)
  require_relative '../cache/language_cache'

  # Fall back to a default cache when the caller did not supply one.
  language_cache = cache || Cache::LanguageCache.new
  language_cache.get_language_info(language_code)
end
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
# Create a new Hunspell dictionary.
|
|
138
|
+
#
|
|
139
|
+
# @param dic_path [String] Path or URL to the .dic file
|
|
140
|
+
# @param aff_path [String] Path or URL to the .aff file
|
|
141
|
+
# @param language_code [String] The language code
|
|
142
|
+
# @param locale [String, nil] The locale (optional)
|
|
143
|
+
# @param metadata [Hash] Additional metadata (optional)
|
|
144
|
+
def initialize(dic_path:, aff_path:, language_code:, locale: nil, metadata: {})
  super(language_code, locale: locale, metadata: metadata)

  # URLs are downloaded to a temp file; local paths are expanded to absolute ones.
  @dic_path = resolve_path(dic_path)
  @aff_path = resolve_path(aff_path)

  # Fail fast (affix file checked first) when either file is missing.
  [@aff_path, @dic_path].each do |required_file|
    raise DictionaryNotFoundError, required_file unless File.exist?(required_file)
  end

  # Parse the affix file once and keep the result for the Lookuper.
  affix_reader = Readers::AffReader.new(@aff_path)
  @aff_data = affix_reader.read
  @aff_config = @aff_data # same object, kept for backward compatibility

  # The .dic file is read with the encoding and flag settings taken from the .aff file.
  dic_options = {
    encoding: affix_reader.encoding,
    flag_format: @aff_data['FLAG'] || 'short',
    flag_synonyms: @aff_data['AF'] || {}
  }
  @dic_words = Readers::DicReader.new(@dic_path, **dic_options).read

  # Legacy structures used by suggest / words / word_variants.
  @word_index = build_word_index(@dic_words)
  @affix_rules = parse_affix_rules(@aff_config)

  # The Lookuper is created lazily by #lookuper.
  @lookuper = nil

  # Register this dictionary type the first time an instance is created.
  self.class.register_type(:hunspell) unless Dictionary.registry.key?(:hunspell)
end
|
|
175
|
+
|
|
176
|
+
private
|
|
177
|
+
|
|
178
|
+
# Build word index from DicReader words.
|
|
179
|
+
#
|
|
180
|
+
# @param words [Array<Readers::Word>] Words from DicReader
|
|
181
|
+
# @return [Hash] Word index (word => flags)
|
|
182
|
+
def build_word_index(words)
  # Keys are lower-cased stems; a later entry with the same key replaces the earlier flags.
  words.each_with_object({}) do |entry, by_stem|
    by_stem[entry.stem.downcase] = entry.flags.to_a
  end
end
|
|
189
|
+
|
|
190
|
+
# Parse affix rules from AffReader data.
|
|
191
|
+
#
|
|
192
|
+
# @param aff_data [Hash] Aff data from AffReader
|
|
193
|
+
# @return [Hash] Affix rules by type
|
|
194
|
+
def parse_affix_rules(aff_data)
  # Each bucket hands out (and stores) a fresh empty list for flags it has not seen.
  converted = {
    prefix: Hash.new { |bucket, flag| bucket[flag] = [] },
    suffix: Hash.new { |bucket, flag| bucket[flag] = [] }
  }

  # AffReader exposes 'SFX' / 'PFX' as flag => [affix, ...]; a missing section is skipped.
  { 'SFX' => :suffix, 'PFX' => :prefix }.each do |section, kind|
    aff_data[section]&.each do |flag, affixes|
      converted[kind][flag] = affixes.map { |affix| convert_to_affix_rule(affix, kind) }
    end
  end

  converted
end
|
|
218
|
+
|
|
219
|
+
# Convert AffReader Affix to Models::AffixRule.
|
|
220
|
+
#
|
|
221
|
+
# @param affix [Readers::Affix] The affix to convert
|
|
222
|
+
# @param type [Symbol] :prefix or :suffix
|
|
223
|
+
# @return [Models::AffixRule] The converted rule
|
|
224
|
+
def convert_to_affix_rule(affix, type)
  # Re-serialise the parsed affix into one space-separated line and hand it to
  # Models::AffixRule.from_hunspell. Empty strip/add become '0'; a missing
  # condition becomes '.'.
  fields = [
    type == :prefix ? 'PFX' : 'SFX',
    affix.flag,
    affix.crossproduct ? 'Y' : 'N',
    affix.strip.empty? ? '0' : affix.strip,
    affix.add.empty? ? '0' : affix.add,
    affix.condition || '.'
  ]

  Models::AffixRule.from_hunspell(fields.map(&:to_s).join(' '), type)
end
|
|
237
|
+
|
|
238
|
+
# Check if path is a URL
|
|
239
|
+
# @param path [String] Path to check
|
|
240
|
+
# @return [Boolean] True if path is a URL
|
|
241
|
+
def url?(path)
  # Coerce to String first so Pathname arguments (which File.expand_path
  # accepts) are treated as local paths instead of raising NoMethodError.
  # The scheme is matched case-insensitively, since URI schemes are not
  # case-sensitive ("HTTPS://..." is a valid URL).
  path.to_s.match?(%r{\Ahttps?://}i)
end
|
|
244
|
+
|
|
245
|
+
# Resolve path to local file path (downloading if URL)
|
|
246
|
+
# @param path [String] Path or URL
|
|
247
|
+
# @return [String] Local file path
|
|
248
|
+
def resolve_path(path)
  # URLs are fetched to a temp file; anything else is treated as a local path.
  if url?(path)
    download_to_temp(path)
  else
    File.expand_path(path)
  end
end
|
|
253
|
+
|
|
254
|
+
# Download URL to temporary file
|
|
255
|
+
# @param url [String] URL to download
|
|
256
|
+
# @return [String] Temporary file path
|
|
257
|
+
def download_to_temp(url)
  require "tempfile"

  uri = URI.parse(url)
  filename = File.basename(uri.path.to_s)

  # Pass the extension as the Tempfile suffix so the random part goes between
  # the stem and the extension ("en_US<random>.dic") and the file keeps its
  # .dic/.aff ending.
  extension = File.extname(filename)
  stem = File.basename(filename, extension)

  temp = Tempfile.new([stem, extension])
  temp.binmode

  begin
    # NOTE(review): the download is not verified against any checksum here.
    URI.open(uri, "rb") do |remote_file|
      IO.copy_stream(remote_file, temp)
    end
    temp.close
  rescue StandardError
    # Do not leave a partial download behind when the transfer fails.
    temp.close!
    raise
  end

  # A Tempfile is unlinked once its object is garbage-collected. Keep a
  # reference for the lifetime of this dictionary so the returned path
  # stays valid.
  (@temp_files ||= []) << temp

  temp.path
end
|
|
273
|
+
|
|
274
|
+
public
|
|
275
|
+
|
|
276
|
+
# Check if a word exists in the dictionary.
|
|
277
|
+
#
|
|
278
|
+
# Uses the Lookup::Lookuper algorithm for full affix and compound support.
|
|
279
|
+
#
|
|
280
|
+
# @param word [String] The word to look up
|
|
281
|
+
# @return [Boolean] True if the word exists
|
|
282
|
+
def lookup(word)
  # nil and "" are rejected up front; everything else goes through the Lookuper.
  if word.nil? || word.empty?
    false
  else
    lookuper.call(word)
  end
end
|
|
288
|
+
|
|
289
|
+
# Generate spelling suggestions.
|
|
290
|
+
#
|
|
291
|
+
# @param word [String] The misspelled word
|
|
292
|
+
# @param max_suggestions [Integer] Maximum suggestions
|
|
293
|
+
# @return [Array<String>] List of suggested words
|
|
294
|
+
def suggest(word, max_suggestions: 10)
  return [] if word.nil? || word.empty?

  lookup_word = word.downcase

  # Candidates must start with all but the last character of the input
  # (the prefix is at least two characters long when the word allows it).
  prefix_len = [lookup_word.length - 1, 2].max
  prefix = lookup_word[0...prefix_len]

  # Stems and affix-derived forms can overlap. De-duplicate them so the same
  # suggestion is never returned twice and cannot use up max_suggestions.
  all_words = (@word_index.keys + generate_affix_variants).uniq
  candidates = all_words.select { |w| w.downcase.start_with?(prefix) }

  scored = candidates.each_with_index.map do |dict_word, position|
    [dict_word, edit_distance(lookup_word, dict_word.downcase), position]
  end

  # Keep near misses only (distance 1 or 2; an exact match is not a suggestion).
  # sort_by is not stable, so ties are broken by candidate position to make the
  # result deterministic.
  scored.select { |_, dist, _| dist.positive? && dist <= 2 }
        .sort_by { |_, dist, position| [dist, position] }
        .first(max_suggestions)
        .map(&:first)
end
|
|
314
|
+
|
|
315
|
+
# Add a word to the dictionary.
|
|
316
|
+
#
|
|
317
|
+
# @param word [String] The word to add
|
|
318
|
+
# @param flags [Array<String>] Morphological flags
|
|
319
|
+
# @return [Boolean] True if added
|
|
320
|
+
def add_word(word, flags: [])
  # NOTE(review): only @word_index is updated here (it backs suggest, words and
  # word_variants). #lookup goes through the Lookuper, which this method does
  # not touch, so confirm whether added words are meant to pass #lookup.
  if word.nil? || word.empty?
    false
  else
    # Keys are stored lower-cased; an existing entry's flags are replaced.
    @word_index.store(word.downcase, flags)
    true
  end
end
|
|
328
|
+
|
|
329
|
+
# Remove a word from the dictionary.
|
|
330
|
+
#
|
|
331
|
+
# @param word [String] The word to remove
|
|
332
|
+
# @return [Boolean] True if removed
|
|
333
|
+
def remove_word(word)
  if word.nil? || word.empty?
    false
  else
    # Hash#delete returns the stored flags, or nil when the key was absent.
    removed_flags = @word_index.delete(word.downcase)
    !removed_flags.nil?
  end
end
|
|
339
|
+
|
|
340
|
+
# Get all words in the dictionary.
|
|
341
|
+
#
|
|
342
|
+
# @return [Array<String>] All words
|
|
343
|
+
def words
  # Hash#keys builds a new Array on every call, so callers can modify the
  # result without affecting the index.
  @word_index.keys
end
|
|
346
|
+
|
|
347
|
+
# Get word variants using affix rules.
|
|
348
|
+
#
|
|
349
|
+
# @param word [String] The word
|
|
350
|
+
# @return [Array<String>] Word variants
|
|
351
|
+
def word_variants(word)
  return [] if word.nil? || word.empty?

  # Flags are looked up by the lower-cased word, but rules are applied to the
  # word exactly as given.
  own_flags = @word_index[word.downcase] || []

  # Prefix-derived forms come first, then suffix-derived ones.
  %i[prefix suffix].each_with_object([]) do |kind, variants|
    @affix_rules[kind].each do |flag, rules|
      next unless own_flags.include?(flag)

      rules.each do |rule|
        derived = rule.apply(word)
        variants << derived if derived
      end
    end
  end
end
|
|
382
|
+
|
|
383
|
+
private
|
|
384
|
+
|
|
385
|
+
# Load the dictionary file.
|
|
386
|
+
#
|
|
387
|
+
# @param path [String] Path to .dic file
|
|
388
|
+
# @return [Hash] Word index (word => flags)
|
|
389
|
+
def load_dic_file(path)
  index = {}

  # The first line is the word count and is always skipped. drop(1) also copes
  # with an empty file.
  File.readlines(path, chomp: true).drop(1).each do |line|
    next if line.start_with?("#") || line.strip.empty?

    parts = line.split("/")
    word = parts[0]

    # Skip entries with no word in front of the flag separator.
    next if word.nil? || word.strip.empty?

    # Entries may carry whitespace-separated morphological fields after the
    # flags ("word/AB po:noun"). Only the run of characters directly after
    # the slash is flag data; the rest must not be stored as flags.
    # Flags are split per character, so only single-character flags are handled.
    flag_field = parts[1] ? parts[1][/\A\S*/] : ""

    index[word.strip.downcase] = flag_field.chars
  end

  index
end
|
|
414
|
+
|
|
415
|
+
# Load the affix file.
|
|
416
|
+
#
|
|
417
|
+
# @param path [String] Path to .aff file
|
|
418
|
+
# @return [Hash] Configuration options
|
|
419
|
+
def load_aff_file(path)
  # Defaults used when the file does not override them.
  config = { set: "UTF-8", try: "", flag: "char", affix_rules: [] }

  # Directives whose first argument is stored as a single value.
  scalar_keys = { "SET" => :set, "TRY" => :try, "FLAG" => :flag }
  # Directives whose raw lines are collected under their lower-cased name.
  collected = %w[REP MAP COMPOUNDRULE COMPOUNDWORDMIN COMPOUNDFLAG]

  File.foreach(path, chomp: true) do |line|
    next if line.empty? || line.start_with?("#")

    directive, value = line.split
    next if directive.nil? # whitespace-only line

    keyword = directive.upcase

    if scalar_keys.key?(keyword)
      config[scalar_keys[keyword]] = value
    elsif %w[PFX SFX].include?(keyword)
      # Affix lines are kept verbatim for later parsing.
      config[:affix_rules] << line
    elsif collected.include?(keyword)
      (config[keyword.downcase.to_sym] ||= []) << line
    end
  end

  config
end
|
|
453
|
+
|
|
454
|
+
# Direct lookup without affix processing.
|
|
455
|
+
#
|
|
456
|
+
# @param word [String] The word
|
|
457
|
+
# @return [Boolean] True if word exists
|
|
458
|
+
def direct_lookup?(word)
  # Index keys are stored lower-cased, so the query is lower-cased the same way.
  @word_index.key?(word.downcase)
end
|
|
462
|
+
|
|
463
|
+
# Generate all possible affix variants.
|
|
464
|
+
#
|
|
465
|
+
# @return [Array<String>] All variants
|
|
466
|
+
def generate_affix_variants
  # Walk prefixes first, then suffixes. For every rule of every flag, apply the
  # rule to each indexed word carrying that flag and keep the forms it returns.
  derived = %i[prefix suffix].flat_map do |kind|
    @affix_rules[kind].flat_map do |flag, rules|
      rules.flat_map do |rule|
        @word_index.each_with_object([]) do |(stem, flags), forms|
          next unless flags.include?(flag)

          form = rule.apply(stem)
          forms << form if form
        end
      end
    end
  end

  derived.uniq
end
|
|
493
|
+
|
|
494
|
+
# Calculate Levenshtein edit distance.
|
|
495
|
+
#
|
|
496
|
+
# @param str1 [String] First string
|
|
497
|
+
# @param str2 [String] Second string
|
|
498
|
+
# @return [Integer] Edit distance
|
|
499
|
+
def edit_distance(str1, str2)
  # Distance to or from an empty string is the other string's length.
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  # Keep the row as short as possible: columns follow the shorter string.
  shorter, longer = str1.length > str2.length ? [str2, str1] : [str1, str2]
  short_chars = shorter.chars

  # row[c] is the distance between the first c chars of `shorter` and the
  # part of `longer` consumed so far.
  row = (0..short_chars.length).to_a

  longer.each_char.with_index(1) do |long_char, consumed|
    next_row = Array.new(short_chars.length + 1)
    next_row[0] = consumed

    short_chars.each_with_index do |short_char, col|
      mismatch = short_char == long_char ? 0 : 1
      next_row[col + 1] = [
        next_row[col] + 1,   # insertion
        row[col + 1] + 1,    # deletion
        row[col] + mismatch  # substitution (free when the chars match)
      ].min
    end

    row = next_row
  end

  row.last
end
|
|
524
|
+
end
|
|
525
|
+
end
|
|
526
|
+
end
|