kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,876 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Algorithms
|
|
5
|
+
# Main "is this word correct?" algorithm implementation.
|
|
6
|
+
#
|
|
7
|
+
# Ported from Spylls (Python) lookup.py
|
|
8
|
+
#
|
|
9
|
+
# On a bird's-eye view level:
|
|
10
|
+
# * Word correctness check is an attempt to analyze word form
|
|
11
|
+
# (maybe it has this suffix? maybe it has this prefix? maybe it
|
|
12
|
+
# consists of several words?)
|
|
13
|
+
# * The word is considered correct if at least one form is found that
|
|
14
|
+
# has valid suffixes/prefixes from .aff file and valid stem from
|
|
15
|
+
# .dic file, and they are all compatible with each other.
|
|
16
|
+
#
|
|
17
|
+
# To follow algorithm details, start reading from Lookup.call method.
|
|
18
|
+
module Lookup
|
|
19
|
+
# Matches a plain decimal number ("42", "3.14") and nothing else.
# Anchored with \A/\z so the WHOLE string must be numeric: ^/$ only anchor
# to line boundaries in Ruby and would accept input such as "12\nabc".
NUMBER_REGEXP = /\A\d+(\.\d+)?\z/.freeze
|
|
20
|
+
|
|
21
|
+
# Position of word part in compound word.
|
|
22
|
+
#
|
|
23
|
+
# Used when checking whether a word could be part of a compound
|
|
24
|
+
# (specifically its begin/middle/end).
|
|
25
|
+
module CompoundPos
  # Symbols naming where a word part sits inside a compound:
  # first part, any inner part, last part.
  BEGIN_POS, MIDDLE, END_POS = %i[begin middle end]
end
|
|
30
|
+
|
|
31
|
+
# AffixForm is a hypothesis of how some word might be split into
|
|
32
|
+
# stem, suffixes and prefixes.
|
|
33
|
+
#
|
|
34
|
+
# It always has full text and stem, and may have up to two suffixes
|
|
35
|
+
# and up to two prefixes.
|
|
36
|
+
#
|
|
37
|
+
# The following is always true (considering absent affixes as empty):
|
|
38
|
+
# prefix + prefix2 + stem + suffix2 + suffix = text
|
|
39
|
+
#
|
|
40
|
+
# prefix2/suffix2 are "secondary", so if the word has only one suffix,
|
|
41
|
+
# it is stored in suffix and suffix2 is nil.
|
|
42
|
+
class AffixForm
  # @return [String] The complete word this hypothesis describes
  attr_reader :text

  # @return [String] What remains of the word once affixes are removed
  attr_reader :stem

  # @return [Hash, nil] Primary prefix data, if any
  attr_reader :prefix

  # @return [Hash, nil] Primary suffix data, if any
  attr_reader :suffix

  # @return [Hash, nil] Secondary prefix data, if any
  attr_reader :prefix2

  # @return [Hash, nil] Secondary suffix data, if any
  attr_reader :suffix2

  # @return [Hash, nil] Dictionary entry the stem was matched against
  attr_reader :in_dictionary

  # @param text [String] Full word text
  # @param stem [String] Stem of the word
  # @param prefix [Hash, nil] Primary prefix
  # @param suffix [Hash, nil] Primary suffix
  # @param prefix2 [Hash, nil] Secondary prefix
  # @param suffix2 [Hash, nil] Secondary suffix
  # @param in_dictionary [Hash, nil] Dictionary entry for the stem
  def initialize(text, stem,
                 prefix: nil, suffix: nil,
                 prefix2: nil, suffix2: nil,
                 in_dictionary: nil)
    @text, @stem = text, stem
    @prefix, @prefix2 = prefix, prefix2
    @suffix, @suffix2 = suffix, suffix2
    @in_dictionary = in_dictionary
  end

  # Build a new form that equals this one except for the given fields.
  #
  # Keys other than the seven known fields are ignored. A key that is
  # present with a nil value overrides the current value with nil.
  #
  # @param changes [Hash] Field name => new value
  # @return [AffixForm] A fresh instance; the receiver is untouched
  def replace(**changes)
    fields = {
      text: @text, stem: @stem,
      prefix: @prefix, suffix: @suffix,
      prefix2: @prefix2, suffix2: @suffix2,
      in_dictionary: @in_dictionary
    }
    merged = fields.merge(changes.slice(*fields.keys))
    new_text = merged.delete(:text)
    new_stem = merged.delete(:stem)
    self.class.new(new_text, new_stem, **merged)
  end

  # Whether a primary prefix or primary suffix is present.
  # Secondary affixes are not consulted.
  #
  # @return [Boolean]
  def has_affixes?
    !(@suffix.nil? && @prefix.nil?)
  end

  # Whether the form carries no primary affix at all.
  #
  # @return [Boolean]
  def is_base?
    !has_affixes?
  end

  # Union of the flags of the dictionary entry, the primary prefix and the
  # primary suffix (each contributes only when present).
  #
  # @return [Set<String>] Combined flags
  def flags
    [@in_dictionary, @prefix, @suffix].each_with_object(Set.new) do |source, collected|
      collected.merge(source[:flags] || []) if source
    end
  end

  # Affixes that are actually set, ordered prefix2, prefix, suffix, suffix2.
  #
  # @return [Array<Hash>] List of affix data
  def all_affixes
    [@prefix2, @prefix, @suffix, @suffix2].reject(&:nil?)
  end

  # Human-readable rendering: the bare text for a base form, otherwise the
  # text followed by its decomposition.
  #
  # @return [String]
  def to_s
    return @text if is_base?

    leading = [@prefix, @prefix2].select(&:itself).map(&:inspect)
    trailing = [@suffix2, @suffix].select(&:itself).map(&:inspect)
    breakdown = [*leading, @stem, *trailing].join(' + ')

    "AffixForm(#{@text} = #{breakdown})"
  end

  alias_method :inspect, :to_s
end
|
|
142
|
+
|
|
143
|
+
# CompoundForm is a hypothesis of how some word could be split into
|
|
144
|
+
# several AffixForms (word parts with their own stems and possible affixes).
|
|
145
|
+
#
|
|
146
|
+
# Typically, only first part is allowed to have prefix, and only last
|
|
147
|
+
# part is allowed to have suffix, but there are languages where middle
|
|
148
|
+
# parts can have affixes too, specified by special flags.
|
|
149
|
+
class CompoundForm
  # @return [Array<AffixForm>] The word parts, in order
  attr_reader :parts

  # @param parts [Array<AffixForm>] Parts making up the compound
  def initialize(parts)
    @parts = parts
  end

  # Render the compound as its parts joined with " + ".
  #
  # @return [String]
  def to_s
    rendered = @parts.map { |part| part.to_s }
    format('CompoundForm(%s)', rendered.join(' + '))
  end

  alias_method :inspect, :to_s
end
|
|
166
|
+
|
|
167
|
+
# Main word correctness lookup class.
|
|
168
|
+
#
|
|
169
|
+
# Typically, you would not use this directly.
|
|
170
|
+
#
|
|
171
|
+
# Example:
|
|
172
|
+
# dictionary = Kotoshu::Dictionary.load('en_US')
|
|
173
|
+
# lookuper = dictionary.lookuper
|
|
174
|
+
#
|
|
175
|
+
# lookuper.call('spylls') # => false
|
|
176
|
+
# lookuper.call('spells') # => true
|
|
177
|
+
#
|
|
178
|
+
# lookuper.good_forms('spells') do |form|
|
|
179
|
+
# puts form
|
|
180
|
+
# end
|
|
181
|
+
# # AffixForm(spells = spells)
|
|
182
|
+
# # AffixForm(spells = spell + Suffix(s: S×, on [[^sxzhy]]$))
|
|
183
|
+
class Lookuper
|
|
184
|
+
# @return [Hash] Aff data structure (from aff file)
|
|
185
|
+
attr_reader :aff
|
|
186
|
+
|
|
187
|
+
# @return [Hash] Dic data structure (from dic file)
|
|
188
|
+
attr_reader :dic
|
|
189
|
+
|
|
190
|
+
# @param aff [Hash] Parsed affix data
# @param dic [Hash] Parsed dictionary data
def initialize(aff, dic)
  @aff, @dic = aff, dic
end
|
|
194
|
+
|
|
195
|
+
# The outermost word correctness check.
|
|
196
|
+
#
|
|
197
|
+
# Basically, prepares word for check (converting/removing chars), and
|
|
198
|
+
# then checks whether any good word form can be produced with good_forms.
|
|
199
|
+
# If there is none, also tries to break word by break-points.
|
|
200
|
+
#
|
|
201
|
+
# @param word [String] Word to check
|
|
202
|
+
# @param capitalization [Boolean] If false, check only exact capitalization
|
|
203
|
+
# @param allow_nosuggest [Boolean] If false, don't consider NOSUGGEST words as correct
|
|
204
|
+
# @return [Boolean] Whether word is correct
|
|
205
|
+
def call(word, capitalization: true, allow_nosuggest: true)
  # A word whose every dictionary entry is flagged FORBIDDENWORD is never correct.
  forbidden_flag = @aff[:FORBIDDENWORD]
  return false if forbidden_flag && @dic[:has_flag]&.call(word, forbidden_flag, for_all: true)

  # Input conversion (ICONV), applied only when a converter is configured.
  converter = @aff[:ICONV]
  candidate = converter ? converter.call(word) : word

  # Drop every character listed in IGNORE.
  ignored = @aff[:IGNORE]
  candidate = candidate.chars.reject { |char| ignored.include?(char) }.join if ignored

  # Plain numbers are accepted without consulting the dictionary.
  return true if NUMBER_REGEXP.match?(candidate)

  # Accept as soon as one breaking has only correct (or empty) parts.
  break_word(candidate).any? do |pieces|
    pieces.all? do |piece|
      piece.empty? ||
        good_forms(piece, capitalization: capitalization, allow_nosuggest: allow_nosuggest).any?
    end
  end
end
|
|
237
|
+
|
|
238
|
+
# Recursively produce all possible lists of word breaking by break
|
|
239
|
+
# patterns (like dashes).
|
|
240
|
+
#
|
|
241
|
+
# Example: "pre-processed-meat" would produce:
|
|
242
|
+
# ["pre-processed-meat"]
|
|
243
|
+
# ["pre", "processed-meat"]
|
|
244
|
+
# ["pre", "processed", "meat"]
|
|
245
|
+
# ["pre-processed", "meat"]
|
|
246
|
+
#
|
|
247
|
+
# This is necessary because dictionary might contain "pre-processed"
|
|
248
|
+
# as a separate entry.
|
|
249
|
+
#
|
|
250
|
+
# @param text [String] Text to break
|
|
251
|
+
# @param depth [Integer] Current recursion depth
|
|
252
|
+
# @yield [Array<String>] Each possible breaking
|
|
253
|
+
# @return [Enumerator] If no block given
|
|
254
|
+
# Enumerate every way +text+ can be split at the configured BREAK patterns.
#
# The unsplit text is always yielded first. Then, for each pattern and each
# match, the text before capture group 1 is combined with every breaking of
# the text after group 1 (computed recursively).
#
# Each entry of @aff[:BREAK] is read as pattern[:matcher], a Regexp whose
# capture group 1 marks the break characters.
# NOTE(review): a matcher without group 1 would raise in MatchData#begin —
# confirm all BREAK matchers define it.
#
# @param text [String] Text to break
# @param depth [Integer] Current recursion depth; recursion stops past 10
# @yield [Array<String>] Each possible breaking
# @return [Enumerator] If no block given
def break_word(text, depth = 0)
  return enum_for(:break_word, text, depth) unless block_given?
  return if depth > 10

  # The whole text is always the first option.
  yield [text]

  str = text.to_s
  (@aff[:BREAK] || []).each do |pattern|
    pos = 0

    while (match_data = pattern[:matcher].match(str, pos))
      start = str[0...match_data.begin(1)]
      rest = str[match_data.end(1)..]

      break_word(rest, depth + 1) do |breaking|
        yield [start, *breaking]
      end

      # Continue after the current match. A zero-width match would leave the
      # position unchanged and loop forever, so step one character forward.
      pos = match_data.end(0)
      pos += 1 if match_data.end(0) == match_data.begin(0)
      break if pos >= str.length
    end
  end
end
|
|
279
|
+
|
|
280
|
+
# The main producer of correct word forms.
|
|
281
|
+
#
|
|
282
|
+
# Produces all ways the proposed string might correspond to dictionary/
|
|
283
|
+
# affixes. If there is at least one, the word is correctly spelled.
|
|
284
|
+
#
|
|
285
|
+
# Example:
|
|
286
|
+
# lookuper.good_forms('building') do |form|
|
|
287
|
+
# puts form
|
|
288
|
+
# end
|
|
289
|
+
# # AffixForm(building = building) # noun
|
|
290
|
+
# # AffixForm(building = build + Suffix(ing: G×, on [[^e]]$)) # verb
|
|
291
|
+
#
|
|
292
|
+
# @param word [String] Word to check
|
|
293
|
+
# @param capitalization [Boolean] If false, use only exact capitalization
|
|
294
|
+
# @param allow_nosuggest [Boolean] If false, exclude NOSUGGEST words
|
|
295
|
+
# @param affix_forms [Boolean] If false, only return compound forms
|
|
296
|
+
# @param compound_forms [Boolean] If false, only return affix forms
|
|
297
|
+
# @yield [AffixForm, CompoundForm] Each valid word form
|
|
298
|
+
def good_forms(word,
               capitalization: true,
               allow_nosuggest: true,
               affix_forms: true,
               compound_forms: true)
  unless block_given?
    return enum_for(:good_forms, word,
                    capitalization: capitalization,
                    allow_nosuggest: allow_nosuggest,
                    affix_forms: affix_forms,
                    compound_forms: compound_forms)
  end

  # Either expand the word into its casing variants, or keep it exactly as
  # written and only classify its capitalization.
  casing = @aff[:casing]
  captype, variants =
    if capitalization
      casing.variants(word)
    else
      [casing.guess(word), [word]]
    end

  keepcase_flag = @aff[:KEEPCASE]
  sharp_s_rule = @aff[:CHECKSHARPS] && keepcase_flag

  variants.each do |variant|
    if affix_forms
      affix_forms_internal(variant, captype: captype, allow_nosuggest: allow_nosuggest) do |form|
        if sharp_s_rule
          # German ß rule: an all-caps word that still contains ß is rejected
          # when the matched stem has ß and the form carries KEEPCASE.
          entry = form.in_dictionary
          stem = entry ? entry[:stem] : form.stem
          rejected = stem.include?('ß') &&
                     captype == Capitalization::Type::ALL &&
                     word.include?('ß') &&
                     form.flags.include?(keepcase_flag)
          next if rejected
        end

        yield form
      end
    end

    next unless compound_forms

    compound_forms_internal(variant, captype: captype, allow_nosuggest: allow_nosuggest) do |form|
      yield form
    end
  end
end
|
|
343
|
+
|
|
344
|
+
# Check if the word is correct without yielding forms.
|
|
345
|
+
#
|
|
346
|
+
# Convenience method for simple correctness checks.
|
|
347
|
+
#
|
|
348
|
+
# @param word [String] Word to check
|
|
349
|
+
# @param capitalization [Boolean] Check capitalization variants
|
|
350
|
+
# @param allow_nosuggest [Boolean] Include NOSUGGEST words
|
|
351
|
+
# @param affix_forms [Boolean] Check affix forms
|
|
352
|
+
# @param compound_forms [Boolean] Check compound forms
|
|
353
|
+
# @return [Boolean] Whether word is correct
|
|
354
|
+
def correct?(word,
             capitalization: true,
             allow_nosuggest: true,
             affix_forms: true,
             compound_forms: true)
  options = {
    capitalization: capitalization,
    allow_nosuggest: allow_nosuggest,
    affix_forms: affix_forms,
    compound_forms: compound_forms
  }

  # One acceptable form is enough; stop at the first one produced.
  good_forms(word, **options).each { |form| return true if form }
  false
end
|
|
365
|
+
|
|
366
|
+
# Alias for better readability
|
|
367
|
+
alias is_correct? correct?
|
|
368
|
+
|
|
369
|
+
private
|
|
370
|
+
|
|
371
|
+
# Internal affix forms generator.
|
|
372
|
+
#
|
|
373
|
+
# @param word [String] Word to process
|
|
374
|
+
# @param captype [Symbol] Capitalization type
|
|
375
|
+
# @param allow_nosuggest [Boolean] Include NOSUGGEST words
|
|
376
|
+
# @yield [AffixForm] Each valid affix form
|
|
377
|
+
# Yield every affix form of +word+ whose stem is found in the dictionary
# and which passes is_good_form.
#
# For each hypothesis from produce_affix_forms, the stem's homonyms are
# looked up through @dic[:homonyms]; each homonym is attached to a copy of
# the form and checked.
#
# By default, enumeration stops entirely as soon as an affixed form's stem
# has a homonym flagged FORBIDDENWORD. Pass with_forbidden: true to keep
# going instead, so the caller can inspect forbidden forms itself.
#
# @param word [String] Word to process
# @param captype [Symbol] Capitalization type
# @param allow_nosuggest [Boolean] Include NOSUGGEST words
# @param with_forbidden [Boolean] If true, do not stop on FORBIDDENWORD stems
# @yield [AffixForm] Each valid affix form
# @return [Enumerator] If no block given
def affix_forms_internal(word, captype:, allow_nosuggest:, with_forbidden: false)
  unless block_given?
    return enum_for(:affix_forms_internal, word,
                    captype: captype,
                    allow_nosuggest: allow_nosuggest,
                    with_forbidden: with_forbidden)
  end

  forbidden_flag = @aff[:FORBIDDENWORD]

  produce_affix_forms(word).each do |form|
    homonyms = @dic[:homonyms]&.call(form.stem) || []
    next if homonyms.empty?

    # A forbidden stem behind an affixed form ends the whole search,
    # unless the caller asked to see forbidden forms.
    if !with_forbidden && forbidden_flag && form.has_affixes? &&
       homonyms.any? { |h| (h[:flags] || []).include?(forbidden_flag) }
      return
    end

    homonyms.each do |homonym|
      candidate = form.replace(in_dictionary: homonym)
      yield candidate if is_good_form(candidate, captype: captype, allow_nosuggest: allow_nosuggest)
    end

    # FORCEUCASE: for an initial-capital word, also try the lowercased stem.
    next unless captype == Capitalization::Type::INIT && @aff[:FORCEUCASE]

    lower_homonyms = @dic[:homonyms]&.call(form.stem.downcase) || []
    lower_homonyms.each do |homonym|
      candidate = form.replace(in_dictionary: homonym)
      yield candidate if is_good_form(candidate, captype: captype, allow_nosuggest: allow_nosuggest)
    end
  end
end
|
|
413
|
+
|
|
414
|
+
# Internal compound forms generator.
|
|
415
|
+
#
|
|
416
|
+
# @param word [String] Word to process
|
|
417
|
+
# @param captype [Symbol] Capitalization type
|
|
418
|
+
# @param allow_nosuggest [Boolean] Include NOSUGGEST words
|
|
419
|
+
# @yield [CompoundForm] Each valid compound form
|
|
420
|
+
def compound_forms_internal(word, captype:, allow_nosuggest:)
  unless block_given?
    return enum_for(:compound_forms_internal, word,
                    captype: captype,
                    allow_nosuggest: allow_nosuggest)
  end

  # If the word, read as a single affix form, carries FORBIDDENWORD, no
  # compound reading is produced at all.
  # NOTE(review): this passes `with_forbidden: true` — confirm that
  # affix_forms_internal accepts that keyword.
  forbidden_flag = @aff[:FORBIDDENWORD]
  if forbidden_flag
    candidates = affix_forms_internal(word,
                                      captype: captype,
                                      allow_nosuggest: allow_nosuggest,
                                      with_forbidden: true)
    return if candidates.any? { |form| form.flags.include?(forbidden_flag) }
  end

  # Compounds assembled from COMPOUNDBEGIN / COMPOUNDFLAG markers.
  if @aff[:COMPOUNDBEGIN] || @aff[:COMPOUNDFLAG]
    compounds_by_flags(word, captype: captype, allow_nosuggest: allow_nosuggest) do |compound|
      next if is_bad_compound(compound, captype)

      yield compound
    end
  end

  # Compounds assembled from COMPOUNDRULE patterns.
  return unless @aff[:COMPOUNDRULE]

  compounds_by_rules(word, allow_nosuggest: allow_nosuggest) do |compound|
    next if is_bad_compound(compound, captype)

    yield compound
  end
end
|
|
449
|
+
|
|
450
|
+
# Produce all possible affix forms for a word.
|
|
451
|
+
#
|
|
452
|
+
# @param word [String] Word to process
|
|
453
|
+
# @param compoundpos [Symbol, nil] Position in compound
|
|
454
|
+
# @param prefix_flags [Array<String>] Required prefix flags
|
|
455
|
+
# @param suffix_flags [Array<String>] Required suffix flags
|
|
456
|
+
# @param forbidden_flags [Array<String>] Forbidden affix flags
|
|
457
|
+
# @yield [AffixForm] Each possible affix form
|
|
458
|
+
def produce_affix_forms(word,
                        compoundpos: nil,
                        prefix_flags: [],
                        suffix_flags: [],
                        forbidden_flags: [])
  unless block_given?
    return enum_for(:produce_affix_forms, word,
                    compoundpos: compoundpos,
                    prefix_flags: prefix_flags,
                    suffix_flags: suffix_flags,
                    forbidden_flags: forbidden_flags)
  end

  # The untouched word is always the first hypothesis.
  yield AffixForm.new(word, word)

  # Outside a compound both affix kinds may be stripped. Inside one, suffixes
  # belong to the last part and prefixes to the first, unless specific flags
  # were requested.
  standalone = compoundpos.nil?
  may_strip_suffix = standalone || compoundpos == CompoundPos::END_POS || !suffix_flags.empty?
  may_strip_prefix = standalone || compoundpos == CompoundPos::BEGIN_POS || !prefix_flags.empty?

  if may_strip_suffix
    desuffix(word, required_flags: suffix_flags, forbidden_flags: forbidden_flags) do |suffixed|
      yield suffixed
    end
  end

  return unless may_strip_prefix

  deprefix(word, required_flags: prefix_flags, forbidden_flags: forbidden_flags) do |prefixed|
    yield prefixed

    # Prefix and suffix together: only when the prefix is marked as
    # cross-product; desuffix is then asked for cross-product suffixes only.
    outer_prefix = prefixed.prefix
    next unless may_strip_suffix && outer_prefix && outer_prefix[:crossproduct]

    desuffix(prefixed.stem,
             required_flags: suffix_flags,
             forbidden_flags: forbidden_flags,
             crossproduct: true) do |inner|
      yield inner.replace(text: prefixed.text, prefix: prefixed.prefix)
    end
  end
end
|
|
500
|
+
|
|
501
|
+
# Remove suffixes from word.
|
|
502
|
+
#
|
|
503
|
+
# @param word [String] Word to process
|
|
504
|
+
# @param required_flags [Array<String>] Required suffix flags
|
|
505
|
+
# @param forbidden_flags [Array<String>] Forbidden suffix flags
|
|
506
|
+
# @param nested [Boolean] Whether this is a nested call
|
|
507
|
+
# @param crossproduct [Boolean] Whether suffix must have crossproduct
|
|
508
|
+
# @yield [AffixForm] Each form with suffix removed
|
|
509
|
+
def desuffix(word, required_flags: [], forbidden_flags: [], nested: false, crossproduct: false)
  unless block_given?
    return enum_for(:desuffix, word,
                    required_flags: required_flags,
                    forbidden_flags: forbidden_flags,
                    nested: nested,
                    crossproduct: crossproduct)
  end

  # Suffix rules are bucketed by the last character of the word. The lookup
  # is read-only: a missing bucket (or an empty word, whose last character is
  # nil) just means "no candidates", so the shared index is never written to.
  candidates = (@aff[:suffixes_index] || {})[word[-1]] || []

  candidates.each do |suffix|
    next if crossproduct && !suffix[:crossproduct]

    suffix_flags = suffix[:flags] || []
    next unless (required_flags - suffix_flags).empty?
    next unless (forbidden_flags & suffix_flags).empty?

    affix = suffix[:affix]
    next unless word.end_with?(affix)

    # Slice by explicit length: `word[0...-0]` would be "" for a zero-length
    # affix, losing the whole word instead of keeping it.
    stem = word[0, word.length - affix.length] + suffix[:affix_data][:strip]

    # The condition is only checked when the rule carries a checker.
    checker = suffix[:condition_checker]
    next if checker && !checker.matches?(stem)

    yield AffixForm.new(word, stem, suffix: suffix)

    # A second suffix may be removed, but only one level deep. The inner
    # suffix must carry the flag of the suffix that was just removed.
    next if nested

    desuffix(stem,
             required_flags: [suffix[:flag], *required_flags],
             forbidden_flags: forbidden_flags,
             nested: true,
             crossproduct: crossproduct) do |inner|
      yield inner.replace(suffix2: suffix, text: word)
    end
  end
end
|
|
549
|
+
|
|
550
|
+
# Remove prefixes from word.
|
|
551
|
+
#
|
|
552
|
+
# @param word [String] Word to process
|
|
553
|
+
# @param required_flags [Array<String>] Required prefix flags
|
|
554
|
+
# @param forbidden_flags [Array<String>] Forbidden prefix flags
|
|
555
|
+
# @param nested [Boolean] Whether this is a nested call
|
|
556
|
+
# @yield [AffixForm] Each form with prefix removed
|
|
557
|
+
def deprefix(word, required_flags: [], forbidden_flags: [], nested: false)
  unless block_given?
    return enum_for(:deprefix, word,
                    required_flags: required_flags,
                    forbidden_flags: forbidden_flags,
                    nested: nested)
  end

  # Prefix rules are bucketed by the first character of the word. The lookup
  # is read-only: a missing bucket (or an empty word, whose first character
  # is nil) just means "no candidates", so the shared index is never written to.
  candidates = (@aff[:prefixes_index] || {})[word[0]] || []

  candidates.each do |prefix|
    prefix_flags = prefix[:flags] || []
    next unless (required_flags - prefix_flags).empty?
    next unless (forbidden_flags & prefix_flags).empty?

    affix = prefix[:affix]
    next unless word.start_with?(affix)

    # A prefix rule strips characters from the *beginning* of the stem before
    # adding the affix, so undoing it puts the stripped text back in front of
    # the remainder (not after it).
    stem = prefix[:affix_data][:strip] + word[affix.length..]

    # The condition is only checked when the rule carries a checker.
    checker = prefix[:condition_checker]
    next if checker && !checker.matches?(stem)

    yield AffixForm.new(word, stem, prefix: prefix)

    # A second prefix is only tried when COMPLEXPREFIXES is set, and only one
    # level deep. The inner prefix must carry the flag of the one just removed.
    next if nested || !@aff[:COMPLEXPREFIXES]

    deprefix(stem,
             required_flags: [prefix[:flag], *required_flags],
             forbidden_flags: forbidden_flags,
             nested: true) do |inner|
      yield inner.replace(prefix2: prefix, text: word)
    end
  end
end
|
|
593
|
+
|
|
594
|
+
# Check if an affix form is valid.
|
|
595
|
+
#
|
|
596
|
+
# @param form [AffixForm] Form to check
|
|
597
|
+
# @param captype [Symbol] Original word's capitalization type
|
|
598
|
+
# @param allow_nosuggest [Boolean] Include NOSUGGEST words
|
|
599
|
+
# @return [Boolean] Whether form is valid
|
|
600
|
+
def is_good_form(form, captype:, allow_nosuggest:)
  entry = form.in_dictionary
  return false unless entry

  root_flags = entry[:flags] || []
  all_flags = form.flags

  # NOSUGGEST: the root is rejected unless the caller allows such words.
  nosuggest = @aff[:NOSUGGEST]
  return false if !allow_nosuggest && nosuggest && root_flags.include?(nosuggest)

  # KEEPCASE: the checked word must have the same capitalization type as the
  # dictionary stem, except for stems containing 'ß' when CHECKSHARPS is set.
  keepcase = @aff[:KEEPCASE]
  if keepcase && root_flags.include?(keepcase)
    stem_captype = @aff[:casing].guess(entry[:stem])
    sharps_exempt = @aff[:CHECKSHARPS] && entry[:stem].include?('ß')
    return false if captype != stem_captype && !sharps_exempt
  end

  # NEEDAFFIX: a flagged root needs at least one affix, and the affixes
  # present must not all carry the flag themselves.
  needaffix = @aff[:NEEDAFFIX]
  if needaffix
    return false if root_flags.include?(needaffix) && !form.has_affixes?
    return false if form.has_affixes? && form.all_affixes.all? { |a| (a[:flags] || []).include?(needaffix) }
  end

  # The flag of each applied affix has to be among the form's flags.
  return false if form.prefix && !all_flags.include?(form.prefix[:flag])
  return false if form.suffix && !all_flags.include?(form.suffix[:flag])

  # CIRCUMFIX: either both prefix and suffix carry the flag or neither does.
  # Both sides are reduced to strict booleans first; otherwise a missing
  # affix (nil) would compare unequal to a present, unflagged one (false).
  circumfix = @aff[:CIRCUMFIX]
  if circumfix
    carries = ->(affix) { !affix.nil? && (affix[:flags] || []).include?(circumfix) }
    return false if carries.call(form.prefix) != carries.call(form.suffix)
  end

  # No compound position is considered here, so the only remaining rule is
  # that the form must not be marked ONLYINCOMPOUND.
  !all_flags.include?(@aff[:ONLYINCOMPOUND])
end
|
|
649
|
+
|
|
650
|
+
# Generate compound forms by flags.
|
|
651
|
+
#
|
|
652
|
+
# @param word_rest [String] Remaining word to process
|
|
653
|
+
# @param captype [Symbol] Capitalization type
|
|
654
|
+
# @param depth [Integer] Current recursion depth
|
|
655
|
+
# @param allow_nosuggest [Boolean] Include NOSUGGEST words
|
|
656
|
+
# @yield [CompoundForm] Each valid compound form
|
|
657
|
+
def compounds_by_flags(word_rest, captype:, depth: 0, allow_nosuggest: true)
  unless block_given?
    return enum_for(:compounds_by_flags, word_rest,
                    captype: captype,
                    depth: depth,
                    allow_nosuggest: allow_nosuggest)
  end

  aff = @aff
  compound_min = aff[:COMPOUNDMIN] || 3
  compound_word_max = aff[:COMPOUNDWORDMAX]
  permit_flags = aff[:COMPOUNDPERMITFLAG] ? [aff[:COMPOUNDPERMITFLAG]] : []

  # Trailing arguments shared by every position check below.
  position_args = [aff[:COMPOUNDBEGIN], aff[:COMPOUNDMIDDLE], aff[:COMPOUNDEND],
                   aff[:COMPOUNDFLAG], permit_flags]

  # NOTE(review): COMPOUNDFORBIDFLAG and per-position prefix flags are not
  # forwarded to affix_forms_internal here; confirm that is intentional.

  # Past the first part, the whole remainder may itself close the compound.
  # The position is given as the CompoundPos constant, like the other checks
  # in this method, rather than as a bare symbol.
  if depth.positive?
    affix_forms_internal(word_rest, captype: captype, allow_nosuggest: allow_nosuggest) do |form|
      yield CompoundForm.new([form]) if can_be_at_compound_pos?(form, CompoundPos::END_POS, *position_args)
    end
  end

  # Stop when the remainder is too short to hold two more parts or the
  # configured maximum number of parts has been reached.
  return if word_rest.length < compound_min * 2
  return if compound_word_max && depth >= compound_word_max

  compoundpos = depth.zero? ? CompoundPos::BEGIN_POS : CompoundPos::MIDDLE

  # Try every split that leaves at least compound_min characters on each side.
  (compound_min...(word_rest.length - compound_min + 1)).each do |pos|
    beg = word_rest[0...pos]
    rest = word_rest[pos..]

    affix_forms_internal(beg, captype: captype, allow_nosuggest: allow_nosuggest) do |form|
      next unless can_be_at_compound_pos?(form, compoundpos, *position_args)

      compounds_by_flags(rest, captype: captype, depth: depth + 1, allow_nosuggest: allow_nosuggest) do |partial|
        yield CompoundForm.new([form, *partial.parts])
      end
    end

    # SIMPLIFIEDTRIPLE: when the split falls between two equal letters, also
    # try the left part with that letter doubled, reporting it under the
    # text that actually appears in the word.
    next unless aff[:SIMPLIFIEDTRIPLE] && !beg.empty? && !rest.empty? && beg[-1] == rest[0]

    affix_forms_internal(beg + beg[-1], captype: captype, allow_nosuggest: allow_nosuggest) do |form|
      next unless can_be_at_compound_pos?(form, compoundpos, *position_args)

      compounds_by_flags(rest, captype: captype, depth: depth + 1, allow_nosuggest: allow_nosuggest) do |partial|
        yield CompoundForm.new([form.replace(text: beg), *partial.parts])
      end
    end
  end
end
|
|
720
|
+
|
|
721
|
+
# Generate compound forms by rules.
|
|
722
|
+
#
|
|
723
|
+
# @param word_rest [String] Remaining word to process
|
|
724
|
+
# @param prev_parts [Array<Hash>] Previously processed parts
|
|
725
|
+
# @param rules [Array<Hash>] Valid compound rules
|
|
726
|
+
# @param allow_nosuggest [Boolean] Include NOSUGGEST words
|
|
727
|
+
# @yield [CompoundForm] Each valid compound form
|
|
728
|
+
def compounds_by_rules(word_rest, prev_parts: [], rules: nil, allow_nosuggest: true)
  unless block_given?
    return enum_for(:compounds_by_rules, word_rest,
                    prev_parts: prev_parts,
                    rules: rules,
                    allow_nosuggest: allow_nosuggest)
  end

  aff = @aff
  compound_min = aff[:COMPOUNDMIN] || 3
  compound_word_max = aff[:COMPOUNDWORDMAX]

  # Rules still in play for this branch: the caller's (already narrowed) set
  # if one was passed, otherwise every COMPOUNDRULE from the affix data.
  rules ||= aff[:COMPOUNDRULE] || []

  # With at least one part already consumed, the remainder may be the final
  # part: accept it if the flag sets of all parts fully match some rule.
  if prev_parts.any?
    (@dic[:homonyms]&.call(word_rest) || []).each do |homonym|
      flag_sets = [*prev_parts, homonym].map { |part| part[:flags] || [] }
      next unless rules.any? { |rule| rule[:full_match]&.call(flag_sets) }

      yield CompoundForm.new([AffixForm.new(word_rest, word_rest)])
    end
  end

  # Stop when the remainder is too short to hold two more parts or the
  # configured maximum number of parts has been reached.
  return if word_rest.length < compound_min * 2
  return if compound_word_max && prev_parts.length >= compound_word_max

  # Try every split that leaves at least compound_min characters on each side.
  (compound_min...(word_rest.length - compound_min + 1)).each do |pos|
    beg = word_rest[0...pos]

    (@dic[:homonyms]&.call(beg) || []).each do |homonym|
      parts = [*prev_parts, homonym]
      flag_sets = parts.map { |part| part[:flags] || [] }

      # Only rules that can still match the parts seen so far are carried
      # into the recursion.
      matching_rules = rules.select { |rule| rule[:partial_match]&.call(flag_sets) }
      next if matching_rules.empty?

      compounds_by_rules(word_rest[pos..],
                         prev_parts: parts,
                         rules: matching_rules,
                         allow_nosuggest: allow_nosuggest) do |partial|
        yield CompoundForm.new([AffixForm.new(beg, beg), *partial.parts])
      end
    end
  end
end
|
|
776
|
+
|
|
777
|
+
# Check if form can be at specified compound position.
|
|
778
|
+
#
|
|
779
|
+
# @param form [AffixForm] Form to check
|
|
780
|
+
# @param pos [Symbol] Compound position
|
|
781
|
+
# @return [Boolean]
|
|
782
|
+
def can_be_at_compound_pos?(form, pos, compound_begin, compound_middle, compound_end, compound_flag, permit_flags)
  form_flags = form.flags

  # The general compound flag allows the form at any position.
  return true if compound_flag && form_flags.include?(compound_flag)

  case pos
  when CompoundPos::BEGIN_POS
    # The first part is accepted on the begin flag alone.
    return form_flags.include?(compound_begin)
  when CompoundPos::MIDDLE
    position_flag = compound_middle
  when CompoundPos::END_POS
    position_flag = compound_end
  else
    # Any other position value is never acceptable.
    return false
  end

  # Middle and end parts are accepted on their position flag or on any of
  # the permit flags.
  form_flags.include?(position_flag) || permit_flags.any? { |flag| form_flags.include?(flag) }
end
|
|
798
|
+
|
|
799
|
+
# Check if compound form has any issues.
|
|
800
|
+
#
|
|
801
|
+
# @param compound [CompoundForm] Compound to check
|
|
802
|
+
# @param captype [Symbol] Capitalization type
|
|
803
|
+
# @return [Boolean] Whether compound is bad
|
|
804
|
+
def is_bad_compound(compound, captype)
  aff = @aff
  parts = compound.parts

  # FORCEUCASE: unless the word is capitalized (ALL or INIT), the compound is
  # bad when its last part carries the FORCEUCASE flag in the dictionary.
  if aff[:FORCEUCASE] && ![Capitalization::Type::ALL, Capitalization::Type::INIT].include?(captype)
    return true if @dic[:has_flag]&.call(parts.last.text, aff[:FORCEUCASE])
  end

  # Every rule below looks at one boundary between two neighbouring parts.
  parts.each_cons(2).with_index do |(left_paradigm, right_paradigm), idx|
    left = left_paradigm.text
    right = right_paradigm.text

    # COMPOUNDFORBIDFLAG is only looked up for the left part of each pair.
    return true if aff[:COMPOUNDFORBIDFLAG] && @dic[:has_flag]&.call(left, aff[:COMPOUNDFORBIDFLAG])

    # "left right" written with a space is itself a known form.
    return true if @affix_forms&.call(left + ' ' + right, captype: captype)&.any?

    # CHECKCOMPOUNDREP: some REP replacement of the joined text is a known form.
    if aff[:CHECKCOMPOUNDREP] && aff[:REP]
      Kotoshu::Algorithms::Permutations.replchars(left + right, aff[:REP]) do |candidate|
        return true if candidate.is_a?(String) && @affix_forms&.call(candidate, captype: captype)&.any?
      end
    end

    # CHECKCOMPOUNDTRIPLE: the same letter three times in a row across the
    # boundary (two from one side, one from the other). Each window must
    # really be three characters long, so parts shorter than two characters
    # neither raise nor get flagged for a mere double letter.
    if aff[:CHECKCOMPOUNDTRIPLE]
      windows = [left[-2, 2].to_s + right[0].to_s, left[-1].to_s + right[0, 2].to_s]
      return true if windows.any? { |chunk| chunk.length == 3 && chunk.chars.uniq.length == 1 }
    end

    # CHECKCOMPOUNDCASE: a boundary character that equals its own upcased
    # form on either side, unless one of the two is a hyphen.
    if aff[:CHECKCOMPOUNDCASE]
      right_c = right[0]
      left_c = left[-1]
      if right_c && left_c &&
         (right_c == right_c.upcase || left_c == left_c.upcase) && right_c != '-' && left_c != '-'
        return true
      end
    end

    # CHECKCOMPOUNDPATTERN: any configured pattern matches this pair.
    if aff[:CHECKCOMPOUNDPATTERN]
      return true if aff[:CHECKCOMPOUNDPATTERN].any? { |pattern| pattern[:match]&.call(left_paradigm, right_paradigm) }
    end

    # CHECKCOMPOUNDDUP: the last two parts are the same word.
    return true if aff[:CHECKCOMPOUNDDUP] && left == right && idx == parts.length - 2
  end

  false
end
|
|
873
|
+
end
|
|
874
|
+
end
|
|
875
|
+
end
|
|
876
|
+
end
|