kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Suggestions
|
|
5
|
+
# Pipeline for composable suggestion strategies.
|
|
6
|
+
#
|
|
7
|
+
# Allows chaining multiple suggestion strategies that execute in sequence,
|
|
8
|
+
# with optional early termination when a stage produces no results.
|
|
9
|
+
#
|
|
10
|
+
# @example Creating a pipeline
|
|
11
|
+
# pipeline = Pipeline.new do |p|
|
|
12
|
+
# p.add :sym_spell
|
|
13
|
+
# p.add :phonetic
|
|
14
|
+
# p.add :ngram
|
|
15
|
+
# end
|
|
16
|
+
#
|
|
17
|
+
# @example Executing a pipeline
|
|
18
|
+
# result = pipeline.execute(context, strategies)
|
|
19
|
+
class Pipeline
  # @return [Array<Symbol>] Ordered stage names
  attr_reader :stages

  # Create a new, empty pipeline.
  #
  # @yield [pipeline] Optional block that receives the new pipeline so
  #   stages can be registered inline
  # @return [Pipeline] New pipeline
  #
  # @example With block
  #   pipeline = Pipeline.new do |p|
  #     p.add :sym_spell
  #     p.add :phonetic
  #   end
  def initialize
    @stages = []
    yield self if block_given?
  end

  # Give copies produced by #dup their own stage list.
  #
  # Without this hook a shallow copy would share the very same Array with
  # its source, so adding or removing a stage on the copy would silently
  # change the original pipeline as well.
  #
  # @param source [Pipeline] The pipeline being copied
  # @return [void]
  def initialize_copy(source)
    super
    @stages = source.stages.dup
  end

  # Append a stage to the end of the pipeline.
  #
  # @param stage_name [Symbol] Name of the stage
  # @return [Pipeline] Self for chaining
  #
  # @example
  #   pipeline.add(:sym_spell)
  def add(stage_name)
    @stages.push(stage_name)
    self
  end

  # Remove every occurrence of a stage from the pipeline.
  #
  # @param stage_name [Symbol] Name of the stage to remove
  # @return [Pipeline] Self for chaining
  #
  # @example
  #   pipeline.remove(:phonetic)
  def remove(stage_name)
    @stages.delete(stage_name)
    self
  end

  # Run the registered stages in order and merge their results.
  #
  # For each stage the matching strategy is looked up and asked to
  # +generate+ suggestions for +context+. Stages that have no strategy are
  # skipped. When +strategies+ is not a Hash, that single object is used as
  # the strategy for every stage.
  #
  # If a strategy returns an empty result, later stages still run unless
  # +early_termination+ is enabled, in which case the loop stops right
  # after merging that empty result.
  #
  # @param context [Context] The suggestion context
  # @param strategies [Hash, #generate, nil] Hash of
  #   stage_name => strategy_instance, or one strategy used for all stages
  # @param early_termination [Boolean] Whether to stop on empty result
  # @return [SuggestionSet] Combined results from all executed stages
  #
  # @example
  #   strategies = { sym_spell: sym_spell_strategy, phonetic: phonetic_strategy }
  #   result = pipeline.execute(context, strategies)
  def execute(context, strategies = nil, early_termination: false)
    @stages.reduce(SuggestionSet.empty) do |combined, stage_name|
      strategy = strategies.is_a?(Hash) ? strategies[stage_name] : strategies
      next combined unless strategy

      result = strategy.generate(context)
      merged = combine_results(combined, result)

      # The empty result is merged first so the accumulator type stays
      # consistent, then the remaining stages are abandoned.
      break merged if early_termination && result.empty?

      merged
    end
  end

  # Check if pipeline has a stage.
  #
  # @param stage_name [Symbol] Stage name
  # @return [Boolean] True if stage exists
  def has_stage?(stage_name)
    @stages.include?(stage_name)
  end

  # Remove all stages.
  #
  # @return [Pipeline] Self for chaining
  def clear
    @stages.clear
    self
  end

  # Build a fresh pipeline of the same class containing the same stages.
  #
  # The copy is created through +new+, so it never shares its stage list
  # with the receiver.
  #
  # @return [Pipeline] New pipeline with same stages
  def clone
    copy = self.class.new
    @stages.each { |stage| copy.add(stage) }
    copy
  end

  private

  # Merge the suggestions of a stage into the running result.
  #
  # Delegates to SuggestionSet#concat followed by SuggestionSet#unique.
  # NOTE(review): presumably +unique+ drops duplicate suggestions - confirm
  # against SuggestionSet.
  #
  # @param combined [SuggestionSet] Current combined results
  # @param new_result [SuggestionSet] New results to add
  # @return [SuggestionSet] Combined suggestion set
  def combine_results(combined, new_result)
    combined.concat(new_result.suggestions).unique
  end
end
|
|
134
|
+
end
|
|
135
|
+
end
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Suggestions
|
|
5
|
+
module Strategies
|
|
6
|
+
# Base class for suggestion strategies.
|
|
7
|
+
#
|
|
8
|
+
# Subclasses must implement the {#generate} method.
|
|
9
|
+
#
|
|
10
|
+
# @example Implementing a custom strategy
|
|
11
|
+
# class MyStrategy < BaseStrategy
|
|
12
|
+
# def generate(context)
|
|
13
|
+
# # Return suggestions based on context.word
|
|
14
|
+
# SuggestionSet.from_words(%w[word1 word2], source: :my_strategy)
|
|
15
|
+
# end
|
|
16
|
+
# end
|
|
17
|
+
class BaseStrategy
  # @return [Symbol] Strategy name
  attr_reader :name

  # @return [Hash] Strategy configuration
  attr_reader :config

  # Create a new base strategy.
  #
  # @param name [String, Symbol] Strategy name (stored as a Symbol)
  # @param config [Hash] Configuration options
  # @option config [Integer] max_results Maximum results to return (default 10)
  # @option config [Boolean] enabled Whether strategy is enabled (default true)
  def initialize(name: :base, **config)
    @name = name.to_sym
    @config = config
    @enabled = config.fetch(:enabled, true)
    @max_results = config.fetch(:max_results, 10)
  end

  # Generate suggestions for a word.
  #
  # @abstract Subclasses must implement this method.
  # @param context [Context] The suggestion context
  # @return [SuggestionSet] Generated suggestions
  # @raise [NotImplementedError] Subclass must implement
  def generate(context)
    raise NotImplementedError, "#{self.class} must implement #generate"
  end

  # Check if this strategy is enabled.
  #
  # @return [Boolean] True if enabled
  def enabled?
    @enabled
  end

  # Get the max results configuration.
  #
  # @param default [Integer] Fallback used when the configured value is nil
  # @return [Integer] Max results
  def max_results(default = 10)
    @max_results || default
  end

  # Get a configuration value.
  #
  # @param key [Symbol] The config key
  # @param default [Object] Default value if not set
  # @return [Object] The config value
  def get_config(key, default = nil)
    @config.fetch(key, default)
  end

  # Check if a config value is present.
  #
  # @param key [Symbol] The config key
  # @return [Boolean] True if config has the key
  def has_config?(key)
    @config.key?(key)
  end

  # Get the priority for this strategy.
  #
  # @return [Integer] Priority (lower = higher priority), 100 when unset
  def priority
    @config.fetch(:priority, 100)
  end

  # Check if this strategy should handle the context.
  #
  # The default implementation accepts the context when the strategy is
  # enabled and the word is not found in the context's dictionary.
  # Subclasses can override for more specific logic.
  #
  # @param context [Context] The suggestion context
  # @return [Boolean] True if the strategy should handle this context
  def handles?(context)
    return false unless enabled?

    !dictionary_lookup(context, context.word)
  end

  # Create a suggestion from a word, tagged with this strategy's name as
  # its source.
  #
  # @param word [String] The suggested word
  # @param distance [Integer] Edit distance
  # @param confidence [Float] Confidence score
  # @param metadata [Hash] Additional metadata for ranking
  # @return [Suggestion] New suggestion
  def create_suggestion(word, distance: 0, confidence: 1.0, **metadata)
    Suggestion.new(
      word: word,
      distance: distance,
      confidence: confidence,
      source: @name,
      **metadata
    )
  end

  # Create a suggestion set from words.
  #
  # @param words [Array<String>] Array of words
  # @param distances [Hash] Optional word => distance mapping
  # @param original_word [String] The original misspelled word (for ranking)
  # @return [SuggestionSet] New suggestion set limited to #max_results
  def create_suggestion_set(words, distances: {}, original_word: nil)
    suggestions = words.map do |word|
      # Exact-case key first, then the downcased key, then a distance of 1.
      distance = distances.fetch(word) { distances.fetch(word.downcase, 1) }
      confidence = calculate_confidence(distance)

      # Similarity to the original word is attached as ranking metadata.
      ngram_score = original_word ? calculate_ngram_similarity(original_word, word) : 0

      create_suggestion(
        word,
        distance: distance,
        confidence: confidence,
        original_length: original_word&.length || word.length,
        ngram_score: ngram_score
      )
    end

    SuggestionSet.new(suggestions, max_size: max_results)
  end

  # Calculate typo correction similarity between two words.
  #
  # Despite the method name, this is a custom case-insensitive metric
  # combining:
  # - Character overlap: number of characters of word1 that occur anywhere
  #   in word2, divided by the longer length
  # - Prefix bonus: 0.15 per matching leading character (at most 4)
  # - Suffix bonus: 0.05 per matching trailing character (at most 4)
  # - Length penalty: 0.1 per character of length difference
  #
  # The result is clamped to the range 0.0 (no similarity) to 1.0
  # (identical).
  #
  # @param word1 [String, nil] First word
  # @param word2 [String, nil] Second word
  # @return [Float] Typo correction similarity (0.0 to 1.0)
  def calculate_ngram_similarity(word1, word2)
    # Always answer with a Float, as documented, even for missing input.
    return 0.0 if word1.nil? || word2.nil? || word1.empty? || word2.empty?

    w1 = word1.downcase
    w2 = word2.downcase

    # Identical strings have maximum similarity
    return 1.0 if w1 == w2

    len1 = w1.length
    len2 = w2.length

    # Prefix and suffix comparisons look at no more than 4 characters.
    window = [len1, len2, 4].min
    prefix_len = (0...window).take_while { |i| w1[i] == w2[i] }.size
    suffix_len = (1..window).take_while { |i| w1[-i] == w2[-i] }.size

    # Membership only: repeated characters of w1 are each counted.
    w2_chars = w2.chars.uniq
    overlap = w1.each_char.count { |char| w2_chars.include?(char) }

    score = overlap.to_f / [len1, len2].max
    score += prefix_len * 0.15
    score += suffix_len * 0.05
    score -= (len1 - len2).abs * 0.1

    score.clamp(0.0, 1.0)
  end

  # Generate the set of contiguous n-character substrings of a word.
  #
  # @param word [String] The word
  # @param n [Integer] N-gram size
  # @return [Set<String>] Set of n-grams; empty when n is not positive or
  #   exceeds the word length
  def generate_ngrams(word, n)
    # A non-positive size has no meaningful n-grams; without this guard the
    # result would contain empty strings or nil.
    return Set.new unless n.positive?

    Set.new(word.each_char.each_cons(n).map(&:join))
  end

  # Convert strategy to string.
  #
  # @return [String] String representation
  def to_s
    "#{self.class.name}(name: #{@name}, enabled: #{enabled?})"
  end
  alias inspect to_s

  private

  # Look up a word in the context's dictionary.
  #
  # Backends are probed in this order: objects exposing +lookup+, then
  # IndexedDictionary (+has_word?+), then anything answering +include?+.
  # The last case also covers Hash, whose +include?+ tests keys.
  #
  # @param context [Context] The suggestion context
  # @param word [String] The word to look up
  # @return [Boolean] True if word exists; false for unsupported backends
  def dictionary_lookup(context, word)
    dictionary = context.dictionary

    if dictionary.respond_to?(:lookup)
      dictionary.lookup(word)
    elsif dictionary.is_a?(::Kotoshu::Core::IndexedDictionary)
      dictionary.has_word?(word)
    elsif dictionary.respond_to?(:include?)
      dictionary.include?(word)
    else
      false
    end
  end

  # Get all words from the context's dictionary.
  #
  # @param context [Context] The suggestion context
  # @return [Array<String>] All words; empty for unsupported backends
  def dictionary_words(context)
    dictionary = context.dictionary

    if dictionary.respond_to?(:words)
      dictionary.words
    elsif dictionary.is_a?(Array)
      dictionary
    elsif dictionary.is_a?(Hash)
      dictionary.keys
    elsif dictionary.is_a?(::Kotoshu::Core::IndexedDictionary)
      # NOTE(review): only reached when the dictionary does not report
      # responding to :words - confirm whether this branch is still needed.
      dictionary.words
    else
      []
    end
  end

  # Calculate confidence from distance.
  #
  # Higher distance = lower confidence, using the simple decay
  # 1 / (1 + distance). Subclasses may override this with a more
  # sophisticated calculation.
  #
  # @param distance [Numeric] Edit distance
  # @return [Float] Confidence score (0.0 to 1.0)
  def calculate_confidence(distance)
    # Zero and (invalid) negative distances both map to full confidence so
    # the result can never leave the documented range.
    return 1.0 unless distance.positive?

    1.0 / (1.0 + distance)
  end
end
|
|
294
|
+
end
|
|
295
|
+
end
|
|
296
|
+
end
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Suggestions
|
|
5
|
+
module Strategies
|
|
6
|
+
# Composite strategy that chains multiple suggestion strategies.
|
|
7
|
+
# Implements the Composite Pattern for extensible suggestion generation.
|
|
8
|
+
#
|
|
9
|
+
# This is MORE OOP than Spylls which has a procedural suggestion pipeline.
|
|
10
|
+
# Here, strategies are proper objects that can be added/removed/reordered.
|
|
11
|
+
#
|
|
12
|
+
# @example Using composite strategy
|
|
13
|
+
# pipeline = CompositeStrategy.new(name: :pipeline)
|
|
14
|
+
# pipeline.add(EditDistanceStrategy.new)
|
|
15
|
+
# pipeline.add(PhoneticStrategy.new)
|
|
16
|
+
# pipeline.add(NgramStrategy.new)
|
|
17
|
+
# suggestions = pipeline.generate(context)
|
|
18
|
+
class CompositeStrategy < BaseStrategy
  # @return [Array<BaseStrategy>] Child strategies in execution order
  attr_reader :strategies

  # @param name [String, Symbol] Name of the composite
  # @param strategies [Array<BaseStrategy>] Initial strategies
  # @param config [Hash] Configuration options
  def initialize(name:, strategies: [], **config)
    # Keep a private copy so that #add, #remove and #clear never mutate the
    # caller's array (and keep working when that array is frozen).
    @strategies = strategies.dup
    super(name: name, **config)
  end

  # Add a strategy to the end of the chain.
  #
  # @param strategy [BaseStrategy] The strategy to add
  # @return [CompositeStrategy] Self for chaining
  def add(strategy)
    @strategies.push(strategy)
    self
  end
  alias << add

  # Remove a strategy from the chain.
  #
  # @param strategy [BaseStrategy] The strategy to remove
  # @return [CompositeStrategy] Self for chaining
  def remove(strategy)
    @strategies.delete(strategy)
    self
  end

  # Remove all strategies.
  #
  # @return [CompositeStrategy] Self for chaining
  def clear
    @strategies.clear
    self
  end

  # Get the child strategies that report handling the given context.
  #
  # @param context [Context] The suggestion context
  # @return [Array<BaseStrategy>] Applicable strategies
  def applicable_strategies(context)
    @strategies.select { |strategy| strategy.handles?(context) }
  end

  # Generate suggestions by delegating to every applicable child strategy
  # and merging each result into one set sized by context.max_results.
  #
  # @param context [Context] The suggestion context
  # @return [SuggestionSet] Combined suggestions from all strategies
  def generate(context)
    combined = SuggestionSet.empty(max_size: context.max_results)

    applicable_strategies(context).each do |strategy|
      combined.merge!(strategy.generate(context))
    end

    combined
  end

  # Check if any child strategy can handle the context.
  #
  # A disabled composite handles nothing, matching the behaviour of
  # BaseStrategy#handles? for the +enabled+ option.
  #
  # @param context [Context] The suggestion context
  # @return [Boolean] True if enabled and any strategy handles the context
  def handles?(context)
    return false unless enabled?

    @strategies.any? { |strategy| strategy.handles?(context) }
  end

  # Get the number of strategies.
  #
  # @return [Integer] Number of strategies
  def size
    @strategies.size
  end
  alias count size

  # Check if the composite has any strategies.
  #
  # @return [Boolean] True if there are strategies
  def any?
    @strategies.any?
  end

  # Iterate over strategies.
  #
  # @yield [strategy] Each strategy
  # @return [Enumerator] Enumerator if no block given
  def each_strategy(&block)
    return enum_for(:each_strategy) unless block_given?

    @strategies.each(&block)
  end

  # Sort strategies in place by ascending priority value.
  #
  # @return [CompositeStrategy] Self for chaining
  def sort_by_priority!
    @strategies.sort_by!(&:priority)
    self
  end

  # Convert to string.
  #
  # @return [String] String representation
  def to_s
    child_names = @strategies.map(&:name).join(", ")
    "#{self.class.name}(name: #{@name}, strategies: #{child_names})"
  end
  alias inspect to_s

  # Create a composite strategy named :default.
  #
  # NOTE(review): no child strategies are registered here, so the returned
  # composite is empty unless +strategies:+ is supplied through +config+ -
  # confirm whether default algorithms were meant to be added.
  #
  # @param config [Hash] Configuration
  # @return [CompositeStrategy] New composite named :default
  def self.with_defaults(**config)
    new(name: :default, **config)
  end
end
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
end
|