kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module Core
    module Trie
      # Fluent builder for Trie objects.
      #
      # Words can be fed one at a time, from arrays, hashes, files or strings.
      # Every feeding method returns the builder so calls can be chained, and
      # {#build} hands back the finished trie.
      #
      # @example
      #   trie = Builder.new.add_word("hello").from_array(%w[world]).build
      class Builder
        # Prefix that marks a comment line in word-list files and strings.
        COMMENT_PREFIX = "#"
        private_constant :COMMENT_PREFIX

        def initialize
          @trie = Trie.new
        end

        # Add a single word to the trie.
        #
        # The word is inserted exactly as given (no stripping, no filtering).
        #
        # @param word [String] The word to add
        # @param payload [Object] Optional payload stored with the word
        # @return [Builder] Self for chaining
        def add_word(word, payload = nil)
          @trie.insert(word, payload)
          self
        end
        alias << add_word

        # Add multiple words to the trie, each without a payload.
        #
        # @param words [Array<String>] Array of words to add
        # @return [Builder] Self for chaining
        def add_words(words)
          words.each { |word| add_word(word) }
          self
        end

        # Add every entry of a word => payload mapping.
        #
        # @param hash [Hash] Hash of words to payloads
        # @return [Builder] Self for chaining
        def from_hash(hash)
          hash.each { |word, payload| add_word(word, payload) }
          self
        end

        # Add every word of an array (same as {#add_words}).
        #
        # @param array [Array<String>] Array of words
        # @return [Builder] Self for chaining
        def from_array(array)
          add_words(array)
        end

        # Add the words of a file containing one word per line.
        #
        # Lines are parsed exactly like in {#from_string}: surrounding
        # whitespace is stripped, and blank lines and lines starting with "#"
        # are skipped.
        #
        # @param path [String] Path to the file
        # @return [Builder] Self for chaining
        def from_file(path)
          File.foreach(path, chomp: true) { |line| add_line(line) }
          self
        end

        # Add the words of a newline-separated string.
        #
        # Surrounding whitespace is stripped from every line; blank lines and
        # lines starting with "#" are skipped.
        #
        # @param text [String] String containing words
        # @return [Builder] Self for chaining
        def from_string(text)
          text.each_line { |line| add_line(line) }
          self
        end

        # Get the built trie.
        #
        # The trie object is frozen before it is returned. The builder keeps
        # referencing that same frozen object, so a builder should not be fed
        # further words after this call.
        #
        # @return [Trie] The constructed (frozen) trie
        def build
          @trie.freeze
        end

        # Build a trie from a file path (class method).
        #
        # @param path [String] Path to the file
        # @return [Trie] The constructed trie
        def self.from_file(path)
          new.from_file(path).build
        end

        # Build a trie from an array of words (class method).
        #
        # @param words [Array<String>] Array of words
        # @return [Trie] The constructed trie
        def self.from_array(words)
          new.from_array(words).build
        end

        # Build a trie from a hash (class method).
        #
        # @param hash [Hash] Hash of words to payloads
        # @return [Trie] The constructed trie
        def self.from_hash(hash)
          new.from_hash(hash).build
        end

        # Build a trie from a string (class method).
        #
        # @param text [String] String containing words
        # @return [Trie] The constructed trie
        def self.from_string(text)
          new.from_string(text).build
        end

        private

        # Parse one raw line of a word list and add the word it holds, if any.
        # Shared by from_file and from_string so both sources follow the same
        # rules.
        #
        # @param line [String] Raw line, with or without its line terminator
        # @return [void]
        def add_line(line)
          word = line.strip
          return if word.empty? || word.start_with?(COMMENT_PREFIX)

          add_word(word)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module Core
    module Trie
      # One vertex of the prefix tree.
      #
      # A node stores the character it stands for, its child nodes keyed by
      # character, a flag telling whether a word ends here, and the payload
      # attached to that word (if any).
      class Node
        attr_reader :character, :children, :terminal, :payload

        # @param character [String] The character this node represents
        def initialize(character = "")
          @character = character
          @children = {}
          @terminal = false
          @payload = nil
        end

        # Return the child for +character+, creating it first when missing.
        #
        # @param character [String] The character to add
        # @return [Node] The new or existing child node
        def add_child(character)
          existing = @children[character]
          return existing if existing

          @children[character] = Node.new(character)
        end

        # Look up the child stored under +character+.
        #
        # @param character [String] The character to look up
        # @return [Node, nil] The child node or nil if not found
        def child(character)
          @children.fetch(character, nil)
        end

        # Tell whether a child is stored under +character+.
        #
        # @param character [String] The character to check
        # @return [Boolean] True if child exists
        def has_child?(character)
          @children.include?(character)
        end

        # Flag this node as the end of a word and store the word's payload,
        # replacing any payload stored before.
        #
        # @param payload [Object] Optional payload to store at this node
        # @return [Object] The payload that was stored
        def mark_terminal(payload = nil)
          @terminal = true
          @payload = payload
        end

        # Tell whether a word ends at this node.
        #
        # @return [Boolean] True if this is the end of a word
        def terminal?
          terminal
        end

        # Expose the character => node mapping of this node's children.
        #
        # @return [Hash] Hash of character to node mappings
        def all_children
          children
        end

        # Tell whether at least one child exists.
        #
        # @return [Boolean] True if there are children
        def has_children?
          @children.any?
        end

        # Number of direct children.
        #
        # @return [Integer] Number of child nodes
        def child_count
          @children.length
        end

        # Short human-readable description of the node.
        #
        # @return [String] String representation
        def to_s
          format("Node('%s', terminal: %s, children: %s)", @character, @terminal, @children.keys)
        end

        # Same text as #to_s, so nodes print compactly in consoles and logs.
        #
        # @return [String] Inspection string
        def inspect
          to_s
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module Core
    module Trie
      # Trie (prefix tree) data structure for efficient word storage and lookup.
      # Supports prefix matching, word validation, and traversal.
      #
      # Every word may carry an optional payload, stored on the node where the
      # word ends.
      class Trie
        attr_reader :root, :size

        def initialize
          @root = Node.new
          @size = 0
        end

        # Insert a word into the trie.
        #
        # Inserting a word that is already present keeps the size unchanged
        # and replaces its payload (also when the new payload is nil).
        #
        # @param word [String] The word to insert
        # @param payload [Object] Optional payload to store with the word
        # @return [Trie] Self for chaining
        # @raise [FrozenError] If the trie is frozen
        def insert(word, payload = nil)
          # Fail before touching any node: freezing the trie does not freeze
          # its nodes, so without this guard child nodes would be added and
          # the failure would only surface at the size update, leaving stray
          # nodes behind.
          raise FrozenError, "can't modify frozen #{self.class}" if frozen?

          node = word.each_char.reduce(@root) { |current, char| current.add_child(char) }

          # Only increment size if this is a new word
          @size += 1 unless node.terminal?
          node.mark_terminal(payload)

          self
        end

        # Check if a word exists in the trie.
        #
        # @param word [String] The word to look up
        # @return [Boolean] True if the word exists
        def lookup(word)
          node = find_node(word)
          node ? node.terminal? : false
        end
        alias has_word? lookup
        alias contains? lookup

        # Check if any words in the trie start with the given prefix.
        #
        # @param prefix [String] The prefix to check
        # @return [Boolean] True if any words have this prefix
        def has_prefix?(prefix)
          node = find_node(prefix)
          return false if node.nil?

          # The root always exists, so reaching a node is not enough on its
          # own: an empty trie holds no word, not even for the empty prefix.
          node.terminal? || node.has_children?
        end

        # Get the node for a given word/prefix.
        #
        # @param word [String] The word or prefix to find
        # @return [Node, nil] The node or nil if not found
        def find_node(word)
          node = @root
          word.each_char do |char|
            node = node.child(char)
            return nil if node.nil?
          end
          node
        end

        # Get all words with the given prefix.
        #
        # @param prefix [String] The prefix to match
        # @return [Array<String>] Array of words with the prefix
        def words_with_prefix(prefix)
          start_node = find_node(prefix)
          return [] if start_node.nil?

          collect_words(start_node, prefix, [])
        end

        # Get all words in the trie.
        #
        # @return [Array<String>] Array of all words
        def all_words
          collect_words(@root, "", [])
        end

        # Count words with the given prefix.
        #
        # Counts the word-ending nodes below the prefix without building the
        # word strings themselves.
        #
        # @param prefix [String] The prefix to count
        # @return [Integer] Number of words with the prefix
        def count_prefix(prefix)
          start_node = find_node(prefix)
          return 0 if start_node.nil?

          count_terminals(start_node)
        end

        # Get suggestions for a word based on prefix matching.
        # Returns words that share the longest common prefix.
        #
        # The longest leading part of +word+ that exists in the trie is
        # located first; completions are then collected from that point. When
        # not even the first character matches, completions start at the root,
        # i.e. they are taken from the whole trie.
        #
        # @param word [String] The word to get suggestions for
        # @param max_results [Integer] Maximum number of results
        # @return [Array<String>] Array of suggested words
        def suggestions(word, max_results: 10)
          # Find the longest matching prefix
          node = @root
          matched = +""
          word.each_char do |char|
            following = node.child(char)
            break if following.nil?

            node = following
            matched << char
          end

          # Collect all completions from this point
          words = []
          collect_words_limited(node, matched, words, max_results)
          words
        end

        # Iterate over all words in the trie.
        #
        # @yield [word, payload] Each word and its optional payload
        # @return [Enumerator] Enumerator if no block given
        def each_word
          return enum_for(:each_word) unless block_given?

          traverse(@root, "") do |word, node|
            yield word, node.payload if node.terminal?
          end

          self
        end

        # Traverse the trie with a visitor.
        #
        # The given node is visited first, then each of its children
        # recursively (depth-first).
        #
        # @param node [Node] The node to start from
        # @param prefix [String] The characters leading to +node+
        # @yield [prefix, node] Each prefix and node visited
        # @return [Trie] Self for chaining
        def traverse(node = @root, prefix = "", &block)
          return enum_for(:traverse, node, prefix) unless block_given?

          yield prefix, node

          node.all_children.each_value do |child|
            traverse(child, prefix + child.character, &block)
          end

          self
        end

        # Check if the trie is empty.
        #
        # @return [Boolean] True if trie has no words
        def empty?
          @size.zero?
        end

        # Clear all words from the trie.
        #
        # @return [Trie] Self for chaining
        def clear
          @root = Node.new
          @size = 0
          self
        end

        # Merge another trie into this one.
        #
        # Words present in both tries end up with the payload from +other+.
        #
        # @param other [Trie] The trie to merge
        # @return [Trie] Self for chaining
        def merge!(other)
          other.each_word do |word, payload|
            insert(word, payload)
          end
          self
        end

        # Create a new trie with common words from two tries.
        #
        # The payloads of this trie are carried over, in line with how #| and
        # #merge! keep payloads.
        #
        # @param other [Trie] The other trie
        # @return [Trie] New trie with common words
        def &(other)
          result = Trie.new
          each_word do |word, payload|
            result.insert(word, payload) if other.lookup(word)
          end
          result
        end

        # Create a new trie with words from either trie.
        #
        # Words present in both tries end up with the payload from +other+,
        # because its words are inserted last.
        #
        # @param other [Trie] The other trie
        # @return [Trie] New trie with all words
        def |(other)
          result = Trie.new
          each_word { |word, payload| result.insert(word, payload) }
          other.each_word { |word, payload| result.insert(word, payload) }
          result
        end

        # Convert trie to string representation.
        #
        # @return [String] String representation
        def to_s
          "Trie(size: #{@size})"
        end

        # Inspect the trie.
        #
        # @return [String] Inspection string
        def inspect
          to_s
        end

        private

        # Collect all words from a given node.
        #
        # @param node [Node] The starting node
        # @param prefix [String] The current prefix
        # @param words [Array] Array to collect words into
        # @return [Array] The +words+ array
        def collect_words(node, prefix, words)
          words << prefix if node.terminal?

          node.all_children.each do |char, child|
            collect_words(child, prefix + char, words)
          end

          words
        end

        # Collect words with a limit.
        #
        # @param node [Node] The starting node
        # @param prefix [String] The current prefix
        # @param words [Array] Array to collect words into
        # @param limit [Integer] Maximum number of words to collect
        def collect_words_limited(node, prefix, words, limit)
          return if words.size >= limit

          words << prefix if node.terminal?

          return if words.size >= limit

          node.all_children.each_value do |child|
            collect_words_limited(child, prefix + child.character, words, limit)
            break if words.size >= limit
          end
        end

        # Count the word-ending nodes in the subtree rooted at +node+
        # (the node itself included), using an explicit stack.
        #
        # @param node [Node] The starting node
        # @return [Integer] Number of terminal nodes found
        def count_terminals(node)
          pending = [node]
          total = 0

          until pending.empty?
            current = pending.pop
            total += 1 if current.terminal?
            pending.concat(current.all_children.values)
          end

          total
        end
      end
    end
  end
end
|
data/lib/kotoshu/core.rb
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  # Namespace for the spell checker's core domain models and infrastructure:
  #
  # - IndexedDictionary: Fast word lookup with multiple indexes
  # - Trie: Prefix tree data structure for efficient string operations
  # - Models: Value objects and result types
  #
  # The module body is intentionally empty; its members are defined in the
  # files required right after it.
  #
  # @example Creating an indexed dictionary
  #   dict = Kotoshu::Core::IndexedDictionary.new(%w[hello world test])
  #   dict.include?("hello") # => true
  #
  # @example Creating a trie
  #   trie = Kotoshu::Core::Trie::Trie.new
  #   trie.insert("hello")
  #   trie.lookup("hello") # => true
  module Core; end
end
|
|
22
|
+
|
|
23
|
+
# Require core submodules
|
|
24
|
+
require_relative "core/exceptions"
|
|
25
|
+
require_relative "core/indexed_dictionary"
|
|
26
|
+
require_relative "core/trie/trie"
|
|
27
|
+
require_relative "core/trie/builder"
|
|
28
|
+
require_relative "core/trie/node"
|