kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module DataStructures
    # Bloom filter - probabilistic data structure for fast membership testing.
    #
    # A Bloom filter is a space-efficient probabilistic data structure that
    # is used to test whether an element is a member of a set. False positive
    # matches are possible, but false negatives are not.
    #
    # Bit positions are derived from Ruby's `#hash`, so the contents of a
    # filter are only meaningful inside the process that populated it.
    #
    # @example Basic usage
    #   filter = BloomFilter.new
    #   filter.add("hello")
    #   filter.include?("hello")  # => true (probably in set)
    #   filter.include?("world")  # => false (definitely not in set)
    #
    # @see https://en.wikipedia.org/wiki/Bloom_filter Bloom filter Wikipedia
    class BloomFilter
      # Default false positive rate (1%)
      DEFAULT_FALSE_POSITIVE_RATE = 0.01

      # Default expected number of elements
      DEFAULT_EXPECTED_SIZE = 10_000

      # Salt combined with the item to obtain a second hash value that is not
      # a simple function of `item.hash`.
      SECONDARY_HASH_SALT = :kotoshu_bloom_filter_secondary

      # @return [Integer] Size of the bit array
      attr_reader :size

      # @return [Integer] Number of hash functions
      attr_reader :hash_count

      # @return [Integer] Number of items added
      attr_reader :item_count

      # Create a new Bloom filter.
      #
      # @param expected_size [Integer] Expected number of elements (default: 10_000)
      # @param false_positive_rate [Float] Desired false positive rate (default: 0.01)
      # @param case_sensitive [Boolean] Whether lookups are case-sensitive (default: false)
      # @raise [ArgumentError] If expected_size is not a positive finite number
      #   or false_positive_rate is not strictly between 0 and 1
      def initialize(expected_size: DEFAULT_EXPECTED_SIZE,
                     false_positive_rate: DEFAULT_FALSE_POSITIVE_RATE,
                     case_sensitive: false)
        validate_parameters!(expected_size, false_positive_rate)

        @case_sensitive = case_sensitive
        @item_count = 0

        # Calculate optimal size and hash count
        # m = -n * ln(p) / (ln(2)^2)
        # k = (m/n) * ln(2)
        @size = calculate_size(expected_size, false_positive_rate)
        @hash_count = calculate_hash_count(@size, expected_size)

        # Initialize bit array
        @bits = Array.new(@size, false)
      end

      # Add an element to the filter.
      #
      # @param item [String] The item to add
      # @return [self] Self for chaining
      def add(item)
        normalized_item = normalize_item(item)

        @hash_count.times { |i| @bits[hash_index(normalized_item, i)] = true }

        @item_count += 1
        self
      end

      # Check if an element might be in the filter.
      #
      # Note: Returns false if the element is definitely NOT in the filter.
      # Returns true if the element is PROBABLY in the filter (may be false positive).
      #
      # @param item [String] The item to check
      # @return [Boolean] True if possibly in filter, false if definitely not
      def include?(item)
        normalized_item = normalize_item(item)

        # Stops at the first unset bit, which proves the item was never added.
        @hash_count.times.all? { |i| @bits[hash_index(normalized_item, i)] }
      end
      alias might_include? include?

      # Merge another bloom filter into this one.
      #
      # @param other [BloomFilter] Another bloom filter with same parameters
      # @return [self] Self for chaining
      # @raise [ArgumentError] If the sizes or hash counts differ
      def merge(other)
        raise ArgumentError, "Cannot merge filters with different sizes" unless other.size == @size
        raise ArgumentError, "Cannot merge filters with different hash counts" unless other.hash_count == @hash_count

        # Bitwise OR: every bit set in the other filter becomes set here.
        other.bits.each_with_index { |bit, index| @bits[index] = true if bit }

        @item_count += other.item_count
        self
      end

      # Clear all elements from the filter.
      #
      # @return [self] Self for chaining
      def clear
        @bits = Array.new(@size, false)
        @item_count = 0
        self
      end

      # Get filter statistics.
      #
      # @return [Hash] Statistics including :size, :hash_count, :item_count
      def stats
        {
          size: @size,
          hash_count: @hash_count,
          item_count: @item_count
        }
      end

      protected

      # @return [Array<Boolean>] Backing bit array, readable by other
      #   BloomFilter instances so that #merge can combine two filters
      attr_reader :bits

      private

      # Reject parameters for which the sizing formulas are undefined.
      #
      # @param expected_size [Numeric] Expected number of elements
      # @param false_positive_rate [Numeric] Desired false positive rate
      # @raise [ArgumentError] If either value is out of range
      def validate_parameters!(expected_size, false_positive_rate)
        unless expected_size.is_a?(Numeric) && expected_size.finite? && expected_size.positive?
          raise ArgumentError, "expected_size must be a positive number, got #{expected_size.inspect}"
        end

        return if false_positive_rate.is_a?(Numeric) && false_positive_rate.positive? && false_positive_rate < 1

        raise ArgumentError,
              "false_positive_rate must be between 0 and 1 (exclusive), got #{false_positive_rate.inspect}"
      end

      # Normalize item for consistent hashing.
      #
      # @param item [String] The item to normalize
      # @return [String] Normalized item
      def normalize_item(item)
        @case_sensitive ? item.to_s : item.to_s.downcase
      end

      # Calculate optimal bit array size.
      #
      # @param n [Integer] Expected number of elements
      # @param p [Float] False positive rate
      # @return [Integer] Optimal size in bits (never less than 1)
      def calculate_size(n, p)
        # m = -n * ln(p) / (ln(2)^2)
        m = (-n * Math.log(p)) / (Math.log(2)**2)
        [m.ceil, 1].max
      end

      # Calculate optimal number of hash functions.
      #
      # @param m [Integer] Size of bit array
      # @param n [Integer] Expected number of elements
      # @return [Integer] Optimal number of hash functions
      def calculate_hash_count(m, n)
        # k = (m/n) * ln(2)
        k = (m.to_f / n) * Math.log(2)
        [1, k.ceil].max # At least 1 hash function
      end

      # Calculate hash index for item with seed.
      #
      # Uses double hashing for multiple hash functions:
      #   hash_i(item) = (hash1(item) + i * hash2(item)) % m
      #
      # hash2 is taken from a salted array rather than computed arithmetically
      # from hash1; otherwise every probe position would be determined by
      # `hash1 % m` alone and the extra hash functions would add nothing.
      #
      # @param item [String] The item to hash
      # @param seed [Integer] Hash function index
      # @return [Integer] Bit array index in 0...size
      def hash_index(item, seed)
        hash1 = item.hash
        hash2 = [item, SECONDARY_HASH_SALT].hash

        # Ruby's % with a positive modulus is never negative.
        (hash1 + (seed * hash2)) % @size
      end
    end
  end
end
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module Debug
    # Debug logger for detailed spellchecking information.
    #
    # Provides structured logging for lookup operations, suggestion generation,
    # cache behavior, and decision trees. Every message is written to the
    # configured output with `puts`.
    class Logger
      # Log levels, ordered from least to most detailed. A logger configured
      # at a given level also emits everything listed before it.
      LEVELS = %i[info verbose trace].freeze

      attr_reader :output, :level

      # Create a new debug logger.
      #
      # @param output [IO] Output stream (default: $stderr)
      # @param level [Symbol] Log level (:info, :verbose, :trace)
      # @raise [ArgumentError] If level is not one of LEVELS
      def initialize(output: $stderr, level: :info)
        # Fail here with a clear message instead of on the first log call.
        unless LEVELS.include?(level)
          raise ArgumentError, "Unknown debug level #{level.inspect}; expected one of #{LEVELS.inspect}"
        end

        @output = output
        @level = level
        @indent = 0
      end

      # Log lookup operation.
      #
      # @param word [String] The word being looked up
      # @param result [Boolean] The lookup result
      # @param time [Float] Time taken in milliseconds
      def debug_lookup(word, result:, time:)
        return unless should_log?(:info)

        status = result ? "✓" : "✗"
        output.puts "DEBUG: lookup #{status} \"#{word}\" - #{time.round(3)}ms"
      end

      # Log suggestion generation.
      #
      # The summary line is written at :verbose; the individual suggestions
      # are only listed at :trace.
      #
      # @param word [String] The input word
      # @param suggestions [Array] Generated suggestions; each must respond to
      #   #word, #distance, #confidence and #source
      # @param time [Float] Time taken in milliseconds
      def debug_suggestions(word, suggestions:, time:)
        return unless should_log?(:verbose)

        output.puts "DEBUG: suggestions for \"#{word}\" (#{time.round(3)}ms)"

        return unless should_log?(:trace)

        indented do
          suggestions.each do |suggestion|
            dist = suggestion.distance
            conf = suggestion.confidence
            source = suggestion.source
            output.puts "#{" " * @indent}#{suggestion.word} (dist: #{dist}, conf: #{conf.round(2)}, src: #{source})"
          end
        end
      end

      # Log cache operation.
      #
      # @param cache_type [String] Type of cache
      # @param key [String] The cache key
      # @param hit [Boolean] True if cache hit
      def debug_cache(cache_type, key, hit:)
        return unless should_log?(:trace)

        status = hit ? "HIT" : "MISS"
        output.puts "DEBUG: cache #{cache_type.upcase} #{status} \"#{key}\""
      end

      # Log decision tree.
      #
      # @param word [String] The input word
      # @param decisions [Array<Hash>] Decision nodes; each is read through
      #   the keys :description, :details and :children
      def debug_decision_tree(word, decisions:)
        return unless should_log?(:trace)

        output.puts "DEBUG: decision tree for \"#{word}\""
        indented { print_decisions(decisions) }
      end

      # Log info message.
      #
      # @param message [String] The message
      def info(message)
        write_message(:info, message)
      end

      # Log verbose message.
      #
      # @param message [String] The message
      def verbose(message)
        write_message(:verbose, message)
      end

      # Log trace message.
      #
      # @param message [String] The message
      def trace(message)
        write_message(:trace, message)
      end

      private

      # Check if should log at current level.
      #
      # @param required_level [Symbol] Required level
      # @return [Boolean] True if should log
      def should_log?(required_level)
        LEVELS.index(required_level) <= LEVELS.index(@level)
      end

      # Write a free-form message if the logger is at least at the given level.
      #
      # @param required_level [Symbol] Minimum level for the message
      # @param message [String] The message
      def write_message(required_level, message)
        return unless should_log?(required_level)

        output.puts "DEBUG: #{message}"
      end

      # Run the block with the indentation increased by two spaces.
      #
      # The previous indentation is restored even if the block raises, so a
      # failure while printing one entry does not shift all later output.
      #
      # @yield Block whose output should be indented
      # @return [Object] Block result
      def indented
        @indent += 2
        yield
      ensure
        @indent -= 2
      end

      # Print decisions tree.
      #
      # NOTE(review): the number printed before each description is the
      # nesting depth (all siblings share it), not a per-sibling ordinal.
      #
      # @param decisions [Array<Hash>] Decision nodes
      # @param depth [Integer] Number printed for this level; children get depth + 1
      def print_decisions(decisions, depth = 0)
        decisions.each do |decision|
          output.puts "#{" " * @indent}#{depth}. #{decision[:description]}"

          details = decision[:details]
          if should_log?(:trace) && details
            indented do
              details.each { |key, value| output.puts "#{" " * @indent}#{key}: #{value}" }
            end
          end

          children = decision[:children]
          next if children.nil? || children.empty?

          indented { print_decisions(children, depth + 1) }
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  # Debug mode for detailed spellchecking insights.
  #
  # When enabled, debug mode provides:
  # - Lookup timing information
  # - Suggestion scoring details
  # - Decision tree visualization
  # - Cache hit/miss tracking
  # - Performance metrics
  #
  # @example Enable debug mode
  #   Kotoshu::Debug.enable
  #   Kotoshu.correct?("hello")
  #   # Output: DEBUG: lookup ✓ "hello" - 0.001ms
  #
  # @example Disable debug mode
  #   Kotoshu::Debug.disable
  module Debug
    class << self
      # Get the debug logger.
      #
      # @return [Debug::Logger, nil] The logger instance, nil unless enabled
      attr_reader :logger

      # Enable debug mode.
      #
      # @param output [IO] Output stream (default: $stderr)
      # @param level [Symbol] Debug level (:info, :verbose, :trace)
      # @return [Debug::Logger] The newly installed logger
      def enable(output: $stderr, level: :info)
        # Build the logger first so a failing constructor cannot leave debug
        # mode flagged as enabled without a logger.
        new_logger = Debug::Logger.new(output: output, level: level)

        @output = output
        @level = level
        @enabled = true
        @logger = new_logger
      end

      # Disable debug mode and drop the logger.
      #
      # @return [nil]
      def disable
        @enabled = false
        @logger = nil
      end

      # Check if debug mode is enabled.
      #
      # @return [Boolean] True if enabled
      def enabled?
        @enabled ||= false
      end

      # Log a lookup operation.
      #
      # @param word [String] The word being looked up
      # @param result [Boolean] The lookup result
      # @param time [Float] Time taken in milliseconds
      def log_lookup(word, result:, time:)
        return unless enabled?

        logger&.debug_lookup(word, result: result, time: time)
      end

      # Log a suggestion generation.
      #
      # @param word [String] The input word
      # @param suggestions [Array] Generated suggestions
      # @param time [Float] Time taken in milliseconds
      def log_suggestions(word, suggestions:, time:)
        return unless enabled?

        logger&.debug_suggestions(word, suggestions: suggestions, time: time)
      end

      # Log a cache hit/miss.
      #
      # @param cache_type [String] Type of cache (lookup, suggestion)
      # @param key [String] The cache key
      # @param hit [Boolean] True if cache hit
      def log_cache(cache_type, key, hit:)
        return unless enabled?

        logger&.debug_cache(cache_type, key, hit: hit)
      end

      # Log a decision tree.
      #
      # @param word [String] The input word
      # @param decisions [Array] Array of decision nodes
      def log_decision_tree(word, decisions:)
        return unless enabled?

        logger&.debug_decision_tree(word, decisions: decisions)
      end

      # Run a block and log how long it took under the given label.
      #
      # Without a logger the block is simply executed, untimed.
      #
      # @param label [String] Text placed before the elapsed time
      # @yield Block to time
      # @return [Object] Block result
      def time(label)
        return yield unless logger

        result, elapsed = timed { yield }
        logger&.info("#{label}: #{elapsed.round(3)}ms")
        result
      end

      # Measure and log a lookup.
      #
      # When debug mode is disabled the block runs without any timing.
      #
      # @param word [String] The word being looked up
      # @yield Block that performs the lookup
      # @return [Object] Block result
      def measure_lookup(word)
        return yield unless enabled?

        result, elapsed = timed { yield }
        log_lookup(word, result: result, time: elapsed)
        result
      end

      # Measure and log suggestions.
      #
      # When debug mode is disabled the block runs without any timing.
      #
      # @param word [String] The input word
      # @yield Block that generates suggestions
      # @return [Object] Block result
      def measure_suggestions(word)
        return yield unless enabled?

        result, elapsed = timed { yield }
        log_suggestions(word, suggestions: result, time: elapsed)
        result
      end

      private

      # Execute the block and measure it with the monotonic clock.
      #
      # @yield Block to execute
      # @return [Array(Object, Float)] Block result and elapsed milliseconds
      def timed
        started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        result = yield
        [result, (Process.clock_gettime(Process::CLOCK_MONOTONIC) - started) * 1000]
      end
    end
  end
end
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "configuration/builder"
|
|
4
|
+
|
|
5
|
+
module Kotoshu
  # Sensible defaults for Kotoshu configuration.
  #
  # Provides auto-detection of system dictionaries and fallback
  # to bundled dictionaries, ensuring Kotoshu works out of the box.
  module Defaults
    # Standard system dictionary paths, in order of preference.
    SYSTEM_DICTIONARY_PATHS = [
      "/usr/share/dict/words",
      "/usr/share/dict/web2",
      "/usr/share/dict/web2a",
      "/usr/dict/words"
    ].freeze

    # Bundled dictionary paths (relative to gem root), in order of preference.
    BUNDLED_DICTIONARY_PATHS = [
      "dictionaries/unix_words/words",
      "dictionaries/unix_words/web2",
      "dictionaries/unix_words/web2a"
    ].freeze

    class << self
      # Detect system dictionary.
      #
      # Directories and unreadable files are skipped so that a later
      # candidate can still be picked.
      #
      # @return [String, nil] Path to the first usable system dictionary or nil
      def detect_system_dictionary
        SYSTEM_DICTIONARY_PATHS.find { |path| usable_file?(path) }
      end

      # Find the first bundled dictionary that is present.
      #
      # @return [String, nil] The matching BUNDLED_DICTIONARY_PATHS entry,
      #   i.e. a path relative to the gem root (not an absolute path), or nil
      def bundled_dictionary_path
        BUNDLED_DICTIONARY_PATHS.find { |path| usable_file?(expand_bundled_path(path)) }
      end

      # Get default dictionary.
      #
      # Tries system dictionary first, then bundled dictionary,
      # then falls back to a minimal dictionary of common words.
      #
      # @return [Dictionary::Base] A working dictionary
      def default_dictionary
        # Try system dictionary
        system_path = detect_system_dictionary
        return Dictionary::PlainText.new(system_path, language_code: "en") if system_path

        # Try bundled dictionary
        bundled_path = bundled_dictionary_path
        if bundled_path
          return Dictionary::PlainText.new(expand_bundled_path(bundled_path), language_code: "en")
        end

        # Fall back to minimal dictionary with common words
        Dictionary::PlainText.from_words(
          %w[the and for are but not you all any can had has him his how her its now our was what],
          language_code: "en"
        )
      end

      # Configure Kotoshu with sensible defaults.
      #
      # @return [Configuration] The configured instance
      def configure
        # NOTE(review): the dictionary built here is discarded and nothing
        # below refers to it. Kept in case construction has side effects
        # elsewhere - confirm, then either use the result or drop the call.
        default_dictionary

        Configuration::Builder.build do |c|
          c.dictionary_type = :plain_text
          c.language = "en-US"
          c.max_suggestions = 10
          c.case_sensitive = false
        end
      end

      private

      # Resolve a BUNDLED_DICTIONARY_PATHS entry against the directory two
      # levels above this file.
      #
      # @param relative_path [String] Entry from BUNDLED_DICTIONARY_PATHS
      # @return [String] Absolute path
      def expand_bundled_path(relative_path)
        File.expand_path("../../#{relative_path}", __dir__)
      end

      # Check that the path is a regular file this process can read.
      #
      # @param path [String] Candidate dictionary path
      # @return [Boolean] True if the file can be loaded
      def usable_file?(path)
        File.file?(path) && File.readable?(path)
      end
    end
  end
end
|