kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,176 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module DataStructures
5
# Bloom filter - probabilistic data structure for fast membership testing.
#
# A Bloom filter is a space-efficient set approximation: {#include?} can
# return a false positive ("possibly present") but never a false negative
# for an item that was added to this filter.
#
# Bit positions are derived from Ruby's built-in +#hash+, whose value is
# not guaranteed to be stable across Ruby processes, so a filter is only
# meaningful inside the process that populated it.
#
# @example Basic usage
#   filter = BloomFilter.new
#   filter.add("hello")
#   filter.include?("hello") # => true  (possibly in set)
#   filter.include?("world") # => false (definitely not in set)
#
# @see https://en.wikipedia.org/wiki/Bloom_filter Bloom filter Wikipedia
class BloomFilter
  # Default false positive rate (1%)
  DEFAULT_FALSE_POSITIVE_RATE = 0.01

  # Default expected number of elements
  DEFAULT_EXPECTED_SIZE = 10_000

  # @return [Integer] Size of the bit array
  attr_reader :size

  # @return [Integer] Number of hash functions
  attr_reader :hash_count

  # @return [Integer] Number of items added
  attr_reader :item_count

  # Create a new Bloom filter.
  #
  # @param expected_size [Integer] Expected number of elements (default: 10_000)
  # @param false_positive_rate [Float] Desired false positive rate, strictly
  #   between 0 and 1 (default: 0.01)
  # @param case_sensitive [Boolean] Whether lookups are case-sensitive (default: false)
  # @raise [ArgumentError] if expected_size is not positive or
  #   false_positive_rate is outside the open interval (0, 1)
  def initialize(expected_size: DEFAULT_EXPECTED_SIZE,
                 false_positive_rate: DEFAULT_FALSE_POSITIVE_RATE,
                 case_sensitive: false)
    validate_parameters!(expected_size, false_positive_rate)

    @case_sensitive = case_sensitive
    @item_count = 0

    # Optimal sizing:
    #   m = -n * ln(p) / (ln(2)^2)
    #   k = (m/n) * ln(2)
    @size = calculate_size(expected_size, false_positive_rate)
    @hash_count = calculate_hash_count(@size, expected_size)

    @bits = Array.new(@size, false)
  end

  # Add an element to the filter.
  #
  # @param item [String] The item to add (converted with +to_s+)
  # @return [self] Self for chaining
  def add(item)
    normalized_item = normalize_item(item)

    @hash_count.times do |i|
      @bits[hash_index(normalized_item, i)] = true
    end

    @item_count += 1
    self
  end

  # Check if an element might be in the filter.
  #
  # Returns false if the element is definitely NOT in the filter and true
  # if it is PROBABLY in the filter (may be a false positive).
  #
  # @param item [String] The item to check
  # @return [Boolean] True if possibly in filter, false if definitely not
  def include?(item)
    normalized_item = normalize_item(item)

    # all? stops at the first unset bit, like an early return would.
    @hash_count.times.all? { |i| @bits[hash_index(normalized_item, i)] }
  end
  alias might_include? include?

  # Merge another bloom filter into this one (bitwise OR of the bit arrays).
  #
  # Both filters should also share the same case sensitivity; this is not
  # checked here.
  #
  # @param other [BloomFilter] Another bloom filter with same parameters
  # @return [self] Self for chaining
  # @raise [ArgumentError] if sizes or hash counts differ
  def merge(other)
    raise ArgumentError, "Cannot merge filters with different sizes" unless other.size == @size
    raise ArgumentError, "Cannot merge filters with different hash counts" unless other.hash_count == @hash_count

    # Fetch the other bit array once instead of once per bit.
    other.bits.each_with_index do |bit, i|
      @bits[i] = true if bit
    end

    @item_count += other.item_count
    self
  end

  # Clear all elements from the filter.
  #
  # @return [self] Self for chaining
  def clear
    @bits = Array.new(@size, false)
    @item_count = 0
    self
  end

  # Get filter statistics.
  #
  # @return [Hash] Statistics including :size, :hash_count, :item_count
  def stats
    {
      size: @size,
      hash_count: @hash_count,
      item_count: @item_count
    }
  end

  protected

  # Raw bit array, exposed to other BloomFilter instances for {#merge}.
  #
  # @return [Array<Boolean>]
  attr_reader :bits

  private

  # Reject parameters for which the sizing formulas are undefined or
  # produce an unusable (empty, negative or infinite) bit array.
  #
  # @param expected_size [Numeric]
  # @param false_positive_rate [Numeric]
  # @raise [ArgumentError] on invalid values
  def validate_parameters!(expected_size, false_positive_rate)
    unless expected_size.is_a?(Numeric) && expected_size.positive?
      raise ArgumentError, "expected_size must be a positive number, got #{expected_size.inspect}"
    end

    return if false_positive_rate.is_a?(Numeric) && false_positive_rate > 0 && false_positive_rate < 1

    raise ArgumentError,
          "false_positive_rate must be between 0 and 1 (exclusive), got #{false_positive_rate.inspect}"
  end

  # Normalize item for consistent hashing.
  #
  # @param item [String] The item to normalize
  # @return [String] Normalized item (downcased unless case-sensitive)
  def normalize_item(item)
    @case_sensitive ? item.to_s : item.to_s.downcase
  end

  # Calculate optimal bit array size.
  #
  # @param n [Integer] Expected number of elements
  # @param p [Float] False positive rate
  # @return [Integer] Optimal size in bits
  def calculate_size(n, p)
    # m = -n * ln(p) / (ln(2)^2)
    ((-n * Math.log(p)) / (Math.log(2)**2)).ceil
  end

  # Calculate optimal number of hash functions.
  #
  # @param m [Integer] Size of bit array
  # @param n [Integer] Expected number of elements
  # @return [Integer] Number of hash functions (at least 1)
  def calculate_hash_count(m, n)
    # k = (m/n) * ln(2)
    k = (m.to_f / n) * Math.log(2)
    [1, k.ceil].max
  end

  # Calculate hash index for item with seed.
  #
  # Derives the seed-th index from a single built-in hash value:
  #   index = (h + seed * |h * 31 + seed|) % size
  #
  # @param item [String] The item to hash
  # @param seed [Integer] Hash function index
  # @return [Integer] Bit array index in 0...size
  def hash_index(item, seed)
    base = item.hash # computed once; both terms derive from it
    step = ((base * 31) + seed).abs

    (base + (seed * step)) % @size
  end
end
175
+ end
176
+ end
@@ -0,0 +1,146 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Debug
5
# Debug logger for detailed spellchecking information.
#
# Provides structured logging for lookup operations, suggestion generation,
# cache behavior, and decision trees.
#
# Levels are ordered :info < :verbose < :trace. A message is written when
# its required level is at or below the logger's configured level, so a
# :trace logger prints everything and an :info logger prints the least.
class Logger
  # Log levels, from least to most detailed
  LEVELS = %i[info verbose trace].freeze

  attr_reader :output, :level

  # Create a new debug logger.
  #
  # @param output [IO] Output stream; anything responding to +puts+ (default: $stderr)
  # @param level [Symbol, String] Log level (:info, :verbose, :trace)
  # @raise [ArgumentError] if level is not one of LEVELS
  def initialize(output: $stderr, level: :info)
    level = level.to_sym if level.respond_to?(:to_sym)
    # Fail here rather than with a nil comparison on the first log call.
    unless LEVELS.include?(level)
      raise ArgumentError,
            "Unknown debug level #{level.inspect} (expected one of #{LEVELS.map(&:inspect).join(", ")})"
    end

    @output = output
    @level = level
    @indent = 0
  end

  # Log lookup operation (level :info).
  #
  # @param word [String] The word being looked up
  # @param result [Boolean] The lookup result
  # @param time [Float] Time taken in milliseconds
  def debug_lookup(word, result:, time:)
    return unless should_log?(:info)

    status = result ? "✓" : "✗"
    output.puts "DEBUG: lookup #{status} \"#{word}\" - #{time.round(3)}ms"
  end

  # Log suggestion generation.
  #
  # The summary line is written at level :verbose; the per-suggestion
  # lines are written only at level :trace.
  #
  # @param word [String] The input word
  # @param suggestions [Array] Generated suggestions; each must respond to
  #   +word+, +distance+, +confidence+ and +source+
  # @param time [Float] Time taken in milliseconds
  def debug_suggestions(word, suggestions:, time:)
    return unless should_log?(:verbose)

    output.puts "DEBUG: suggestions for \"#{word}\" (#{time.round(3)}ms)"

    return unless should_log?(:trace)

    with_indent do
      suggestions.each do |suggestion|
        # &. keeps a suggestion without a confidence from crashing the logger.
        confidence = suggestion.confidence&.round(2)
        output.puts "#{padding}#{suggestion.word} " \
                    "(dist: #{suggestion.distance}, conf: #{confidence}, src: #{suggestion.source})"
      end
    end
  end

  # Log cache operation (level :trace).
  #
  # @param cache_type [String] Type of cache
  # @param key [String] The cache key
  # @param hit [Boolean] True if cache hit
  def debug_cache(cache_type, key, hit:)
    return unless should_log?(:trace)

    status = hit ? "HIT" : "MISS"
    output.puts "DEBUG: cache #{cache_type.upcase} #{status} \"#{key}\""
  end

  # Log decision tree (level :trace).
  #
  # @param word [String] The input word
  # @param decisions [Array<Hash>] Decision nodes with :description and
  #   optional :details (Hash) and :children (Array of nodes)
  def debug_decision_tree(word, decisions:)
    return unless should_log?(:trace)

    output.puts "DEBUG: decision tree for \"#{word}\""
    with_indent { print_decisions(decisions) }
  end

  # Log info message.
  #
  # @param message [String] The message
  def info(message)
    return unless should_log?(:info)

    output.puts "DEBUG: #{message}"
  end

  # Log verbose message.
  #
  # @param message [String] The message
  def verbose(message)
    return unless should_log?(:verbose)

    output.puts "DEBUG: #{message}"
  end

  # Log trace message.
  #
  # @param message [String] The message
  def trace(message)
    return unless should_log?(:trace)

    output.puts "DEBUG: #{message}"
  end

  private

  # Check if should log at current level.
  #
  # @param required_level [Symbol] Required level
  # @return [Boolean] True if should log
  def should_log?(required_level)
    LEVELS.index(required_level) <= LEVELS.index(@level)
  end

  # Run the block with the indentation increased, restoring it afterwards
  # even if the block raises, so one failed call cannot skew later output.
  #
  # @param step [Integer] Number of spaces to add
  def with_indent(step = 2)
    @indent += step
    yield
  ensure
    @indent -= step
  end

  # @return [String] Leading whitespace for the current indentation
  def padding
    " " * @indent
  end

  # Print decisions tree.
  #
  # @param decisions [Array<Hash>] Decision nodes
  # @param index [Integer] Number printed before every node of this list.
  #   The top-level call uses 0 and each nested level adds 1, so it marks
  #   nesting depth rather than position among siblings.
  def print_decisions(decisions, index = 0)
    decisions.each do |decision|
      output.puts "#{padding}#{index}. #{decision[:description]}"

      if should_log?(:trace) && decision[:details]
        with_indent do
          decision[:details].each do |key, value|
            output.puts "#{padding}#{key}: #{value}"
          end
        end
      end

      children = decision[:children]
      next unless children && !children.empty?

      with_indent { print_decisions(children, index + 1) }
    end
  end
end
145
+ end
146
+ end
@@ -0,0 +1,134 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
# Process-wide debug switch for spellchecking diagnostics.
#
# While enabled, the helpers below forward to a Debug::Logger to report:
# - Lookup timing information
# - Suggestion scoring details
# - Decision tree visualization
# - Cache hit/miss tracking
# - Performance metrics
#
# @example Enable debug mode
#   Kotoshu::Debug.enable
#   Kotoshu.correct?("hello")
#   # Output: DEBUG: lookup ✓ "hello" - 0.001ms
#
# @example Disable debug mode
#   Kotoshu::Debug.disable
module Debug
  class << self
    # The active logger; nil until {enable} is called and after {disable}.
    #
    # @return [Debug::Logger, nil] The logger instance
    attr_reader :logger

    # Turn debug mode on and install a fresh logger.
    #
    # @param output [IO] Output stream (default: $stderr)
    # @param level [Symbol] Debug level (:info, :verbose, :trace)
    # @return [Debug::Logger] The newly created logger
    def enable(output: $stderr, level: :info)
      @enabled, @output, @level = true, output, level
      @logger = Debug::Logger.new(output: output, level: level)
    end

    # Turn debug mode off and drop the logger.
    #
    # @return [nil]
    def disable
      @enabled = false
      @logger = nil
    end

    # Whether debug mode is currently on (false until {enable} is called).
    #
    # @return [Boolean] True if enabled
    def enabled?
      @enabled ||= false
    end

    # Report a lookup; does nothing while disabled.
    #
    # @param word [String] The word being looked up
    # @param result [Boolean] The lookup result
    # @param time [Float] Time taken in milliseconds
    def log_lookup(word, result:, time:)
      logger&.debug_lookup(word, result: result, time: time) if enabled?
    end

    # Report generated suggestions; does nothing while disabled.
    #
    # @param word [String] The input word
    # @param suggestions [Array] Generated suggestions
    # @param time [Float] Time taken in milliseconds
    def log_suggestions(word, suggestions:, time:)
      logger&.debug_suggestions(word, suggestions: suggestions, time: time) if enabled?
    end

    # Report a cache hit or miss; does nothing while disabled.
    #
    # @param cache_type [String] Type of cache (lookup, suggestion)
    # @param key [String] The cache key
    # @param hit [Boolean] True if cache hit
    def log_cache(cache_type, key, hit:)
      logger&.debug_cache(cache_type, key, hit: hit) if enabled?
    end

    # Report a decision tree; does nothing while disabled.
    #
    # @param word [String] The input word
    # @param decisions [Array] Array of decision nodes
    def log_decision_tree(word, decisions:)
      logger&.debug_decision_tree(word, decisions: decisions) if enabled?
    end

    # Time a block and log "<label>: <elapsed>ms" through the logger, if any.
    #
    # @param label [String] Text placed before the elapsed time
    # @yield Block to time
    # @return [Object] Block result
    def time(label)
      value, millis = stopwatch { yield }

      logger&.info("#{label}: #{millis.round(3)}ms")
      value
    end

    # Time a lookup block and pass its result and duration to {log_lookup}.
    #
    # @param word [String] The word being looked up
    # @yield Block that performs the lookup
    # @return [Object] Block result
    def measure_lookup(word)
      value, millis = stopwatch { yield }

      log_lookup(word, result: value, time: millis)
      value
    end

    # Time a suggestion block and pass its result and duration to
    # {log_suggestions}.
    #
    # @param word [String] The input word
    # @yield Block that generates suggestions
    # @return [Object] Block result
    def measure_suggestions(word)
      value, millis = stopwatch { yield }

      log_suggestions(word, suggestions: value, time: millis)
      value
    end

    private

    # Run the block against the monotonic clock.
    #
    # @yield Block to run
    # @return [Array(Object, Float)] Block result and elapsed milliseconds
    def stopwatch
      began = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      value = yield
      [value, (Process.clock_gettime(Process::CLOCK_MONOTONIC) - began) * 1000]
    end
  end
end
134
+ end
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "configuration/builder"
4
+
5
+ module Kotoshu
6
# Sensible defaults for Kotoshu configuration.
#
# Provides auto-detection of system dictionaries and fallback
# to bundled dictionaries, ensuring Kotoshu works out of the box.
module Defaults
  # Standard system dictionary paths, checked in this order.
  SYSTEM_DICTIONARY_PATHS = [
    "/usr/share/dict/words",
    "/usr/share/dict/web2",
    "/usr/share/dict/web2a",
    "/usr/dict/words"
  ].freeze

  # Bundled dictionary paths (relative to gem root), checked in this order.
  BUNDLED_DICTIONARY_PATHS = [
    "dictionaries/unix_words/words",
    "dictionaries/unix_words/web2",
    "dictionaries/unix_words/web2a"
  ].freeze

  class << self
    # Detect system dictionary.
    #
    # @return [String, nil] First entry of SYSTEM_DICTIONARY_PATHS that
    #   exists on disk, or nil
    def detect_system_dictionary
      SYSTEM_DICTIONARY_PATHS.find { |path| File.exist?(path) }
    end

    # Get path to bundled dictionary.
    #
    # @return [String, nil] First entry of BUNDLED_DICTIONARY_PATHS whose
    #   file exists, as the relative entry itself (not expanded), or nil
    def bundled_dictionary_path
      BUNDLED_DICTIONARY_PATHS.find { |path| File.exist?(bundled_full_path(path)) }
    end

    # Get default dictionary.
    #
    # Tries system dictionary first, then bundled dictionary,
    # then falls back to a minimal built-in word list.
    #
    # @return [Dictionary::Base] A working dictionary
    def default_dictionary
      # Try system dictionary
      system_path = detect_system_dictionary
      return Dictionary::PlainText.new(system_path, language_code: "en") if system_path

      # Try bundled dictionary
      bundled_path = bundled_dictionary_path
      return Dictionary::PlainText.new(bundled_full_path(bundled_path), language_code: "en") if bundled_path

      # Fall back to minimal dictionary with common words
      Dictionary::PlainText.from_words(
        %w[the and for are but not you all any can had has him his how her its now our was what],
        language_code: "en"
      )
    end

    # Configure Kotoshu with sensible defaults.
    #
    # @return [Configuration] The configured instance
    def configure
      # NOTE(review): the dictionary built here is discarded and is not
      # passed to the builder below. Kept as-is because any side effects of
      # constructing it are not visible from this file; confirm whether the
      # call is still needed.
      default_dictionary

      Configuration::Builder.build do |c|
        c.dictionary_type = :plain_text
        c.language = "en-US"
        c.max_suggestions = 10
        c.case_sensitive = false
      end
    end

    private

    # Expand a BUNDLED_DICTIONARY_PATHS entry against the directory two
    # levels above this file's directory.
    #
    # @param relative_path [String] Entry from BUNDLED_DICTIONARY_PATHS
    # @return [String] Absolute path
    def bundled_full_path(relative_path)
      File.expand_path("../../#{relative_path}", __dir__)
    end
  end
end
86
+ end