kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,298 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "configuration"
4
+ require_relative "suggestions/generator"
5
+ require_relative "core/models/result/word_result"
6
+ require_relative "core/models/result/document_result"
7
+
8
module Kotoshu
  # Main spellchecker class.
  #
  # This is the primary facade for spell checking operations,
  # providing methods to check words, text, and files.
  #
  # @example Creating a spellchecker with a dictionary
  #   dict = Kotoshu::Dictionary::UnixWords.new("/usr/share/dict/words", language_code: "en-US")
  #   spellchecker = Spellchecker.new(dictionary: dict)
  #   spellchecker.correct?("hello") # => true
  #
  # @example Using configuration
  #   spellchecker = Spellchecker.new(
  #     dictionary_path: "/usr/share/dict/words",
  #     language: "en-US"
  #   )
  class Spellchecker
    # Matches exactly one Unicode letter or combining mark. Used for
    # characters outside the ASCII range so that words such as "über" or
    # "привет" are kept whole instead of being split at every non-ASCII letter.
    UNICODE_WORD_CHAR = /\A[\p{L}\p{M}]\z/
    private_constant :UNICODE_WORD_CHAR

    # @return [Suggestions::Generator] The suggestion generator
    attr_reader :generator

    # @return [Configuration] The configuration
    attr_reader :config

    # @return [ResourceBundle, nil] The resource bundle if provided
    attr_reader :resource_bundle

    # Create a new spellchecker.
    #
    # When +config+ is a Configuration it is used as-is and +kwargs+ are not
    # consulted. Otherwise a Configuration is built from +config+ (when it is
    # a Hash) merged with +kwargs+; on conflicting keys +kwargs+ win.
    #
    # @param dictionary [Dictionary::Base, nil] The dictionary (optional)
    # @param config [Configuration, Hash] Configuration or settings
    # @param resource_bundle [ResourceBundle, nil] Pre-resolved resource bundle
    # @param kwargs [Hash] Additional configuration options
    #
    # @example With dictionary
    #   spellchecker = Spellchecker.new(dictionary: dict)
    #
    # @example With resource bundle (0.2+)
    #   bundle = Kotoshu::ResourceManager.resolve(language: "en")
    #   spellchecker = Spellchecker.new(resource_bundle: bundle)
    #   spellchecker.correct?("hello") # => true
    #
    # @example With configuration hash
    #   spellchecker = Spellchecker.new(
    #     dictionary_path: "/usr/share/dict/words",
    #     language: "en-US"
    #   )
    #
    # @example With Configuration object
    #   config = Configuration.new(dictionary_path: "words.txt")
    #   spellchecker = Spellchecker.new(config: config)
    def initialize(dictionary: nil, config: nil, resource_bundle: nil, **kwargs)
      @resource_bundle = resource_bundle

      # A Hash passed as `config:` used to be dropped silently; fold it into
      # the settings so the documented `[Configuration, Hash]` contract holds.
      settings = config.is_a?(Hash) ? config.merge(kwargs) : kwargs.dup

      if resource_bundle
        dictionary ||= resource_bundle.dictionary
        # An explicitly supplied language takes precedence over the bundle's.
        settings[:language] = resource_bundle.language unless settings.key?(:language)
      end

      @config =
        if config.is_a?(Configuration)
          config
        else
          settings[:dictionary_path] = dictionary.path if dictionary.respond_to?(:path)
          Configuration.new(settings)
        end

      @config.dictionary = dictionary if dictionary
      @generator = build_generator
    end

    # Check if a word is spelled correctly.
    #
    # @param word [String] The word to check
    # @return [Boolean] True if the word is correct; false for nil or empty input
    #
    # @example
    #   spellchecker.correct?("hello") # => true
    #   spellchecker.correct?("helo")  # => false
    def correct?(word)
      return false if word.nil? || word.empty?

      @generator.correct?(word)
    end

    # Check if a word is misspelled.
    #
    # @param word [String] The word to check
    # @return [Boolean] True if the word is misspelled
    def incorrect?(word)
      !correct?(word)
    end

    # Get spelling suggestions for a word.
    #
    # @param word [String] The misspelled word
    # @param max_suggestions [Integer] Maximum suggestions (optional)
    # @return [Suggestions::SuggestionSet] Generated suggestions (empty for nil/empty input)
    #
    # @example
    #   suggestions = spellchecker.suggest("helo")
    #   suggestions.to_words # => ["hello", "help", "held", ...]
    def suggest(word, max_suggestions: nil)
      return Suggestions::SuggestionSet.empty if word.nil? || word.empty?

      @generator.generate(word, max_suggestions: max_suggestions)
    end

    # Check a word and return a result object.
    #
    # @param word [String] The word to check
    # @return [Models::Result::WordResult] The check result
    #
    # @example
    #   result = spellchecker.check_word("hello")
    #   result.correct? # => true
    #
    # @example With misspelled word
    #   result = spellchecker.check_word("helo")
    #   result.correct?    # => false
    #   result.suggestions # => SuggestionSet with suggestions
    def check_word(word)
      if word.nil? || word.empty?
        return Models::Result::WordResult.new("", correct: false,
                                                  suggestions: Suggestions::SuggestionSet.empty)
      end

      if correct?(word)
        Models::Result::WordResult.correct(word)
      else
        Models::Result::WordResult.incorrect(word, suggestions: suggest(word))
      end
    end

    # Check text for spelling errors.
    #
    # @param text [String] The text to check
    # @return [Models::Result::DocumentResult] The check result
    #
    # @example
    #   result = spellchecker.check("Hello wrold")
    #   result.success?            # => false
    #   result.errors.map(&:word)  # => ["wrold"]
    def check(text)
      return Models::Result::DocumentResult.success if text.nil? || text.empty?

      words = tokenize(text)
      errors = []

      words.each do |word, pos|
        result = check_word(word)
        next unless result.incorrect?

        # Re-create the result so that it carries the word's position in the text.
        errors << Models::Result::WordResult.new(
          word,
          correct: false,
          suggestions: result.suggestions,
          position: pos
        )
      end

      Models::Result::DocumentResult.new(
        file: nil,
        errors: errors,
        word_count: words.size
      )
    end

    # Check a file for spelling errors.
    #
    # The file is read with the encoding given by the configuration.
    #
    # @param path [String] The file path
    # @return [Models::Result::DocumentResult] The check result
    # @raise [DictionaryNotFoundError] If +path+ does not exist
    #
    # @example
    #   result = spellchecker.check_file("README.md")
    #   result.to_s # => "File 'README.md': 3 spelling error(s) found"
    def check_file(path)
      raise DictionaryNotFoundError, path unless File.exist?(path)

      text = File.read(path, encoding: @config.encoding)
      result = check(text)

      # Create a new result with the file path
      Models::Result::DocumentResult.new(
        file: path,
        errors: result.errors,
        word_count: result.word_count
      )
    end

    # Check a directory for spelling errors.
    #
    # Only regular files matching +pattern+ are checked; sub-directories whose
    # names happen to match the pattern are skipped because they cannot be read.
    #
    # @param path [String] The directory path
    # @param pattern [String] File pattern to match (default: "*.txt")
    # @return [Array<Models::Result::DocumentResult>] Results for each file
    # @raise [DictionaryNotFoundError] If +path+ is not an existing directory
    #
    # @example
    #   results = spellchecker.check_directory("docs/")
    #   results.select(&:failed?).map(&:file)
    def check_directory(path, pattern: "*.txt")
      raise DictionaryNotFoundError, path unless File.directory?(path)

      Dir.glob(File.join(path, pattern))
         .select { |file| File.file?(file) }
         .map { |file| check_file(file) }
    end

    # Tokenize text into words.
    #
    # A word is a maximal run of letters (ASCII or any other Unicode letter /
    # combining mark) and apostrophes. Positions are character indexes of the
    # first character of each word. Returned words are frozen.
    #
    # @param text [String] The text to tokenize
    # @return [Array<Array>] Array of [word, position] pairs
    #
    # @example
    #   spellchecker.tokenize("Hello world!")
    #   # => [["Hello", 0], ["world", 6]]
    def tokenize(text)
      return [] if text.nil? || text.empty?

      words = []
      buffer = String.new(encoding: text.encoding)
      word_start = 0

      text.each_char.with_index do |char, index|
        if word_char?(char)
          word_start = index if buffer.empty?
          buffer << char
        elsif !buffer.empty?
          words << [buffer.dup.freeze, word_start]
          buffer.clear
        end
      end

      # Don't forget the last word
      words << [buffer.dup.freeze, word_start] unless buffer.empty?

      words
    end

    # Get the dictionary being used.
    #
    # @return [Dictionary::Base] The dictionary
    def dictionary
      @generator.dictionary
    end

    # Reload the dictionary.
    #
    # Resets the dictionary held by the configuration and rebuilds the
    # suggestion generator around the freshly resolved dictionary.
    #
    # @return [self] Self for chaining
    def reload_dictionary
      @config.reset_dictionary
      @generator = build_generator

      self
    end

    private

    # Build a suggestion generator from the current configuration.
    #
    # @return [Suggestions::Generator] Generator bound to the configured dictionary
    def build_generator
      Suggestions::Generator.new(
        @config.dictionary,
        max_suggestions: @config.max_suggestions,
        algorithms: @config.suggestion_algorithms
      )
    end

    # Check if a character is part of a word.
    #
    # @param char [String] The character
    # @return [Boolean] True if it's a word character
    def word_char?(char)
      case char
      when "a".."z", "A".."Z", "'"
        true
      else
        unicode_letter?(char)
      end
    end

    # Check if a non-ASCII character is a Unicode letter or combining mark.
    #
    # Characters that are invalid in their encoding, or that cannot be
    # converted to UTF-8 (e.g. raw binary bytes), are treated as separators
    # rather than raising.
    #
    # @param char [String] The character
    # @return [Boolean] True if it's a letter or combining mark
    def unicode_letter?(char)
      return false if char.ascii_only? || !char.valid_encoding?

      UNICODE_WORD_CHAR.match?(char.encode(Encoding::UTF_8))
    rescue EncodingError
      false
    end
  end
end
@@ -0,0 +1,153 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  # String similarity metrics for spell checking.
  #
  # Ported from Spylls (Python) string_metrics.py
  #
  # These metrics are used for:
  # - Computing word similarity
  # - Ranking suggestions
  # - N-gram based scoring
  module StringMetrics
    # Number of occurrences of the exactly same characters in exactly same position.
    #
    # @param s1 [String] First string
    # @param s2 [String] Second string
    # @return [Integer] Count of matching characters at same positions (0 if either is nil)
    #
    # @example
    #   Kotoshu::StringMetrics.commoncharacters("hello", "hallo") # => 4 ('h', 'l', 'l', 'o' match)
    def self.commoncharacters(s1, s2)
      return 0 if s1.nil? || s2.nil?

      # zip pads a shorter s2 with nil, which never equals a real character,
      # so only positions present in both strings can be counted.
      s1.chars.zip(s2.chars).count { |c1, c2| c1 == c2 }
    end

    # Size of the common start of two strings.
    #
    # @param s1 [String] First string
    # @param s2 [String] Second string
    # @return [Integer] Length of common prefix (0 if either is nil)
    #
    # @example
    #   Kotoshu::StringMetrics.leftcommonsubstring("foo", "bar")     # => 0
    #   Kotoshu::StringMetrics.leftcommonsubstring("built", "build") # => 4
    #   Kotoshu::StringMetrics.leftcommonsubstring("cat", "cats")    # => 3
    def self.leftcommonsubstring(s1, s2)
      return 0 if s1.nil? || s2.nil?

      # Count leading pairs that agree; a nil pad (s2 exhausted) ends the run.
      s1.chars.zip(s2.chars).take_while { |c1, c2| c1 == c2 }.size
    end

    # Calculate n-gram similarity between two strings.
    #
    # Calculates how many n-grams of s1 are contained in s2 (the more the number,
    # the more words are similar).
    #
    # @param max_ngram_size [Integer] Maximum n-gram size to check
    # @param s1 [String] String to compare
    # @param s2 [String] String to compare
    # @param weighted [Boolean] Subtract from result for ngrams NOT contained
    # @param any_mismatch [Boolean] Add penalty for any string length difference
    # @param longer_worse [Boolean] Add penalty when second string is longer
    # @return [Integer] N-gram similarity score (higher is more similar);
    #   0 when either string is nil or s2 is empty
    #
    # @example
    #   Kotoshu::StringMetrics.ngram(4, "hello", "help") # => 7 (4 unigrams + 2 bigrams + 1 trigram)
    #   Kotoshu::StringMetrics.ngram(4, "teachings", "teaching") # => higher score
    def self.ngram(max_ngram_size, s1, s2, weighted: false, any_mismatch: false, longer_worse: false)
      # Behave like the sibling metrics on nil instead of raising NoMethodError.
      return 0 if s1.nil? || s2.nil?

      l2 = s2.length
      return 0 if l2.zero?

      l1 = s1.length
      nscore = 0

      # For all sizes of ngram up to desired...
      (1..max_ngram_size).each do |ngram_size|
        ns = 0

        # Check every position in the first string
        (0..(l1 - ngram_size)).each do |pos|
          # If the ngram is present in ANY place in second string, increase score
          if s2.include?(s1[pos, ngram_size])
            ns += 1
          elsif weighted
            # For "weighted" ngrams, decrease score if ngram is not found
            ns -= 1
            # Decrease once more if it was the beginning or end of first string
            ns -= 1 if pos.zero? || pos + ngram_size == l1
          end
        end

        nscore += ns

        # There is no need to check for 4-gram if there were only one 3-gram
        break if ns < 2 && !weighted
      end

      # Calculate penalty based on settings; a length difference of up to 2
      # characters is tolerated before anything is subtracted.
      penalty =
        if longer_worse
          # Add penalty when second string is longer
          (l2 - l1) - 2
        elsif any_mismatch
          # Add penalty for any string length difference
          (l2 - l1).abs - 2
        else
          0
        end

      # Apply penalty if positive
      penalty.positive? ? nscore - penalty : nscore
    end

    # Calculate LCS (Longest Common Subsequence) length.
    #
    # Classic dynamic programming algorithm. This is different from
    # longest common substring - subsequence doesn't require contiguity.
    #
    # @param s1 [String] First string
    # @param s2 [String] Second string
    # @return [Integer] Length of longest common subsequence (0 if either is nil or empty)
    #
    # @example
    #   Kotoshu::StringMetrics.lcslen("AGGTAB", "GXTXAYB") # => 4 ("GTAB")
    def self.lcslen(s1, s2)
      return 0 if s1.nil? || s2.nil? || s1.empty? || s2.empty?

      row_chars = s1.chars
      col_chars = s2.chars

      # Only the previous DP row is ever consulted, so keep two rows
      # instead of the full (m + 1) x (n + 1) table.
      previous = Array.new(col_chars.length + 1, 0)

      row_chars.each do |c1|
        current = [0]
        col_chars.each_with_index do |c2, j|
          current << if c1 == c2
                       # Characters match - extend diagonal
                       previous[j] + 1
                     else
                       # Take max from top or left
                       [previous[j + 1], current[j]].max
                     end
        end
        previous = current
      end

      previous.last
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Suggestions
    # Value holder handed to every suggestion strategy.
    # Bundles the word under inspection, the dictionary to consult, the
    # result limit, and any extra keyword options supplied by the caller.
    class Context
      attr_reader :word, :dictionary, :max_results, :options

      # @param word [String] The word suggestions are generated for
      # @param dictionary [Object] The dictionary to search
      # @param max_results [Integer] Upper bound on results (default: 10)
      # @param options [Hash] Any further keyword options, kept verbatim
      def initialize(word:, dictionary:, max_results: 10, **options)
        @word, @dictionary = word, dictionary
        @max_results = max_results
        @options = options
      end

      # Read a single option.
      #
      # @param key [Symbol] The option key
      # @param default [Object] Value returned when the key is absent
      # @return [Object] The stored value, or +default+
      def option(key, default = nil)
        has_option?(key) ? options[key] : default
      end

      # Tell whether an option was supplied.
      #
      # @param key [Symbol] The option key
      # @return [Boolean] True if the key is present
      def has_option?(key)
        options.key?(key)
      end

      # Hash representation of the context.
      #
      # @return [Hash] Keys +:word+, +:dictionary+, +:max_results+, +:options+
      def to_h
        { word: word, dictionary: dictionary, max_results: max_results, options: options }
      end

      # Short human-readable description.
      #
      # @return [String] Inspection string
      def inspect
        format("Context(word: '%s', max_results: %s)", word, max_results)
      end
      alias_method :to_s, :inspect
    end
  end
end
@@ -0,0 +1,175 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "context"
4
+ require_relative "suggestion_set"
5
+ require_relative "strategies/base_strategy"
6
+ require_relative "strategies/composite_strategy"
7
+ require_relative "strategies/edit_distance_strategy"
8
+ require_relative "strategies/phonetic_strategy"
9
+ require_relative "strategies/keyboard_proximity_strategy"
10
+ require_relative "strategies/ngram_strategy"
11
+ require_relative "strategies/semantic_strategy"
12
+
13
module Kotoshu
  module Suggestions
    # Generator for spelling suggestions.
    #
    # This class orchestrates multiple suggestion algorithms to generate
    # comprehensive spelling suggestions.
    #
    # @example Using default algorithms
    #   generator = Generator.new(dictionary)
    #   suggestions = generator.generate("helo")
    #
    # @example Using custom algorithms
    #   custom_strategy = MyStrategy.new
    #   generator = Generator.new(dictionary, algorithms: [custom_strategy])
    class Generator
      # Built-in suggestion algorithms, used unless overridden through
      # {Generator.default_algorithms=} or the +algorithms:+ argument.
      DEFAULT_ALGORITHMS = [
        Strategies::EditDistanceStrategy,
        Strategies::PhoneticStrategy,
        Strategies::KeyboardProximityStrategy,
        Strategies::NgramStrategy
      ].freeze

      # @return [Object] The dictionary (any dictionary backend)
      attr_reader :dictionary

      # @return [Strategies::CompositeStrategy] The composite strategy
      attr_reader :strategy

      class << self
        # Set the default algorithms.
        #
        # @param algorithms [Array<Class>] Algorithm classes
        #
        # @example
        #   Generator.default_algorithms = [MyCustomStrategy]
        attr_writer :default_algorithms

        # Get the default algorithms.
        #
        # Returns a copy of the list assigned through {default_algorithms=}
        # on this class; if none was assigned, defers to the superclass and
        # finally to {DEFAULT_ALGORITHMS}.
        #
        # @return [Array<Class>] Default algorithm classes
        #
        # @example
        #   Generator.default_algorithms
        def default_algorithms
          return @default_algorithms.dup if @default_algorithms
          return superclass.default_algorithms if superclass.respond_to?(:default_algorithms)

          DEFAULT_ALGORITHMS.dup
        end
      end

      # Create a new suggestion generator.
      #
      # @param dictionary [Object] The dictionary instance
      # @param algorithms [Array<Class, Strategies::BaseStrategy>, nil] Algorithm classes or instances
      # @param max_suggestions [Integer] Maximum suggestions to return
      # @param config [Hash] Configuration options
      # @raise [ArgumentError] If an entry of +algorithms+ is neither a strategy
      #   instance nor a strategy class
      def initialize(dictionary, algorithms: nil, max_suggestions: 10, **config)
        @dictionary = dictionary
        @max_suggestions = max_suggestions
        # Honour the class-level default (which the setter can override)
        # when the caller does not pass an explicit list.
        @strategy = build_strategy(algorithms || self.class.default_algorithms, config)
      end

      # Generate suggestions for a word.
      #
      # @param word [String] The misspelled word
      # @param max_suggestions [Integer] Maximum suggestions (optional)
      # @return [SuggestionSet] Generated suggestions (empty for nil/empty input)
      #
      # @example
      #   generator.generate("helo")
      #   # => #<Kotoshu::Suggestions::SuggestionSet ...>
      def generate(word, max_suggestions: nil)
        return SuggestionSet.empty if word.nil? || word.empty?

        context = Context.new(
          word: word,
          dictionary: @dictionary,
          max_results: max_suggestions || @max_suggestions
        )

        @strategy.generate(context)
      end

      # Alias for generate for API consistency.
      #
      # @param word [String] The misspelled word
      # @param max_suggestions [Integer] Maximum suggestions (optional)
      # @return [SuggestionSet] Generated suggestions
      #
      # @example
      #   generator.suggest("helo")
      #   # => #<Kotoshu::Suggestions::SuggestionSet ...>
      alias suggest generate

      # Check if a word is correct.
      #
      # @param word [String] The word to check
      # @return [Boolean] True if the word is in the dictionary
      #
      # @example
      #   generator.correct?("hello") # => true
      #   generator.correct?("helo")  # => false
      def correct?(word)
        return false if word.nil? || word.empty?

        dictionary_lookup(word)
      end

      # Check if a word is incorrect.
      #
      # @param word [String] The word to check
      # @return [Boolean] True if the word is not in the dictionary
      def incorrect?(word)
        !correct?(word)
      end
      alias misspelled? incorrect?

      private

      # Build the composite strategy from algorithm classes.
      #
      # Instances are added as given; classes are instantiated with +config+.
      #
      # @param algorithms [Array<Class, Strategies::BaseStrategy>] Algorithm classes or instances
      # @param config [Hash] Configuration options
      # @return [Strategies::CompositeStrategy] The composite strategy
      # @raise [ArgumentError] If an entry is neither a strategy instance nor a strategy class
      def build_strategy(algorithms, config)
        composite = Strategies::CompositeStrategy.new(name: :default, **config)

        algorithms.each do |alg|
          strategy = if alg.is_a?(Strategies::BaseStrategy)
                       alg
                     elsif alg.is_a?(Class) && alg < Strategies::BaseStrategy
                       alg.new(**config)
                     else
                       raise ArgumentError, "Invalid algorithm: #{alg.inspect}"
                     end

          composite.add(strategy)
        end

        composite
      end

      # Look up a word in the dictionary.
      #
      # Prefers the backend's +lookup+ method and falls back to +include?+,
      # which also covers plain Hash (key membership) and Array dictionaries.
      #
      # @param word [String] The word
      # @return [Boolean] True if found; false when the dictionary supports neither method
      def dictionary_lookup(word)
        if @dictionary.respond_to?(:lookup)
          @dictionary.lookup(word)
        elsif @dictionary.respond_to?(:include?)
          @dictionary.include?(word)
        else
          false
        end
      end
    end
  end
end