kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,237 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Dictionary
5
+ # Base class for all dictionary backends.
6
+ #
7
+ # This abstract class defines the interface that all dictionary
8
+ # implementations must follow.
9
+ #
10
+ # @note Subclasses must implement the abstract methods: {#lookup},
11
+ # {#suggest}, {#add_word}, and {#remove_word}.
12
+ #
13
+ # @example Implementing a custom dictionary
14
+ # class MyDictionary < Base
15
+ # def initialize(path, language_code:, locale: nil)
16
+ # super(language_code, locale: locale)
17
+ # @words = load_words(path)
18
+ # end
19
+ #
20
+ # def lookup(word)
21
+ # @words.include?(word.downcase)
22
+ # end
23
+ #
24
+ # # ... implement other abstract methods
25
+ # end
26
class Base
  # @return [String] The language code (e.g., "en-US", "en-GB")
  attr_reader :language_code

  # @return [String, nil] The locale (e.g., "en", "en_US")
  attr_reader :locale

  # @return [Hash] Additional metadata (frozen copy of what was passed in)
  attr_reader :metadata

  # Create a new dictionary.
  #
  # @param language_code [String] The language code (e.g., "en-US")
  # @param locale [String, nil] The locale (optional)
  # @param metadata [Hash, nil] Additional metadata (optional; nil is treated as empty)
  # @raise [ArgumentError] If language_code is nil or empty
  def initialize(language_code, locale: nil, metadata: {})
    raise ArgumentError, "Language code cannot be empty" if language_code.nil? || language_code.empty?

    # Frozen copies so later mutation of the caller's objects cannot leak in.
    @language_code = language_code.dup.freeze
    @locale = locale&.dup&.freeze
    @metadata = (metadata || {}).dup.freeze
  end

  # Check if a word exists in the dictionary.
  #
  # @abstract Subclasses must implement this method.
  # @param word [String] The word to look up
  # @return [Boolean] True if the word exists
  # @raise [NotImplementedError] Subclass must implement
  def lookup(word)
    raise NotImplementedError, "#{self.class} must implement #lookup"
  end

  # Predicate form of {#lookup}.
  #
  # @param word [String] The word to look up
  # @return [Boolean] True if the word exists
  def lookup?(word)
    lookup(word)
  end

  # The convenience names below are real methods rather than `alias`es:
  # `alias` captures Base's own (raising) implementation at definition time,
  # so an alias would ignore the subclass override and always raise
  # NotImplementedError.

  # @param word [String] The word to look up
  # @return [Boolean] Same result as {#lookup}
  def has_word?(word)
    lookup(word)
  end

  # @param word [String] The word to look up
  # @return [Boolean] Same result as {#lookup}
  def include?(word)
    lookup(word)
  end

  # @param word [String] The word to look up
  # @return [Boolean] Same result as {#lookup}
  def contains?(word)
    lookup(word)
  end

  # Generate spelling suggestions for a word.
  #
  # @abstract Subclasses must implement this method.
  # @param word [String] The misspelled word
  # @param max_suggestions [Integer] Maximum number of suggestions
  # @return [Array<String>] List of suggested words
  # @raise [NotImplementedError] Subclass must implement
  def suggest(word, max_suggestions: 10)
    raise NotImplementedError, "#{self.class} must implement #suggest"
  end

  # Add a word to the dictionary.
  #
  # @abstract Subclasses must implement this method.
  # @param word [String] The word to add
  # @param flags [Array<String>] Morphological flags (optional)
  # @return [Boolean] True if the word was added
  # @raise [NotImplementedError] Subclass must implement
  def add_word(word, flags: [])
    raise NotImplementedError, "#{self.class} must implement #add_word"
  end

  # Operator form of {#add_word}; dispatches dynamically so the subclass
  # implementation is the one that runs.
  #
  # @param word [String] The word to add
  # @param flags [Array<String>] Morphological flags (optional)
  # @return [Boolean] Whatever {#add_word} returns
  def <<(word, flags: [])
    add_word(word, flags: flags)
  end

  # Remove a word from the dictionary.
  #
  # @abstract Subclasses must implement this method.
  # @param word [String] The word to remove
  # @return [Boolean] True if the word was removed
  # @raise [NotImplementedError] Subclass must implement
  def remove_word(word)
    raise NotImplementedError, "#{self.class} must implement #remove_word"
  end

  # Get all words in the dictionary.
  #
  # @abstract Subclasses must implement this method.
  # @return [Array<String>] All words
  # @raise [NotImplementedError] Subclass must implement
  def words
    raise NotImplementedError, "#{self.class} must implement #words"
  end

  # @return [Array<String>] Same result as {#words}
  def all_words
    words
  end

  # Get the number of words in the dictionary.
  #
  # @return [Integer] Word count (length of {#words})
  def size
    words.length
  end

  # @return [Integer] Same result as {#size}
  def count
    size
  end

  # @return [Integer] Same result as {#size}
  def length
    size
  end

  # Check if the dictionary is empty.
  #
  # @return [Boolean] True if empty
  def empty?
    size.zero?
  end

  # Iterate over all words.
  #
  # @yield [word] Each word
  # @return [Enumerator] Enumerator if no block given
  def each_word(&block)
    return enum_for(:each_word) unless block_given?

    words.each(&block)
  end

  # Get words starting with a prefix.
  #
  # @param prefix [String] The prefix
  # @return [Array<String>] Words with the prefix
  def words_with_prefix(prefix)
    words.select { |w| w.start_with?(prefix) }
  end

  # Get words matching a pattern.
  #
  # @param pattern [Regexp] The pattern
  # @return [Array<String>] Matching words
  def words_matching(pattern)
    words.select { |w| w.match?(pattern) }
  end

  # Convert to string.
  #
  # The size is reported as "unknown" when {#words} is not implemented, so
  # printing or inspecting a partially implemented dictionary does not raise.
  #
  # @return [String] String representation
  def to_s
    size_label = begin
      size
    rescue NotImplementedError
      "unknown"
    end

    "#{self.class.name}(language: #{@language_code}, size: #{size_label})"
  end

  # @return [String] Same text as {#to_s}
  def inspect
    to_s
  end

  # Dictionary type identifier, derived from the class name by inserting an
  # underscore before each capital letter that follows another character and
  # lowercasing (e.g. "UnixWords" => :unix_words).
  #
  # NOTE(review): for "CSpell" this yields :c_spell, while that class
  # registers itself under :cspell — confirm which key callers expect.
  #
  # @return [Symbol] The dictionary type
  def type
    self.class.name.split("::").last.gsub(/(.)([A-Z])/, '\1_\2').downcase.to_sym
  end

  # Register this dictionary class in the module-level registry.
  #
  # @param type_key [Symbol] The type key to register as
  # @return [Class] Whatever Kotoshu::Dictionary.register_type returns
  #
  # @example Registering a custom dictionary type
  #   MyDictionary.register_type(:my_custom)
  def self.register_type(type_key)
    Kotoshu::Dictionary.register_type(type_key, self)
  end

  # Registry of dictionary types.
  #
  # Returns the module-level registry, i.e. the one {.register_type} writes
  # to. A separate per-class hash would never be populated and would make
  # {.load} fail for every type.
  #
  # @return [Hash] Registry of type keys to classes
  def self.registry
    Kotoshu::Dictionary.registry
  end

  # Load a dictionary by type.
  #
  # Keyword arguments (and a block, if any) are forwarded explicitly;
  # with only `*args` they would arrive at the constructor as a positional
  # Hash on Ruby 3.
  #
  # @param type [Symbol] The dictionary type
  # @param args [Array] Positional arguments for the constructor
  # @param kwargs [Hash] Keyword arguments for the constructor
  # @return [Base] The loaded dictionary
  # @raise [ConfigurationError] If type is not registered
  def self.load(type, *args, **kwargs, &block)
    klass = registry[type]
    raise ConfigurationError, "Unknown dictionary type: #{type}" unless klass

    klass.new(*args, **kwargs, &block)
  end
end
201
+
202
# Registry shared by all dictionary backends, created empty on first access.
#
# @return [Hash] Mapping of type keys to dictionary classes
def self.registry
  @registry = {} unless @registry
  @registry
end
208
+
209
# Associate a type key with a dictionary class, replacing any class
# previously stored under the same key.
#
# @param type [Symbol] The type key
# @param klass [Class] The dictionary class
# @return [Class] The class that was stored
#
# @example Registering a custom dictionary type
#   Dictionary.register_type(:my_custom, MyDictionary)
def self.register_type(type, klass)
  (@registry ||= {}).store(type, klass)
end
220
+
221
# Load a dictionary by type.
#
# Keyword arguments (and a block, if any) are forwarded explicitly. With
# only `*args`, keywords such as `language_code:` are collected into a
# positional Hash and, on Ruby 3, reach the constructor as an extra
# positional argument instead of as keywords.
#
# @param type [Symbol] The dictionary type
# @param args [Array] Positional arguments for the constructor
# @param kwargs [Hash] Keyword arguments for the constructor
# @return [Base] The loaded dictionary
# @raise [ConfigurationError] If type is not registered
#
# @example Loading a dictionary
#   dict = Dictionary.load(:unix_words, "/usr/share/dict/words",
#                          language_code: "en-US")
def self.load(type, *args, **kwargs, &block)
  klass = registry[type]
  raise ConfigurationError, "Unknown dictionary type: #{type}" unless klass

  klass.new(*args, **kwargs, &block)
end
236
+ end
237
+ end
@@ -0,0 +1,254 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+
5
+ module Kotoshu
6
+ module Dictionary
7
+ # CSpell dictionary backend.
8
+ #
9
+ # This dictionary reads CSpell-formatted dictionary files (plain text .txt
10
+ # or compressed .trie files). CSpell is the spell checker used by VS Code.
11
+ #
12
+ # File format:
13
+ # - .txt: Plain text with one word per line, # comments supported
14
+ # - .trie: Compressed trie format (DAFSA - Deterministic Acyclic Finite State Automaton)
15
+ #
16
+ # @example Creating from a text file
17
+ # dict = CSpell.new("words.txt", language_code: "en-US")
18
+ # dict.lookup?("hello") # => true
19
+ #
20
+ # @example Creating from a trie file
21
+ # dict = CSpell.new("words.trie", language_code: "en")
22
class CSpell < Base
  # @return [String, nil] Absolute path to the dictionary file
  #   (nil when the dictionary was built with {.from_words})
  attr_reader :path

  # @return [Boolean] Whether lookups are case-sensitive
  attr_reader :case_sensitive

  # @return [Core::Trie::Trie] The trie data structure
  attr_reader :trie

  # Create a new CSpell dictionary from a file.
  #
  # @param path [String] Path to the dictionary file (.txt or .trie)
  # @param language_code [String] The language code
  # @param locale [String, nil] The locale (optional)
  # @param case_sensitive [Boolean] Whether lookups are case-sensitive
  # @param metadata [Hash] Additional metadata (optional)
  # @raise [DictionaryNotFoundError] If path is not an existing regular file
  def initialize(path, language_code:, locale: nil, case_sensitive: false, metadata: {})
    super(language_code, locale: locale, metadata: metadata)

    @path = File.expand_path(path)
    @case_sensitive = case_sensitive

    # File.file? rather than File.exist?: a directory "exists" but cannot be
    # read line by line, so reject it here with the dictionary-specific error.
    raise DictionaryNotFoundError, @path unless File.file?(@path)

    # Choose the loader from the file extension.
    @trie = @path.end_with?(".trie") ? load_trie_file(@path) : load_text_file(@path)

    # Register this dictionary type on first instantiation.
    self.class.register_type(:cspell) unless Dictionary.registry.key?(:cspell)
  end

  # Check if a word exists in the dictionary.
  #
  # @param word [String] The word to look up
  # @return [Boolean] True if the word exists (false for nil or empty input)
  def lookup(word)
    return false if word.nil? || word.empty?

    @trie.lookup(fold_case(word))
  end

  # Check if the dictionary has words with a prefix.
  #
  # @param prefix [String] The prefix
  # @return [Boolean] True if words exist with the prefix
  #   (false for nil or empty input)
  def has_prefix?(prefix)
    return false if prefix.nil? || prefix.empty?

    @trie.has_prefix?(fold_case(prefix))
  end

  # Generate spelling suggestions.
  #
  # Asks the trie for suggestions first; if that yields fewer than
  # max_suggestions, tops the list up with dictionary words within edit
  # distance 1..2 of the input, closest first.
  #
  # @param word [String] The misspelled word
  # @param max_suggestions [Integer] Maximum suggestions
  # @return [Array<String>] List of suggested words (empty for nil/empty
  #   input or a non-positive max_suggestions)
  def suggest(word, max_suggestions: 10)
    return [] if word.nil? || word.empty?
    return [] unless max_suggestions.positive?

    lookup_word = fold_case(word)

    prefix_suggestions = @trie.suggestions(lookup_word, max_results: max_suggestions)
    return prefix_suggestions if prefix_suggestions.length >= max_suggestions

    remaining = max_suggestions - prefix_suggestions.length
    fallback = edit_distance_suggestions(lookup_word, prefix_suggestions, remaining)

    (prefix_suggestions + fallback).uniq.first(max_suggestions)
  end

  # Add a word to the dictionary.
  #
  # @param word [String] The word to add
  # @param flags [Array<String>] Flags (ignored for CSpell)
  # @return [Boolean] True if added, false if nil, empty, or already present
  def add_word(word, flags: [])
    return false if word.nil? || word.empty?

    lookup_word = fold_case(word)
    return false if @trie.lookup(lookup_word)

    @trie.insert(lookup_word)
    true
  end

  # Remove a word from the dictionary.
  #
  # Removal is not implemented for this backend: nothing is changed and
  # false is always returned.
  #
  # @param _word [String] The word to remove (unused)
  # @return [Boolean] Always false
  def remove_word(_word)
    false
  end

  # Get all words in the dictionary.
  #
  # @return [Array<String>] All words, as reported by the trie
  def words
    @trie.all_words
  end

  # Get words with a prefix.
  #
  # @param prefix [String] The prefix
  # @return [Array<String>] Words with the prefix (empty for nil/empty input)
  def words_with_prefix(prefix)
    return [] if prefix.nil? || prefix.empty?

    @trie.words_with_prefix(fold_case(prefix))
  end

  # Create a dictionary from an array of words, without reading a file.
  #
  # The instance is built with `allocate`, so #initialize (which requires a
  # file path) does not run; the language code is therefore validated here
  # with the same rule and message the base constructor uses.
  #
  # @param words [Array<String>] The words
  # @param language_code [String] The language code
  # @param locale [String, nil] The locale (optional)
  # @param case_sensitive [Boolean] Whether lookups are case-sensitive
  # @return [CSpell] New dictionary
  # @raise [ArgumentError] If language_code is nil or empty
  #
  # @example
  #   dict = CSpell.from_words(%w[hello world test], language_code: "en")
  def self.from_words(words, language_code:, locale: nil, case_sensitive: false)
    raise ArgumentError, "Language code cannot be empty" if language_code.nil? || language_code.empty?

    normalized_words = words.map { |w| case_sensitive ? w : w.downcase }.uniq
    trie = Core::Trie::Builder.from_array(normalized_words)

    dict = allocate
    dict.instance_variable_set(:@language_code, language_code.dup.freeze)
    dict.instance_variable_set(:@locale, locale&.dup&.freeze)
    dict.instance_variable_set(:@path, nil)
    dict.instance_variable_set(:@case_sensitive, case_sensitive)
    dict.instance_variable_set(:@trie, trie)
    dict.instance_variable_set(:@metadata, {}.freeze)

    # Register this dictionary type (unless already registered)
    register_type(:cspell) unless Dictionary.registry.key?(:cspell)

    dict
  end

  private

  # Apply the dictionary's case policy to a string.
  #
  # @param text [String] Text to normalize
  # @return [String] text unchanged when case-sensitive, else lowercased
  def fold_case(text)
    @case_sensitive ? text : text.downcase
  end

  # Collect up to `limit` dictionary words within edit distance 1..2 of
  # lookup_word, skipping words already present in `exclude`.
  #
  # Excluding before truncating matters: otherwise slots are spent on words
  # that the final `uniq` removes, and fewer suggestions than requested come
  # back even though more candidates exist. Sorting on [distance, position]
  # keeps ties in the trie's enumeration order, so results are repeatable.
  #
  # @param lookup_word [String] Case-normalized input word
  # @param exclude [Array<String>] Words already suggested
  # @param limit [Integer] Maximum number of words to return
  # @return [Array<String>] Closest words first
  def edit_distance_suggestions(lookup_word, exclude, limit)
    shortest = lookup_word.length - 2
    longest = lookup_word.length + 2

    scored = []
    @trie.all_words.each_with_index do |candidate, position|
      # A length gap above 2 already implies an edit distance above 2.
      next unless candidate.length.between?(shortest, longest)
      next if exclude.include?(candidate)

      distance = edit_distance(lookup_word, candidate)
      scored << [distance, position, candidate] if distance.between?(1, 2)
    end

    scored.sort.first(limit).map(&:last)
  end

  # Load a text dictionary file: one word per line, surrounding whitespace
  # stripped, blank lines and lines starting with "#" skipped, duplicates
  # removed after case normalization.
  #
  # @param path [String] The file path
  # @return [Core::Trie::Trie] The loaded trie
  def load_text_file(path)
    words = File.foreach(path, chomp: true)
                .map(&:strip)
                .reject { |line| line.empty? || line.start_with?("#") }
                .map { |entry| fold_case(entry) }
                .uniq

    Core::Trie::Builder.from_array(words)
  end

  # Load a .trie file.
  #
  # @param path [String] The file path
  # @return [Core::Trie::Trie] The loaded trie
  #
  # @note The compressed CSpell .trie format is not parsed; the file is
  #   read as plain text via {#load_text_file}.
  def load_trie_file(path)
    load_text_file(path)
  end

  # Calculate Levenshtein edit distance, keeping only two rows of the
  # dynamic-programming table at a time.
  #
  # @param str1 [String] First string
  # @param str2 [String] Second string
  # @return [Integer] Edit distance
  def edit_distance(str1, str2)
    return str2.length if str1.empty?
    return str1.length if str2.empty?

    # Keep the shorter string on the inner loop so rows stay small.
    str1, str2 = str2, str1 if str1.length > str2.length

    previous = (0..str1.length).to_a

    str2.each_char.with_index do |char2, j|
      current = [j + 1]

      str1.each_char.with_index do |char1, i|
        insert_cost = current[i] + 1
        delete_cost = previous[i + 1] + 1
        substitute_cost = previous[i] + (char1 == char2 ? 0 : 1)

        current << [insert_cost, delete_cost, substitute_cost].min
      end

      previous = current
    end

    previous.last
  end
end
253
+ end
254
+ end
@@ -0,0 +1,224 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+
5
+ module Kotoshu
6
+ module Dictionary
7
+ # Custom in-memory dictionary.
8
+ #
9
+ # This is a simple dictionary that stores words in memory,
10
+ # designed for runtime customization and user-defined words.
11
+ #
12
+ # @example Creating an empty dictionary
13
+ # dict = Custom.new(language_code: "en-US")
14
+ # dict.add_word("Kotoshu")
15
+ # dict.lookup?("Kotoshu") # => true
16
+ #
17
+ # @example Creating with initial words
18
+ # dict = Custom.new(words: %w[hello world], language_code: "en")
19
+ # dict.lookup?("hello") # => true
20
+ class Custom < Base
21
+ # @return [Boolean] Whether lookups are case-sensitive
22
+ attr_reader :case_sensitive
23
+
24
# Build an in-memory dictionary, optionally seeded with words.
#
# @param language_code [String] The language code
# @param words [Array<String>] Initial words (optional)
# @param locale [String, nil] The locale (optional)
# @param case_sensitive [Boolean] Whether lookups are case-sensitive
# @param metadata [Hash] Additional metadata (optional)
def initialize(language_code:, words: [], locale: nil, case_sensitive: false, metadata: {})
  super(language_code, locale: locale, metadata: metadata)

  # The case policy must be set first: word normalization reads it.
  @case_sensitive = case_sensitive
  @words = normalize_words(words)
  @word_set = build_word_set

  # Make the :custom type key known to the registry on first use.
  return if Dictionary.registry.key?(:custom)

  self.class.register_type(:custom)
end
41
+
42
# Check if a word exists in the dictionary.
#
# The query goes through the same normalization as stored words (trimmed,
# and lowercased unless case-sensitive), so any input that add_word
# accepted is also found by lookup with that same input.
#
# @param word [String] The word to look up
# @return [Boolean] True if the word exists (false for nil, empty, or
#   whitespace-only input)
def lookup(word)
  key = normalize_word(word)
  return false if key.nil?

  @word_set.key?(key)
end
52
+
53
# Generate spelling suggestions.
#
# Candidates are stored words that start with the input minus its last
# character (but at least its first two characters); of those, words at
# edit distance 1 or 2 are returned, closest first, ties in insertion order.
#
# NOTE(review): because the prefix is nearly the whole input, a typo
# anywhere but the last character yields no candidates — confirm this is
# the intended heuristic.
#
# @param word [String] The misspelled word
# @param max_suggestions [Integer] Maximum suggestions
# @return [Array<String>] List of suggested words (empty for nil/empty
#   input or a non-positive max_suggestions)
def suggest(word, max_suggestions: 10)
  return [] if word.nil? || word.empty?
  # Array#first raises ArgumentError on a negative count; treat it as "none".
  return [] unless max_suggestions.positive?

  lookup_word = @case_sensitive ? word : word.downcase

  prefix_len = [lookup_word.length - 1, 2].max
  prefix = lookup_word[0...prefix_len]
  # A length gap above 2 already implies an edit distance above 2, so such
  # words are skipped before the distance is computed.
  length_window = (lookup_word.length - 2)..(lookup_word.length + 2)

  scored = []
  @words.each_with_index do |dict_word, position|
    next unless dict_word.start_with?(prefix) && length_window.cover?(dict_word.length)

    dist = edit_distance(lookup_word, dict_word)
    scored << [dist, position, dict_word] if dist.between?(1, 2)
  end

  # Sorting on [distance, position] makes tie order deterministic.
  scored.sort.map(&:last).uniq.first(max_suggestions)
end
79
+
80
# Add a word to the dictionary.
#
# The word is normalized (trimmed, and lowercased unless case-sensitive)
# before storage. Input that normalizes to nothing — nil, empty, or
# whitespace-only — is rejected rather than stored as a nil entry.
#
# @param word [String] The word to add
# @param flags [Array<String>] Flags (ignored for Custom)
# @return [Boolean] True if added, false if invalid or already present
def add_word(word, flags: [])
  lookup_word = normalize_word(word)
  return false if lookup_word.nil? || @word_set.key?(lookup_word)

  @words << lookup_word
  @word_set[lookup_word] = @words.length - 1

  true
end
96
+
97
# Remove a word from the dictionary.
#
# Deleting from the word array shifts every later element down by one, so
# the word-to-index map is rebuilt afterwards. Leaving the old indexes in
# place would make a later removal delete a different word than the one
# requested.
#
# @param word [String] The word to remove
# @return [Boolean] True if removed, false if invalid or not present
def remove_word(word)
  return false if word.nil? || word.empty?

  lookup_word = normalize_word(word)
  return false unless @word_set.key?(lookup_word)

  # Array#delete drops every equal entry, so duplicate copies go too.
  @words.delete(lookup_word)

  # Re-index in place so positions match the array again.
  @word_set.clear
  @words.each_with_index { |entry, position| @word_set[entry] = position }

  true
end
112
+
113
# List every stored word.
#
# A fresh array is returned each time, so callers can modify the result
# without touching the dictionary's own storage.
#
# @return [Array<String>] All words
def words
  Array.new(@words)
end
119
+
120
# Empty the dictionary: both the word list and the lookup index.
#
# @return [self] Self for chaining
def clear
  [@words, @word_set].each(&:clear)
  self
end
128
+
129
# Report whether this dictionary refuses modification.
#
# @return [Boolean] Always false for a Custom dictionary
def readonly?
  false
end
135
+
136
# Add every word from another dictionary, or from an array of words, to
# this dictionary. Any other kind of argument contributes nothing.
#
# @param other [Base, Array<String>] Dictionary or words to merge
# @return [self] Self for chaining
#
# @example Merging another dictionary
#   dict1 = Custom.new(words: %w[hello], language_code: "en")
#   dict2 = Custom.new(words: %w[world], language_code: "en")
#   dict1.merge(dict2)
#
# @example Merging an array of words
#   dict.merge(%w[test example])
def merge(other)
  incoming =
    case other
    when Base then other.words
    when Array then other
    else []
    end

  incoming.each { |entry| add_word(entry) }

  self
end
161
+
162
+ private
163
+
164
# Normalize words for storage.
#
# Each word goes through normalize_word; invalid entries (those that
# normalize to nil) are dropped, and duplicates are removed keeping the
# first occurrence. Without de-duplication the word list would hold
# repeated entries while the index keeps only one position per word, so
# the size would be inflated and copies would outlive a removal.
#
# @param words [Array<String>] Words to normalize
# @return [Array<String>] Unique normalized words, in first-seen order
def normalize_words(words)
  words.map { |w| normalize_word(w) }.compact.uniq
end
171
+
172
# Bring one word into its stored form: surrounding whitespace removed and,
# unless the dictionary is case-sensitive, lowercased.
#
# @param word [String] The word to normalize
# @return [String, nil] Normalized word, or nil when the input is nil,
#   empty, or whitespace-only
def normalize_word(word)
  return nil if word.nil? || word.empty?

  trimmed = word.strip

  if trimmed.empty?
    nil
  elsif @case_sensitive
    trimmed
  else
    trimmed.downcase
  end
end
184
+
185
# Build the word-to-position index used for constant-time membership
# checks. When a word occurs more than once, its last position is kept.
#
# @return [Hash] Word to index mapping
def build_word_set
  index = {}
  @words.each_with_index { |entry, position| index[entry] = position }
  index
end
191
+
192
# Levenshtein edit distance between two strings, computed row by row so
# only the previous row of the table is needed at any time.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Integer] Edit distance
def edit_distance(str1, str2)
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  # The shorter string runs along each row, the longer one down the rows.
  shorter, longer = str1.length > str2.length ? [str2, str1] : [str1, str2]
  short_chars = shorter.chars
  first_row = (0..shorter.length).to_a

  last_row = longer.each_char.each_with_index.inject(first_row) do |above, (long_char, row)|
    short_chars.each_with_index.each_with_object([row + 1]) do |(short_char, col), cells|
      step = short_char == long_char ? 0 : 1
      cells << [cells[col] + 1, above[col + 1] + 1, above[col] + step].min
    end
  end

  last_row.last
end
222
+ end
223
+ end
224
+ end