kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,702 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "core/exceptions"
4
+ require_relative "dictionary/base"
5
+ require_relative "dictionary/unix_words"
6
+ require_relative "dictionary/plain_text"
7
+ require_relative "dictionary/custom"
8
+ require_relative "dictionary/hunspell"
9
+ require_relative "dictionary/cspell"
10
+ require_relative "configuration/resolver"
11
+
12
+ module Kotoshu
13
+ # Configuration for Kotoshu spell checker.
14
+ #
15
+ # This class manages configuration options for spell checking,
16
+ # including dictionary settings, suggestion limits, and language options.
17
+ #
18
+ # Configuration priority: CLI > ENV > Programmatic > Defaults
19
+ #
20
+ # @example Creating a configuration
21
+ # config = Configuration.new
22
+ # config.dictionary_path = "/usr/share/dict/words"
23
+ # config.dictionary_type = :unix_words
24
+ # config.max_suggestions = 15
25
+ #
26
+ # @example Using a block
27
+ # Configuration.new do |c|
28
+ # c.dictionary_path = "words.txt"
29
+ # c.language = "en-US"
30
+ # end
31
+ #
32
+ # @example Using environment variables
33
+ # ENV['KOTOSHU_LANGUAGE'] = 'de'
34
+ # config = Configuration.new
35
+ # config.language # => 'de'
36
+ class Configuration
37
+ # Configuration schema with ENV variable mappings.
38
+ #
39
+ # Each key maps to a hash with:
40
+ # - :env - Environment variable name
41
+ # - :default - Default value (can be a proc for dynamic defaults)
42
+ # - :description - Human-readable description
43
+ # - :type - Expected type (for validation/conversion)
44
+ SCHEMA = {
45
+ dictionary_path: {
46
+ env: "KOTOSHU_DICTIONARIES_PATH",
47
+ default: nil,
48
+ description: "Path to dictionary file",
49
+ type: String
50
+ },
51
+ cache_path: {
52
+ env: "KOTOSHU_CACHE_PATH",
53
+ default: -> { default_cache_path },
54
+ description: "Path to cache directory (~/.cache/kotoshu)",
55
+ type: String
56
+ },
57
+ config_path: {
58
+ env: "KOTOSHU_CONFIG_PATH",
59
+ default: -> { default_config_path },
60
+ description: "Path to user config directory (~/.config/kotoshu)",
61
+ type: String
62
+ },
63
+ data_path: {
64
+ env: "KOTOSHU_DATA_PATH",
65
+ default: -> { default_data_path },
66
+ description: "Path to data directory (~/.local/share/kotoshu)",
67
+ type: String
68
+ },
69
+ dictionaries_url: {
70
+ env: "KOTOSHU_DICTIONARIES_URL",
71
+ default: "https://raw.githubusercontent.com/kotoshu/dictionaries/main",
72
+ description: "Deprecated: use repos_base_url + dictionaries_pin via SourceRegistry",
73
+ type: String
74
+ },
75
+ models_url: {
76
+ env: "KOTOSHU_MODELS_URL",
77
+ default: "https://github.com/kotoshu/models-fasttext-onnx/raw/main",
78
+ description: "Deprecated: use repos_base_url + models_pin via SourceRegistry",
79
+ type: String
80
+ },
81
+ repos_base_url: {
82
+ env: "KOTOSHU_REPOS_BASE_URL",
83
+ default: -> { Kotoshu::SourceRegistry::DEFAULT_BASE_URL },
84
+ description: "GitHub raw root for all kotoshu content repos",
85
+ type: String
86
+ },
87
+ dictionaries_pin: {
88
+ env: "KOTOSHU_DICTIONARIES_PIN",
89
+ default: "v1",
90
+ description: "Branch/tag/commit pinned for kotoshu/dictionaries",
91
+ type: String
92
+ },
93
+ frequency_pin: {
94
+ env: "KOTOSHU_FREQUENCY_PIN",
95
+ default: "main",
96
+ description: "Branch/tag/commit pinned for kotoshu/frequency-list-kelly",
97
+ type: String
98
+ },
99
+ models_pin: {
100
+ env: "KOTOSHU_MODELS_PIN",
101
+ default: "main",
102
+ description: "Branch/tag/commit pinned for kotoshu/models-fasttext-onnx",
103
+ type: String
104
+ },
105
+ auto_download: {
106
+ env: "KOTOSHU_AUTO_DOWNLOAD",
107
+ default: true,
108
+ description: "Automatically download missing dictionaries",
109
+ type: :boolean
110
+ },
111
+ cache_ttl: {
112
+ env: "KOTOSHU_CACHE_TTL",
113
+ default: 86_400, # 24 hours in seconds
114
+ description: "Cache TTL in seconds",
115
+ type: Integer
116
+ },
117
+ max_cache_size: {
118
+ env: "KOTOSHU_MAX_CACHE_SIZE",
119
+ default: 1_073_741_824, # 1GB
120
+ description: "Maximum cache size in bytes",
121
+ type: Integer
122
+ },
123
+ dictionary_type: {
124
+ env: "KOTOSHU_DICTIONARY_TYPE",
125
+ default: :unix_words,
126
+ description: "Dictionary type (:unix_words, :plain_text, :hunspell, :cspell, :custom)",
127
+ type: Symbol
128
+ },
129
+ language: {
130
+ env: "KOTOSHU_LANGUAGE",
131
+ default: "en-US",
132
+ description: "Language code (e.g., en-US, de-DE, ja-JP)",
133
+ type: String
134
+ },
135
+ locale: {
136
+ env: "KOTOSHU_LOCALE",
137
+ default: nil,
138
+ description: "Locale (e.g., en, en_US, de_DE)",
139
+ type: String
140
+ },
141
+ max_suggestions: {
142
+ env: "KOTOSHU_MAX_SUGGESTIONS",
143
+ default: 10,
144
+ description: "Maximum number of suggestions",
145
+ type: Integer
146
+ },
147
+ case_sensitive: {
148
+ env: "KOTOSHU_CASE_SENSITIVE",
149
+ default: false,
150
+ description: "Enable case-sensitive lookups",
151
+ type: :boolean
152
+ },
153
+ verbose: {
154
+ env: "KOTOSHU_VERBOSE",
155
+ default: false,
156
+ description: "Enable verbose output",
157
+ type: :boolean
158
+ },
159
+ encoding: {
160
+ env: "KOTOSHU_ENCODING",
161
+ default: "UTF-8",
162
+ description: "Character encoding",
163
+ type: String
164
+ },
165
+ dictionaries_path: {
166
+ env: "KOTOSHU_DICTIONARIES_PATH",
167
+ default: nil,
168
+ description: "Path to dictionaries directory (for grammar rules)",
169
+ type: String
170
+ },
171
+ offline: {
172
+ env: "KOTOSHU_OFFLINE",
173
+ default: false,
174
+ description: "Use only cached resources; never download",
175
+ type: :boolean
176
+ },
177
+ resource_pin: {
178
+ env: "KOTOSHU_RESOURCE_PIN",
179
+ default: "main",
180
+ description: "Branch/tag/commit pinned for resource downloads",
181
+ type: String
182
+ },
183
+ default_language: {
184
+ env: "KOTOSHU_DEFAULT_LANGUAGE",
185
+ default: "en",
186
+ description: "Fallback language when detection is inconclusive",
187
+ type: String
188
+ }
189
+ }.freeze
190
+
191
+ # Default configuration values (legacy, for backward compatibility).
192
+ DEFAULTS = {
193
+ dictionary_path: nil,
194
+ dictionary_type: :unix_words,
195
+ language: "en-US",
196
+ locale: nil,
197
+ max_suggestions: 10,
198
+ case_sensitive: false,
199
+ verbose: false,
200
+ suggestion_algorithms: nil, # Use defaults
201
+ custom_words: [],
202
+ encoding: "UTF-8",
203
+ dictionaries_path: nil, # Path to dictionaries directory (for grammar rules)
204
+ offline: false,
205
+ default_language: "en",
206
+ resource_pin: "main"
207
+ }.freeze
208
+
209
+ # @return [String, nil] Path to the dictionary file
210
+ attr_accessor :dictionary_path
211
+
212
+ # @return [Symbol] Dictionary type (:unix_words, :plain_text, :hunspell, :cspell, :custom)
213
+ attr_accessor :dictionary_type
214
+
215
+ # @return [String] Language code (e.g., "en-US", "en-GB")
216
+ attr_accessor :language
217
+
218
+ # @return [String, nil] Locale (e.g., "en", "en_US")
219
+ attr_accessor :locale
220
+
221
+ # @return [Integer] Maximum number of suggestions to return
222
+ attr_accessor :max_suggestions
223
+
224
+ # @return [Boolean] Whether lookups are case-sensitive
225
+ attr_accessor :case_sensitive
226
+
227
+ # @return [Boolean] Whether to enable verbose output
228
+ attr_accessor :verbose
229
+
230
+ # @return [Array<Class>, nil] Suggestion algorithms to use
231
+ attr_accessor :suggestion_algorithms
232
+
233
+ # @return [Array<String>] Custom words to add to dictionary
234
+ attr_accessor :custom_words
235
+
236
+ # @return [String] Character encoding
237
+ attr_accessor :encoding
238
+
239
+ # @return [Dictionary::Base, nil] The loaded dictionary (lazy loaded)
240
+ attr_accessor :dictionary
241
+
242
+ # @return [String, nil] Path to dictionaries directory (for grammar rules)
243
+ attr_accessor :dictionaries_path
244
+
245
+ # @return [Boolean] Whether to use only cached resources (no downloads)
246
+ attr_accessor :offline
247
+
248
+ # @return [String] Fallback language when detection is inconclusive
249
+ attr_accessor :default_language
250
+
251
+ # @return [String] Branch/tag/commit pinned for resource downloads
252
+ attr_accessor :resource_pin
253
+
254
+ # @return [String, nil] Path to cache directory
255
+ attr_accessor :cache_path
256
+
257
+ # @return [String, nil] Path to user config directory
258
+ attr_accessor :config_path
259
+
260
+ # @return [String, nil] Path to data directory (audit log, etc.)
261
+ attr_accessor :data_path
262
+
263
+ # @return [String] Base URL for downloading dictionaries (deprecated)
264
+ attr_accessor :dictionaries_url
265
+
266
+ # @return [String] Base URL for FastText ONNX models (deprecated)
267
+ attr_accessor :models_url
268
+
269
+ # @return [String] GitHub raw root for all kotoshu content repos
270
+ attr_accessor :repos_base_url
271
+
272
+ # @return [String] Pin for kotoshu/dictionaries
273
+ attr_accessor :dictionaries_pin
274
+
275
+ # @return [String] Pin for kotoshu/frequency-list-kelly
276
+ attr_accessor :frequency_pin
277
+
278
+ # @return [String] Branch/tag/commit pinned for model downloads
279
+ attr_accessor :models_pin
280
+
281
+ # @return [#start,#update,#maybe_report_periodic,#finish,nil]
282
+ # Optional progress reporter for downloads. Typically set by the
283
+ # CLI (Cli::ProgressReporter) for human-facing setup runs; nil
284
+ # (silent) for programmatic API usage.
285
+ attr_accessor :download_reporter
286
+
287
+ # @return [Boolean] Whether to automatically download missing dictionaries
288
+ attr_accessor :auto_download
289
+
290
+ # @return [Integer] Cache TTL in seconds
291
+ attr_accessor :cache_ttl
292
+
293
+ # @return [Integer] Maximum cache size in bytes
294
+ attr_accessor :max_cache_size
295
+
296
+ # @return [Resolver] The configuration resolver
297
+ attr_reader :resolver
298
+
# Create a new configuration.
#
# Values are applied in passes, each one overriding the previous:
# 1. built-in defaults (apply_defaults)
# 2. values obtained from the resolver for every SCHEMA key
# 3. explicit settings passed to the constructor
# 4. +cli_options+, so command-line values win over programmatic ones
# The optional block runs last and may override anything.
#
# @param args [Array] Variable arguments (positional hash or nothing)
# @param kwargs [Hash] Keyword arguments for configuration
# @param block [Proc] Optional block for configuration
#
# @example With hash
#   config = Configuration.new(
#     dictionary_path: "/usr/share/dict/words",
#     language: "en-US"
#   )
#
# @example With block
#   Configuration.new do |c|
#     c.dictionary_path = "words.txt"
#     c.max_suggestions = 15
#   end
#
# @example With CLI options (higher priority)
#   config = Configuration.new(
#     language: "en-US",
#     cli_options: { language: "ja" } # ja will be used
#   )
def initialize(*args, **kwargs, &block)
  # Accept both a positional hash and keyword arguments. Hash#merge returns
  # a new hash, so the caller's hash is not mutated by the delete below.
  settings = args.first.is_a?(Hash) ? args.first : {}
  settings = settings.merge(kwargs)

  # cli_options is removed from the settings so it is never treated as a
  # regular configuration key.
  cli_options = settings.delete(:cli_options) || {}

  @resolver = Resolver.new(
    env: settings[:env] || {},
    programmatic: settings,
    cli: cli_options,
    defaults: DEFAULTS
  )

  apply_defaults
  apply_resolver_values
  apply_explicit_settings(settings)

  # apply_explicit_settings has just overwritten anything the resolver
  # produced, including values that came from the command line. Re-apply
  # the CLI options so that they keep the highest priority.
  cli_options.each do |key, value|
    next if value.nil?

    setter = "#{key}="
    send(setter, convert_schema_value(key, value)) if respond_to?(setter)
  end

  yield self if block_given?
end
344
+
345
+ # Get a configuration value using the resolver.
346
+ #
347
+ # This respects the priority: CLI > ENV > Programmatic > Defaults
348
+ #
349
+ # @param key [Symbol] The configuration key
350
+ # @return [Object] The resolved value
351
+ #
352
+ # @example
353
+ # config.get(:language) # => resolved language value
354
+ def get(key)
355
+ @resolver.get(key)
356
+ end
357
+
358
+ # Check if a configuration key has a value set.
359
+ #
360
+ # @param key [Symbol] The configuration key
361
+ # @return [Boolean] True if the key is set somewhere
362
+ def key?(key)
363
+ @resolver.key?(key)
364
+ end
365
+
366
+ # Load or get the dictionary.
367
+ #
368
+ # @return [Dictionary::Base] The loaded dictionary
369
+ def dictionary
370
+ @dictionary ||= load_dictionary
371
+ end
372
+
# Load the dictionary based on configuration.
#
# Builds a new dictionary for the configured +dictionary_type+ and then
# adds every entry of +custom_words+ to it. Nothing is cached here; the
# #dictionary reader memoizes the result.
#
# +dictionary_type+ may be a Symbol or a String naming the type.
#
# @return [Dictionary::Base] The loaded dictionary
# @raise [DictionaryNotFoundError] If dictionary file not found
# @raise [ConfigurationError] If dictionary type is invalid
def load_dictionary
  # Dispatch on the type's name so that "hunspell" (for example a value
  # that arrived as text) selects the same loader as :hunspell.
  dict = case @dictionary_type.to_s
         when "unix_words" then load_unix_words_dictionary
         when "plain_text" then load_plain_text_dictionary
         when "custom" then load_custom_dictionary
         when "hunspell" then load_hunspell_dictionary
         when "cspell" then load_cspell_dictionary
         else
           raise ConfigurationError, "Unknown dictionary type: #{@dictionary_type}"
         end

  # custom_words is a public accessor and may have been assigned nil;
  # Array() turns that into an empty list instead of raising.
  Array(@custom_words).each { |word| dict.add_word(word) }

  dict
end
401
+
402
+ # Reset the dictionary (force reload on next access).
403
+ #
404
+ # @return [self] Self for chaining
405
+ def reset_dictionary
406
+ @dictionary = nil
407
+ self
408
+ end
409
+
# Convert to hash.
#
# Exports every plain configuration value held by this instance.
# +suggestion_algorithms+ is exported as the result of calling +name+ on
# each entry rather than the entries themselves. The loaded dictionary and
# the download reporter are not part of the hash.
#
# @return [Hash] Hash representation
def to_h
  {
    dictionary_path: @dictionary_path,
    dictionary_type: @dictionary_type,
    language: @language,
    locale: @locale,
    max_suggestions: @max_suggestions,
    case_sensitive: @case_sensitive,
    verbose: @verbose,
    suggestion_algorithms: @suggestion_algorithms&.map(&:name),
    custom_words: @custom_words,
    encoding: @encoding,
    dictionaries_path: @dictionaries_path,
    cache_path: @cache_path,
    config_path: @config_path,
    data_path: @data_path,
    dictionaries_url: @dictionaries_url,
    models_url: @models_url,
    repos_base_url: @repos_base_url,
    dictionaries_pin: @dictionaries_pin,
    frequency_pin: @frequency_pin,
    models_pin: @models_pin,
    auto_download: @auto_download,
    cache_ttl: @cache_ttl,
    max_cache_size: @max_cache_size,
    # These four have accessors like every key above and must be exported
    # too, otherwise a configuration rebuilt from this hash loses them.
    offline: @offline,
    default_language: @default_language,
    resource_pin: @resource_pin
  }
end
439
+
440
+ # Build a SourceRegistry honoring this configuration's base URL and
441
+ # per-repo pins. Single source of truth for all resource URLs.
442
+ #
443
+ # @return [Kotoshu::SourceRegistry]
444
+ def source_registry
445
+ Kotoshu::SourceRegistry.new(
446
+ base_url: @repos_base_url,
447
+ pins: {
448
+ "dictionaries" => @dictionaries_pin,
449
+ "frequency-list-kelly" => @frequency_pin,
450
+ "models-fasttext-onnx" => @models_pin
451
+ }
452
+ )
453
+ end
454
+
# Clone the configuration.
#
# The copy is built by passing #to_h to a new instance. Two entries are
# replaced before doing so:
# - +suggestion_algorithms+: #to_h exports names, but the new instance
#   needs the original entries, so a copy of the original list is passed.
# - +custom_words+: a copy of the array is passed so that adding words to
#   the clone does not change this configuration.
#
# @return [Configuration] A new configuration with the same settings
def clone
  self.class.new(
    to_h.merge(
      suggestion_algorithms: @suggestion_algorithms&.dup,
      custom_words: @custom_words&.dup
    )
  )
end
461
+
# Get the default configuration.
#
# Builds an instance from DEFAULTS. Array values are copied first:
# DEFAULTS itself is frozen, but the arrays inside it are not, and handing
# them to the instance would let a later +custom_words << "x"+ modify the
# constant for every configuration created afterwards.
#
# @return [Configuration] Default configuration instance
#
# @example
#   config = Configuration.default
def self.default
  new(DEFAULTS.transform_values { |value| value.is_a?(Array) ? value.dup : value })
end
471
+
472
+ # Global configuration instance.
473
+ #
474
+ # @return [Configuration] The global configuration
475
+ #
476
+ # @example
477
+ # Configuration.instance.dictionary_path = "/usr/share/dict/words"
478
+ def self.instance
479
+ @instance ||= default
480
+ end
481
+
482
+ # Reset the global configuration.
483
+ #
484
+ # @return [Configuration] The reset configuration
485
+ def self.reset
486
+ @instance = default
487
+ end
488
+
489
+ private
490
+
# Apply default values.
#
# Initializes every configuration instance variable so that each accessor
# has a defined value before resolver values and explicit settings are
# applied. Values come from DEFAULTS, from the SCHEMA defaults, or from the
# class-level default path helpers.
def apply_defaults
  # A SCHEMA default may be a literal or a Proc that computes the value.
  schema_default = lambda do |key|
    default = SCHEMA[key][:default]
    default.is_a?(Proc) ? default.call : default
  end

  @dictionary_path = DEFAULTS[:dictionary_path]
  @dictionary_type = DEFAULTS[:dictionary_type]
  @language = DEFAULTS[:language]
  @locale = DEFAULTS[:locale]
  @max_suggestions = DEFAULTS[:max_suggestions]
  @case_sensitive = DEFAULTS[:case_sensitive]
  @verbose = DEFAULTS[:verbose]
  @suggestion_algorithms = DEFAULTS[:suggestion_algorithms]
  # Copy the array so this instance never shares it with the constant.
  @custom_words = DEFAULTS[:custom_words].dup
  @encoding = DEFAULTS[:encoding]
  @dictionaries_path = DEFAULTS[:dictionaries_path]
  # Both have accessors and DEFAULTS entries; set them here so they do not
  # depend on a later pass to become defined.
  @offline = DEFAULTS[:offline]
  @default_language = DEFAULTS[:default_language]
  @dictionary = nil

  # Cache- and download-related defaults
  @cache_path = self.class.default_cache_path
  @config_path = self.class.default_config_path
  @data_path = self.class.default_data_path
  @dictionaries_url = schema_default.call(:dictionaries_url)
  @models_url = schema_default.call(:models_url)
  @repos_base_url = schema_default.call(:repos_base_url)
  @dictionaries_pin = schema_default.call(:dictionaries_pin)
  @frequency_pin = schema_default.call(:frequency_pin)
  @models_pin = schema_default.call(:models_pin)
  @download_reporter = nil
  @auto_download = schema_default.call(:auto_download)
  @cache_ttl = schema_default.call(:cache_ttl)
  @max_cache_size = schema_default.call(:max_cache_size)
  @resource_pin = schema_default.call(:resource_pin)
end
523
+
524
+ # Apply resolved values from the resolver (ENV, defaults).
525
+ def apply_resolver_values
526
+ # Apply values from ENV and defaults via resolver
527
+ SCHEMA.each_key do |key|
528
+ env_value = @resolver.get(key)
529
+ next if env_value.nil?
530
+
531
+ # Convert boolean strings if needed
532
+ value = convert_schema_value(key, env_value)
533
+ send("#{key}=", value) if respond_to?("#{key}=")
534
+ end
535
+ end
536
+
537
+ # Apply explicit settings from the settings hash.
538
+ #
539
+ # Explicit settings have priority over ENV and defaults.
540
+ def apply_explicit_settings(settings)
541
+ settings.each do |key, value|
542
+ next if key == :env || key == :cli_options
543
+
544
+ send("#{key}=", value) if respond_to?("#{key}=")
545
+ end
546
+ end
547
+
# Convert a value based on schema type.
#
# Looks up the +:type+ declared for +key+ in SCHEMA and coerces +value+:
# - +:boolean+ is parsed with #parse_boolean
# - +Integer+ converts non-Integer values with +to_i+
# - +Symbol+ converts non-Symbol values with +to_sym+
# - any other type returns the value unchanged
# Keys without a SCHEMA entry and nil values are returned unchanged.
#
# @param key [Symbol] The configuration key
# @param value [Object] The value to convert
# @return [Object] The converted value
def convert_schema_value(key, value)
  schema = SCHEMA[key]
  return value if schema.nil? || value.nil?

  # The declared type is itself a class (Integer, Symbol, String) or the
  # marker :boolean. It must be compared with ==; a case/when would test
  # `Integer === Integer`, which is false because === on a class checks
  # whether the operand is an instance of it.
  type = schema[:type]
  if type == :boolean
    parse_boolean(value)
  elsif type == Integer
    value.is_a?(Integer) ? value : value.to_i
  elsif type == Symbol
    value.is_a?(Symbol) ? value : value.to_sym
  else
    value
  end
end
568
+
# Parse a boolean value from string.
#
# Returns true for +true+ itself and for text that, after surrounding
# whitespace is removed, is exactly one of true/t/yes/y/1 (any letter
# case). Everything else is false: +false+, the spellings
# false/f/no/n/0, nil, and any unrecognized text.
#
# @param value [Object] The value to parse
# @return [Boolean] The parsed boolean
def parse_boolean(value)
  return true if value == true

  # \A and \z anchor to the whole string. ^ and $ anchor to each line, so
  # multi-line text such as "no\nyes" would otherwise be read as true.
  value.to_s.strip.match?(/\A(?:true|t|yes|y|1)\z/i)
end
580
+
581
+ # Get default cache path.
582
+ #
583
+ # @return [String] The default cache path
584
+ def self.default_cache_path
585
+ Paths.cache_path
586
+ end
587
+
588
+ def self.default_config_path
589
+ Paths.config_path
590
+ end
591
+
592
+ def self.default_data_path
593
+ Paths.data_path
594
+ end
595
+
# Load UnixWords dictionary.
#
# Sources are tried in this order; the first that yields a dictionary wins:
# 1. the configured +dictionary_path+, or otherwise the path returned by
#    Dictionary::UnixWords.detect_system_dictionary
# 2. whatever Dictionary::UnixWords.detect returns
# 3. a word list in a local +dictionaries/unix_words+ directory
#
# @return [Object] the dictionary built by Dictionary::UnixWords
# @raise [DictionaryNotFoundError] if the chosen path does not exist or no
#   word list can be found at all
def load_unix_words_dictionary
  # The same keyword arguments are used by every construction path below.
  options = {
    language_code: @language,
    locale: @locale,
    case_sensitive: @case_sensitive
  }

  path = @dictionary_path || Dictionary::UnixWords.detect_system_dictionary
  if path
    raise DictionaryNotFoundError, path unless File.exist?(path)

    return Dictionary::UnixWords.new(path, **options)
  end

  detected = Dictionary::UnixWords.detect(**options)
  return detected if detected

  # Local fallbacks, resolved relative to this file's directory. The first
  # entry is the historical location and stays first so existing setups
  # resolve exactly as before; the second is the same `words` list in the
  # directory that the web2/web2a entries already use.
  # NOTE(review): the historical entry looks like a typo for the second
  # one — confirm and drop it if so.
  candidates = [
    "dictionaries/unix_words/words",
    "../../dictionaries/unix_words/words",
    "../../dictionaries/unix_words/web2",
    "../../dictionaries/unix_words/web2a"
  ].map { |relative| File.expand_path(relative, __dir__) }

  local_path = candidates.find { |candidate| File.exist?(candidate) }
  return Dictionary::UnixWords.new(local_path, **options) if local_path

  raise DictionaryNotFoundError,
        "no unix_words dictionary found; run `kotoshu setup #{@language}`"
end
642
+ # Load PlainText dictionary.
643
+ def load_plain_text_dictionary
644
+ path = @dictionary_path
645
+
646
+ raise ConfigurationError, "dictionary_path is required for plain_text type" unless path
647
+ raise DictionaryNotFoundError, path unless File.exist?(path)
648
+
649
+ Dictionary::PlainText.new(
650
+ path,
651
+ language_code: @language,
652
+ locale: @locale,
653
+ case_sensitive: @case_sensitive
654
+ )
655
+ end
656
+
657
+ # Load Custom dictionary.
658
+ def load_custom_dictionary
659
+ Dictionary::Custom.new(
660
+ words: @custom_words,
661
+ language_code: @language,
662
+ locale: @locale,
663
+ case_sensitive: @case_sensitive
664
+ )
665
+ end
666
+
# Load Hunspell dictionary.
#
# +dictionary_path+ must point at the +.dic+ word list; the affix file is
# expected next to it under the same name with an +.aff+ extension.
#
# @return [Dictionary::Hunspell] The loaded dictionary
# @raise [ConfigurationError] if +dictionary_path+ is not set, or does not
#   end in +.dic+ (the affix file name could not be derived from it)
# @raise [DictionaryNotFoundError] if the +.dic+ or +.aff+ file is missing
def load_hunspell_dictionary
  dic_path = @dictionary_path

  raise ConfigurationError, "dictionary_path is required for hunspell type" unless dic_path
  raise DictionaryNotFoundError, dic_path unless File.exist?(dic_path)

  # Replace the .dic extension with .aff. \z anchors to the real end of the
  # path; $ would also match before an embedded line break.
  aff_path = dic_path.sub(/\.dic\z/i, ".aff")

  # Without a .dic extension nothing was replaced, and the word list itself
  # would be handed over as the affix file.
  if aff_path == dic_path
    raise ConfigurationError, "dictionary_path must end in .dic for hunspell type: #{dic_path}"
  end

  raise DictionaryNotFoundError, aff_path unless File.exist?(aff_path)

  Dictionary::Hunspell.new(
    dic_path: dic_path,
    aff_path: aff_path,
    language_code: @language,
    locale: @locale
  )
end
686
+
687
+ # Load CSpell dictionary.
688
+ def load_cspell_dictionary
689
+ path = @dictionary_path
690
+
691
+ raise ConfigurationError, "dictionary_path is required for cspell type" unless path
692
+ raise DictionaryNotFoundError, path unless File.exist?(path)
693
+
694
+ Dictionary::CSpell.new(
695
+ path,
696
+ language_code: @language,
697
+ locale: @locale,
698
+ case_sensitive: @case_sensitive
699
+ )
700
+ end
701
+ end
702
+ end