kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,817 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../dictionary/hunspell"
4
+ require_relative "../dictionary/plain_text"
5
+
6
+ module Kotoshu
7
+ module Dictionaries
8
+ # Catalog of all available dictionaries from kotoshu/dictionaries repository
9
+ #
10
+ # This class provides a structured registry of all available dictionaries
11
+ # with their metadata, URLs, and license information.
12
+ #
13
+ # @example Listing all dictionaries
14
+ # catalog = Kotoshu::Dictionaries::Catalog.new
15
+ # catalog.all.each do |dict|
16
+ # puts "#{dict.code}: #{dict.name} (#{dict.word_count} words)"
17
+ # end
18
+ #
19
+ # @example Finding dictionaries by language
20
+ # catalog = Kotoshu::Dictionaries::Catalog.new
21
+ # german_dicts = catalog.by_language("de")
22
+ #
23
+ # @example Getting a specific dictionary
24
+ # catalog = Kotoshu::Dictionaries::Catalog.new
25
+ # dict = catalog.find("en-GB")
26
+ # dict.load # => Kotoshu::Dictionary::Base subclass
27
+ #
28
+ class Catalog
29
# Immutable catalog record describing a single dictionary.
#
# An entry carries the identifying code, display name, language and
# optional region, the storage format, provenance (source and license),
# a word count, the dictionary file URLs, and a free-form metadata hash.
# The entry and its metadata hash are frozen on construction.
class DictionaryEntry
  attr_reader :code, :name, :language, :region, :format,
              :source, :license, :word_count, :dic_url, :aff_url,
              :metadata

  # @param code [String] Dictionary identifier (e.g. "en-GB")
  # @param name [String] Display name
  # @param language [String] Language code
  # @param format [Symbol] :hunspell or :plain_text
  # @param source [String] Upstream source of the word list
  # @param license [String] License description
  # @param word_count [Integer] Number of words in the dictionary
  # @param dic_url [String] Location of the dictionary (.dic / word list) file
  # @param region [String, nil] Optional region or variant tag
  # @param aff_url [String, nil] Location of the affix file (needed for :hunspell)
  # @param metadata [Hash] Extra attributes; copied, so the caller's hash is
  #   neither frozen nor shared with this entry
  def initialize(code:, name:, language:, format:, source:, license:, word_count:, dic_url:, region: nil, aff_url: nil,
                 metadata: {})
    @code = code
    @name = name
    @language = language
    @region = region
    @format = format
    @source = source
    @license = license
    @word_count = word_count
    @dic_url = dic_url
    @aff_url = aff_url
    # Object#freeze is shallow: without a frozen private copy the entry
    # would still be mutable through #metadata and would alias the hash
    # owned by the caller.
    @metadata = metadata.dup.freeze
    freeze
  end

  # Build the dictionary object that corresponds to this entry's format.
  #
  # The URLs are handed to the dictionary constructors as their path
  # arguments; each constructor receives its own unfrozen copy of the
  # metadata so it may modify it without affecting this entry.
  #
  # @return [Kotoshu::Dictionary::Hunspell, Kotoshu::Dictionary::PlainText]
  # @raise [ArgumentError] if a Hunspell entry has no aff_url, or the
  #   format is neither :hunspell nor :plain_text
  def load
    case @format
    when :hunspell
      raise ArgumentError, "Missing aff_url for Hunspell dictionary" unless @aff_url

      Kotoshu::Dictionary::Hunspell.new(
        dic_path: @dic_url,
        aff_path: @aff_url,
        language_code: @code,
        metadata: @metadata.dup
      )
    when :plain_text
      Kotoshu::Dictionary::PlainText.new(
        @dic_url,
        language_code: @code,
        metadata: @metadata.dup
      )
    else
      raise ArgumentError, "Unknown format: #{@format}"
    end
  end

  # @return [String] Human-readable description, e.g. "Name (US) - 10 words"
  def description
    region_part = @region ? " (#{@region})" : ""
    "#{@name}#{region_part} - #{@word_count} words"
  end

  # @return [Boolean] true if this is a Hunspell dictionary
  def hunspell?
    @format == :hunspell
  end

  # @return [Boolean] true if this is a plain text dictionary
  def plain_text?
    @format == :plain_text
  end
end
91
+
92
+ # Base URL for kotoshu/dictionaries repository
93
+ BASE_URL = "https://raw.githubusercontent.com/kotoshu/dictionaries/main"
94
+
95
+ # All available dictionaries
96
+ ALL_DICTIONARIES = [
97
+ # Unix System Dictionaries (Plain Text)
98
+ { code: "en-US-web2", name: "Webster's Second International", language: "en", region: "US",
99
+ format: :plain_text, source: "FreeBSD", license: "Public Domain",
100
+ word_count: 235_976,
101
+ dic_url: "#{BASE_URL}/unix-words/web2.txt",
102
+ metadata: { source_file: "web2.txt", year: 1934 } },
103
+
104
+ { code: "en-US-web2a", name: "Webster's with Affix Flags", language: "en", region: "US",
105
+ format: :plain_text, source: "FreeBSD", license: "Public Domain",
106
+ word_count: 50_000,
107
+ dic_url: "#{BASE_URL}/unix-words/web2a.txt",
108
+ metadata: { source_file: "web2a.txt", has_affix_flags: true } },
109
+
110
+ { code: "en-connectives", name: "English Connectives", language: "en",
111
+ format: :plain_text, source: "FreeBSD", license: "Public Domain",
112
+ word_count: 500,
113
+ dic_url: "#{BASE_URL}/unix-words/connectives.txt",
114
+ metadata: { source_file: "connectives.txt" } },
115
+
116
+ { code: "en-propernames", name: "Proper Names", language: "en",
117
+ format: :plain_text, source: "FreeBSD", license: "Public Domain",
118
+ word_count: 2000,
119
+ dic_url: "#{BASE_URL}/unix-words/propernames.txt",
120
+ metadata: { source_file: "propernames.txt" } },
121
+
122
+ # English (Hunspell from wooorm/dictionaries)
123
+ { code: "en", name: "US English", language: "en", region: "US",
124
+ format: :hunspell, source: "SCOWL", license: "LGPL/MPL/GPL",
125
+ word_count: 500_000,
126
+ dic_url: "#{BASE_URL}/en/index.dic",
127
+ aff_url: "#{BASE_URL}/en/index.aff",
128
+ metadata: { scowl_size: "large", source: "wooorm/dictionaries" } },
129
+
130
+ { code: "en-GB", name: "British English (ise)", language: "en", region: "GB",
131
+ format: :hunspell, source: "SCOWL", license: "LGPL/MPL/GPL",
132
+ word_count: 450_000,
133
+ dic_url: "#{BASE_URL}/en-GB/index.dic",
134
+ aff_url: "#{BASE_URL}/en-GB/index.aff",
135
+ metadata: { spelling_variant: "ise", source: "wooorm/dictionaries" } },
136
+
137
+ { code: "en-CA", name: "Canadian English", language: "en", region: "CA",
138
+ format: :hunspell, source: "SCOWL", license: "LGPL/MPL/GPL",
139
+ word_count: 300_000,
140
+ dic_url: "#{BASE_URL}/en-CA/index.dic",
141
+ aff_url: "#{BASE_URL}/en-CA/index.aff",
142
+ metadata: { source: "wooorm/dictionaries" } },
143
+
144
+ { code: "en-AU", name: "Australian English", language: "en", region: "AU",
145
+ format: :hunspell, source: "SCOWL", license: "LGPL/MPL/GPL",
146
+ word_count: 250_000,
147
+ dic_url: "#{BASE_URL}/en-AU/index.dic",
148
+ aff_url: "#{BASE_URL}/en-AU/index.aff",
149
+ metadata: { source: "wooorm/dictionaries" } },
150
+
151
+ { code: "en-ZA", name: "South African English", language: "en", region: "ZA",
152
+ format: :hunspell, source: "SCOWL", license: "LGPL/MPL/GPL",
153
+ word_count: 200_000,
154
+ dic_url: "#{BASE_URL}/en-ZA/index.dic",
155
+ aff_url: "#{BASE_URL}/en-ZA/index.aff",
156
+ metadata: { source: "wooorm/dictionaries" } },
157
+
158
+ # German
159
+ { code: "de", name: "German", language: "de",
160
+ format: :hunspell, source: "igerman98", license: "GPL",
161
+ word_count: 300_000,
162
+ dic_url: "#{BASE_URL}/de/index.dic",
163
+ aff_url: "#{BASE_URL}/de/index.aff",
164
+ metadata: { source: "wooorm/dictionaries" } },
165
+
166
+ { code: "de-AT", name: "German (Austria)", language: "de", region: "AT",
167
+ format: :hunspell, source: "igerman98", license: "GPL",
168
+ word_count: 200_000,
169
+ dic_url: "#{BASE_URL}/de-AT/index.dic",
170
+ aff_url: "#{BASE_URL}/de-AT/index.aff",
171
+ metadata: { source: "wooorm/dictionaries" } },
172
+
173
+ { code: "de-CH", name: "German (Switzerland)", language: "de", region: "CH",
174
+ format: :hunspell, source: "igerman98", license: "GPL",
175
+ word_count: 200_000,
176
+ dic_url: "#{BASE_URL}/de-CH/index.dic",
177
+ aff_url: "#{BASE_URL}/de-CH/index.aff",
178
+ metadata: { source: "wooorm/dictionaries" } },
179
+
180
+ { code: "de-DE", name: "German (Germany)", language: "de", region: "DE",
181
+ format: :hunspell, source: "igerman98", license: "GPL",
182
+ word_count: 300_000,
183
+ dic_url: "#{BASE_URL}/de-DE/index.dic",
184
+ aff_url: "#{BASE_URL}/de-DE/index.aff",
185
+ metadata: { source: "wooorm/dictionaries" } },
186
+
187
+ # Spanish
188
+ { code: "es", name: "Spanish", language: "es",
189
+ format: :hunspell, source: "LibreOffice", license: "GPL",
190
+ word_count: 500_000,
191
+ dic_url: "#{BASE_URL}/es/index.dic",
192
+ aff_url: "#{BASE_URL}/es/index.aff",
193
+ metadata: { source: "wooorm/dictionaries", regional_variants: 21 } },
194
+
195
+ { code: "es-AR", name: "Spanish (Argentina)", language: "es", region: "AR",
196
+ format: :hunspell, source: "LibreOffice", license: "GPL",
197
+ word_count: 400_000,
198
+ dic_url: "#{BASE_URL}/es-AR/index.dic",
199
+ aff_url: "#{BASE_URL}/es-AR/index.aff",
200
+ metadata: { source: "wooorm/dictionaries" } },
201
+
202
+ { code: "es-MX", name: "Spanish (Mexico)", language: "es", region: "MX",
203
+ format: :hunspell, source: "LibreOffice", license: "GPL",
204
+ word_count: 400_000,
205
+ dic_url: "#{BASE_URL}/es-MX/index.dic",
206
+ aff_url: "#{BASE_URL}/es-MX/index.aff",
207
+ metadata: { source: "wooorm/dictionaries" } },
208
+
209
+ # French
210
+ { code: "fr", name: "French", language: "fr",
211
+ format: :hunspell, source: "Grammalecte", license: "MPL 2.0",
212
+ word_count: 200_000,
213
+ dic_url: "#{BASE_URL}/fr/index.dic",
214
+ aff_url: "#{BASE_URL}/fr/index.aff",
215
+ metadata: { source: "wooorm/dictionaries" } },
216
+
217
+ { code: "fr-FR", name: "French (France)", language: "fr", region: "FR",
218
+ format: :hunspell, source: "Grammalecte", license: "MPL 2.0",
219
+ word_count: 200_000,
220
+ dic_url: "#{BASE_URL}/fr-FR/index.dic",
221
+ aff_url: "#{BASE_URL}/fr-FR/index.aff",
222
+ metadata: { source: "wooorm/dictionaries" } },
223
+
224
+ # Italian
225
+ { code: "it", name: "Italian", language: "it",
226
+ format: :hunspell, source: "LibreOffice", license: "GPL 3",
227
+ word_count: 500_000,
228
+ dic_url: "#{BASE_URL}/it/index.dic",
229
+ aff_url: "#{BASE_URL}/it/index.aff",
230
+ metadata: { source: "wooorm/dictionaries" } },
231
+
232
+ # Portuguese
233
+ { code: "pt", name: "Portuguese", language: "pt",
234
+ format: :hunspell, source: "LibreOffice", license: "LGPLv3/MPL",
235
+ word_count: 400_000,
236
+ dic_url: "#{BASE_URL}/pt/index.dic",
237
+ aff_url: "#{BASE_URL}/pt/index.aff",
238
+ metadata: { source: "wooorm/dictionaries" } },
239
+
240
+ # Russian
241
+ { code: "ru", name: "Russian", language: "ru",
242
+ format: :hunspell, source: "Alexander Lebedev", license: "BSD-style",
243
+ word_count: 800_000,
244
+ dic_url: "#{BASE_URL}/ru/index.dic",
245
+ aff_url: "#{BASE_URL}/ru/index.aff",
246
+ metadata: { source: "wooorm/dictionaries" } },
247
+
248
+ # Dutch
249
+ { code: "nl", name: "Dutch", language: "nl",
250
+ format: :hunspell, source: "OpenTaal", license: "Revised BSD + CC BY 3.0",
251
+ word_count: 400_000,
252
+ dic_url: "#{BASE_URL}/nl/index.dic",
253
+ aff_url: "#{BASE_URL}/nl/index.aff",
254
+ metadata: { source: "wooorm/dictionaries" } },
255
+
256
+ # Polish
257
+ { code: "pl", name: "Polish", language: "pl",
258
+ format: :hunspell, source: "Polish Native Lang Project", license: "GPL/LGPL/MPL/CC",
259
+ word_count: 600_000,
260
+ dic_url: "#{BASE_URL}/pl/index.dic",
261
+ aff_url: "#{BASE_URL}/pl/index.aff",
262
+ metadata: { source: "wooorm/dictionaries" } },
263
+
264
+ # Additional European languages
265
+ { code: "cs", name: "Czech", language: "cs",
266
+ format: :hunspell, source: "LibreOffice", license: "GPL",
267
+ word_count: 400_000,
268
+ dic_url: "#{BASE_URL}/cs/index.dic",
269
+ aff_url: "#{BASE_URL}/cs/index.aff",
270
+ metadata: { source: "wooorm/dictionaries" } },
271
+
272
+ { code: "sk", name: "Slovak", language: "sk",
273
+ format: :hunspell, source: "LibreOffice", license: "GPL",
274
+ word_count: 300_000,
275
+ dic_url: "#{BASE_URL}/sk/index.dic",
276
+ aff_url: "#{BASE_URL}/sk/index.aff",
277
+ metadata: { source: "wooorm/dictionaries" } },
278
+
279
+ { code: "hr", name: "Croatian", language: "hr",
280
+ format: :hunspell, source: "LibreOffice", license: "GPL",
281
+ word_count: 300_000,
282
+ dic_url: "#{BASE_URL}/hr/index.dic",
283
+ aff_url: "#{BASE_URL}/hr/index.aff",
284
+ metadata: { source: "wooorm/dictionaries" } },
285
+
286
+ { code: "sr", name: "Serbian (Cyrillic)", language: "sr",
287
+ format: :hunspell, source: "LibreOffice", license: "GPL",
288
+ word_count: 400_000,
289
+ dic_url: "#{BASE_URL}/sr/index.dic",
290
+ aff_url: "#{BASE_URL}/sr/index.aff",
291
+ metadata: { source: "wooorm/dictionaries", script: "Cyrillic" } },
292
+
293
+ { code: "sr-Latn", name: "Serbian (Latin)", language: "sr", region: "Latn",
294
+ format: :hunspell, source: "LibreOffice", license: "GPL",
295
+ word_count: 400_000,
296
+ dic_url: "#{BASE_URL}/sr-Latn/index.dic",
297
+ aff_url: "#{BASE_URL}/sr-Latn/index.aff",
298
+ metadata: { source: "wooorm/dictionaries", script: "Latin" } },
299
+
300
+ { code: "sl", name: "Slovenian", language: "sl",
301
+ format: :hunspell, source: "LibreOffice", license: "GPL",
302
+ word_count: 200_000,
303
+ dic_url: "#{BASE_URL}/sl/index.dic",
304
+ aff_url: "#{BASE_URL}/sl/index.aff",
305
+ metadata: { source: "wooorm/dictionaries" } },
306
+
307
+ # Baltic languages
308
+ { code: "lt", name: "Lithuanian", language: "lt",
309
+ format: :hunspell, source: "LibreOffice", license: "GPL",
310
+ word_count: 200_000,
311
+ dic_url: "#{BASE_URL}/lt/index.dic",
312
+ aff_url: "#{BASE_URL}/lt/index.aff",
313
+ metadata: { source: "wooorm/dictionaries" } },
314
+
315
+ { code: "lv", name: "Latvian", language: "lv",
316
+ format: :hunspell, source: "LibreOffice", license: "GPL",
317
+ word_count: 250_000,
318
+ dic_url: "#{BASE_URL}/lv/index.dic",
319
+ aff_url: "#{BASE_URL}/lv/index.aff",
320
+ metadata: { source: "wooorm/dictionaries" } },
321
+
322
+ { code: "et", name: "Estonian", language: "et",
323
+ format: :hunspell, source: "LibreOffice", license: "GPL",
324
+ word_count: 200_000,
325
+ dic_url: "#{BASE_URL}/et/index.dic",
326
+ aff_url: "#{BASE_URL}/et/index.aff",
327
+ metadata: { source: "wooorm/dictionaries" } },
328
+
329
+ # Nordic languages
330
+ { code: "da", name: "Danish", language: "da",
331
+ format: :hunspell, source: "LibreOffice", license: "GPL",
332
+ word_count: 300_000,
333
+ dic_url: "#{BASE_URL}/da/index.dic",
334
+ aff_url: "#{BASE_URL}/da/index.aff",
335
+ metadata: { source: "wooorm/dictionaries" } },
336
+
337
+ { code: "sv", name: "Swedish", language: "sv",
338
+ format: :hunspell, source: "LibreOffice", license: "GPL",
339
+ word_count: 350_000,
340
+ dic_url: "#{BASE_URL}/sv/index.dic",
341
+ aff_url: "#{BASE_URL}/sv/index.aff",
342
+ metadata: { source: "wooorm/dictionaries" } },
343
+
344
+ { code: "sv-FI", name: "Swedish (Finland)", language: "sv", region: "FI",
345
+ format: :hunspell, source: "LibreOffice", license: "GPL",
346
+ word_count: 300_000,
347
+ dic_url: "#{BASE_URL}/sv-FI/index.dic",
348
+ aff_url: "#{BASE_URL}/sv-FI/index.aff",
349
+ metadata: { source: "wooorm/dictionaries" } },
350
+
351
+ { code: "nb", name: "Norwegian (Bokmål)", language: "nb",
352
+ format: :hunspell, source: "LibreOffice", license: "GPL",
353
+ word_count: 300_000,
354
+ dic_url: "#{BASE_URL}/nb/index.dic",
355
+ aff_url: "#{BASE_URL}/nb/index.aff",
356
+ metadata: { source: "wooorm/dictionaries" } },
357
+
358
+ { code: "nn", name: "Norwegian (Nynorsk)", language: "nn",
359
+ format: :hunspell, source: "LibreOffice", license: "GPL",
360
+ word_count: 250_000,
361
+ dic_url: "#{BASE_URL}/nn/index.dic",
362
+ aff_url: "#{BASE_URL}/nn/index.aff",
363
+ metadata: { source: "wooorm/dictionaries" } },
364
+
365
+ { code: "fi", name: "Finnish", language: "fi",
366
+ format: :hunspell, source: "LibreOffice", license: "GPL",
367
+ word_count: 400_000,
368
+ dic_url: "#{BASE_URL}/fi/index.dic",
369
+ aff_url: "#{BASE_URL}/fi/index.aff",
370
+ metadata: { source: "wooorm/dictionaries" } },
371
+
372
+ { code: "is", name: "Icelandic", language: "is",
373
+ format: :hunspell, source: "LibreOffice", license: "GPL",
374
+ word_count: 200_000,
375
+ dic_url: "#{BASE_URL}/is/index.dic",
376
+ aff_url: "#{BASE_URL}/is/index.aff",
377
+ metadata: { source: "wooorm/dictionaries" } },
378
+
379
+ { code: "fo", name: "Faroese", language: "fo",
380
+ format: :hunspell, source: "LibreOffice", license: "GPL",
381
+ word_count: 100_000,
382
+ dic_url: "#{BASE_URL}/fo/index.dic",
383
+ aff_url: "#{BASE_URL}/fo/index.aff",
384
+ metadata: { source: "wooorm/dictionaries" } },
385
+
386
+ # Celtic languages
387
+ { code: "ga", name: "Irish", language: "ga",
388
+ format: :hunspell, source: "LibreOffice", license: "GPL",
389
+ word_count: 50_000,
390
+ dic_url: "#{BASE_URL}/ga/index.dic",
391
+ aff_url: "#{BASE_URL}/ga/index.aff",
392
+ metadata: { source: "wooorm/dictionaries" } },
393
+
394
+ { code: "gd", name: "Scottish Gaelic", language: "gd",
395
+ format: :hunspell, source: "LibreOffice", license: "GPL",
396
+ word_count: 50_000,
397
+ dic_url: "#{BASE_URL}/gd/index.dic",
398
+ aff_url: "#{BASE_URL}/gd/index.aff",
399
+ metadata: { source: "wooorm/dictionaries" } },
400
+
401
+ { code: "cy", name: "Welsh", language: "cy",
402
+ format: :hunspell, source: "LibreOffice", license: "GPL",
403
+ word_count: 100_000,
404
+ dic_url: "#{BASE_URL}/cy/index.dic",
405
+ aff_url: "#{BASE_URL}/cy/index.aff",
406
+ metadata: { source: "wooorm/dictionaries" } },
407
+
408
+ { code: "br", name: "Breton", language: "br",
409
+ format: :hunspell, source: "LibreOffice", license: "GPL",
410
+ word_count: 50_000,
411
+ dic_url: "#{BASE_URL}/br/index.dic",
412
+ aff_url: "#{BASE_URL}/br/index.aff",
413
+ metadata: { source: "wooorm/dictionaries" } },
414
+
415
+ { code: "gv", name: "Manx", language: "gv",
416
+ format: :hunspell, source: "LibreOffice", license: "GPL",
417
+ word_count: 30_000,
418
+ dic_url: "#{BASE_URL}/gv/index.dic",
419
+ aff_url: "#{BASE_URL}/gv/index.aff",
420
+ metadata: { source: "wooorm/dictionaries" } },
421
+
422
+ # Other European languages
423
+ { code: "el", name: "Greek", language: "el",
424
+ format: :hunspell, source: "LibreOffice", license: "GPL",
425
+ word_count: 400_000,
426
+ dic_url: "#{BASE_URL}/el/index.dic",
427
+ aff_url: "#{BASE_URL}/el/index.aff",
428
+ metadata: { source: "wooorm/dictionaries" } },
429
+
430
+ { code: "el-polyton", name: "Greek (Polytonic)", language: "el", region: "polyton",
431
+ format: :hunspell, source: "LibreOffice", license: "GPL",
432
+ word_count: 100_000,
433
+ dic_url: "#{BASE_URL}/el-polyton/index.dic",
434
+ aff_url: "#{BASE_URL}/el-polyton/index.aff",
435
+ metadata: { source: "wooorm/dictionaries" } },
436
+
437
+ { code: "tr", name: "Turkish", language: "tr",
438
+ format: :hunspell, source: "LibreOffice", license: "GPL",
439
+ word_count: 400_000,
440
+ dic_url: "#{BASE_URL}/tr/index.dic",
441
+ aff_url: "#{BASE_URL}/tr/index.aff",
442
+ metadata: { source: "wooorm/dictionaries" } },
443
+
444
+ { code: "hu", name: "Hungarian", language: "hu",
445
+ format: :hunspell, source: "LibreOffice", license: "GPL",
446
+ word_count: 400_000,
447
+ dic_url: "#{BASE_URL}/hu/index.dic",
448
+ aff_url: "#{BASE_URL}/hu/index.aff",
449
+ metadata: { source: "wooorm/dictionaries" } },
450
+
451
+ { code: "ro", name: "Romanian", language: "ro",
452
+ format: :hunspell, source: "LibreOffice", license: "GPL",
453
+ word_count: 300_000,
454
+ dic_url: "#{BASE_URL}/ro/index.dic",
455
+ aff_url: "#{BASE_URL}/ro/index.aff",
456
+ metadata: { source: "wooorm/dictionaries" } },
457
+
458
+ { code: "bg", name: "Bulgarian", language: "bg",
459
+ format: :hunspell, source: "LibreOffice", license: "GPL",
460
+ word_count: 300_000,
461
+ dic_url: "#{BASE_URL}/bg/index.dic",
462
+ aff_url: "#{BASE_URL}/bg/index.aff",
463
+ metadata: { source: "wooorm/dictionaries" } },
464
+
465
+ { code: "be", name: "Belarusian", language: "be",
466
+ format: :hunspell, source: "LibreOffice", license: "GPL",
467
+ word_count: 200_000,
468
+ dic_url: "#{BASE_URL}/be/index.dic",
469
+ aff_url: "#{BASE_URL}/be/index.aff",
470
+ metadata: { source: "wooorm/dictionaries" } },
471
+
472
+ { code: "uk", name: "Ukrainian", language: "uk",
473
+ format: :hunspell, source: "LibreOffice", license: "GPL",
474
+ word_count: 400_000,
475
+ dic_url: "#{BASE_URL}/uk/index.dic",
476
+ aff_url: "#{BASE_URL}/uk/index.aff",
477
+ metadata: { source: "wooorm/dictionaries" } },
478
+
479
+ # Regional languages
480
+ { code: "ca", name: "Catalan", language: "ca",
481
+ format: :hunspell, source: "LibreOffice", license: "GPL",
482
+ word_count: 400_000,
483
+ dic_url: "#{BASE_URL}/ca/index.dic",
484
+ aff_url: "#{BASE_URL}/ca/index.aff",
485
+ metadata: { source: "wooorm/dictionaries" } },
486
+
487
+ { code: "ca-valencia", name: "Catalan (Valencia)", language: "ca", region: "valencia",
488
+ format: :hunspell, source: "LibreOffice", license: "GPL",
489
+ word_count: 400_000,
490
+ dic_url: "#{BASE_URL}/ca-valencia/index.dic",
491
+ aff_url: "#{BASE_URL}/ca-valencia/index.aff",
492
+ metadata: { source: "wooorm/dictionaries" } },
493
+
494
+ { code: "gl", name: "Galician", language: "gl",
495
+ format: :hunspell, source: "LibreOffice", license: "GPL",
496
+ word_count: 300_000,
497
+ dic_url: "#{BASE_URL}/gl/index.dic",
498
+ aff_url: "#{BASE_URL}/gl/index.aff",
499
+ metadata: { source: "wooorm/dictionaries" } },
500
+
501
+ { code: "eu", name: "Basque", language: "eu",
502
+ format: :hunspell, source: "LibreOffice", license: "GPL",
503
+ word_count: 200_000,
504
+ dic_url: "#{BASE_URL}/eu/index.dic",
505
+ aff_url: "#{BASE_URL}/eu/index.aff",
506
+ metadata: { source: "wooorm/dictionaries" } },
507
+
508
+ { code: "lb", name: "Luxembourgish", language: "lb",
509
+ format: :hunspell, source: "LibreOffice", license: "GPL",
510
+ word_count: 100_000,
511
+ dic_url: "#{BASE_URL}/lb/index.dic",
512
+ aff_url: "#{BASE_URL}/lb/index.aff",
513
+ metadata: { source: "wooorm/dictionaries" } },
514
+
515
+ { code: "li", name: "Limburgish", language: "li",
516
+ format: :hunspell, source: "LibreOffice", license: "GPL",
517
+ word_count: 50_000,
518
+ dic_url: "#{BASE_URL}/li/index.dic",
519
+ aff_url: "#{BASE_URL}/li/index.aff",
520
+ metadata: { source: "wooorm/dictionaries" } },
521
+
522
+ { code: "fy", name: "Western Frisian", language: "fy",
523
+ format: :hunspell, source: "LibreOffice", license: "GPL",
524
+ word_count: 100_000,
525
+ dic_url: "#{BASE_URL}/fy/index.dic",
526
+ aff_url: "#{BASE_URL}/fy/index.aff",
527
+ metadata: { source: "wooorm/dictionaries" } },
528
+
529
+ { code: "ltg", name: "Latgalian", language: "ltg",
530
+ format: :hunspell, source: "LibreOffice", license: "GPL",
531
+ word_count: 50_000,
532
+ dic_url: "#{BASE_URL}/ltg/index.dic",
533
+ aff_url: "#{BASE_URL}/ltg/index.aff",
534
+ metadata: { source: "wooorm/dictionaries" } },
535
+
536
+ { code: "ku", name: "Kurdish", language: "ku",
537
+ format: :hunspell, source: "LibreOffice", license: "GPL",
538
+ word_count: 50_000,
539
+ dic_url: "#{BASE_URL}/ku/index.dic",
540
+ aff_url: "#{BASE_URL}/ku/index.aff",
541
+ metadata: { source: "wooorm/dictionaries" } },
542
+
543
+ # Caucasian and Middle Eastern languages
544
+ { code: "hy", name: "Armenian", language: "hy",
545
+ format: :hunspell, source: "LibreOffice", license: "GPL",
546
+ word_count: 200_000,
547
+ dic_url: "#{BASE_URL}/hy/index.dic",
548
+ aff_url: "#{BASE_URL}/hy/index.aff",
549
+ metadata: { source: "wooorm/dictionaries" } },
550
+
551
+ { code: "hyw", name: "Western Armenian", language: "hy", region: "western",
552
+ format: :hunspell, source: "LibreOffice", license: "GPL",
553
+ word_count: 100_000,
554
+ dic_url: "#{BASE_URL}/hyw/index.dic",
555
+ aff_url: "#{BASE_URL}/hyw/index.aff",
556
+ metadata: { source: "wooorm/dictionaries" } },
557
+
558
+ { code: "ka", name: "Georgian", language: "ka",
559
+ format: :hunspell, source: "LibreOffice", license: "GPL",
560
+ word_count: 200_000,
561
+ dic_url: "#{BASE_URL}/ka/index.dic",
562
+ aff_url: "#{BASE_URL}/ka/index.aff",
563
+ metadata: { source: "wooorm/dictionaries" } },
564
+
565
+ { code: "he", name: "Hebrew", language: "he",
566
+ format: :hunspell, source: "LibreOffice", license: "GPL",
567
+ word_count: 300_000,
568
+ dic_url: "#{BASE_URL}/he/index.dic",
569
+ aff_url: "#{BASE_URL}/he/index.aff",
570
+ metadata: { source: "wooorm/dictionaries" } },
571
+
572
+ { code: "fa", name: "Persian", language: "fa",
573
+ format: :hunspell, source: "LibreOffice", license: "GPL",
574
+ word_count: 200_000,
575
+ dic_url: "#{BASE_URL}/fa/index.dic",
576
+ aff_url: "#{BASE_URL}/fa/index.aff",
577
+ metadata: { source: "wooorm/dictionaries" } },
578
+
579
+ # Asian languages
580
+ { code: "ko", name: "Korean", language: "ko",
581
+ format: :hunspell, source: "LibreOffice", license: "GPL",
582
+ word_count: 500_000,
583
+ dic_url: "#{BASE_URL}/ko/index.dic",
584
+ aff_url: "#{BASE_URL}/ko/index.aff",
585
+ metadata: { source: "wooorm/dictionaries" } },
586
+
587
+ { code: "vi", name: "Vietnamese", language: "vi",
588
+ format: :hunspell, source: "LibreOffice", license: "GPL",
589
+ word_count: 100_000,
590
+ dic_url: "#{BASE_URL}/vi/index.dic",
591
+ aff_url: "#{BASE_URL}/vi/index.aff",
592
+ metadata: { source: "wooorm/dictionaries" } },
593
+
594
+ # Constructed languages
595
+ { code: "eo", name: "Esperanto", language: "eo",
596
+ format: :hunspell, source: "LibreOffice", license: "GPL",
597
+ word_count: 100_000,
598
+ dic_url: "#{BASE_URL}/eo/index.dic",
599
+ aff_url: "#{BASE_URL}/eo/index.aff",
600
+ metadata: { source: "wooorm/dictionaries" } },
601
+
602
+ { code: "ia", name: "Interlingua", language: "ia",
603
+ format: :hunspell, source: "LibreOffice", license: "GPL",
604
+ word_count: 50_000,
605
+ dic_url: "#{BASE_URL}/ia/index.dic",
606
+ aff_url: "#{BASE_URL}/ia/index.aff",
607
+ metadata: { source: "wooorm/dictionaries" } },
608
+
609
+ # Additional Spanish regional variants
610
+ { code: "es-BO", name: "Spanish (Bolivia)", language: "es", region: "BO",
611
+ format: :hunspell, source: "LibreOffice", license: "GPL",
612
+ word_count: 350_000,
613
+ dic_url: "#{BASE_URL}/es-BO/index.dic",
614
+ aff_url: "#{BASE_URL}/es-BO/index.aff",
615
+ metadata: { source: "wooorm/dictionaries" } },
616
+
617
+ { code: "es-CO", name: "Spanish (Colombia)", language: "es", region: "CO",
618
+ format: :hunspell, source: "LibreOffice", license: "GPL",
619
+ word_count: 350_000,
620
+ dic_url: "#{BASE_URL}/es-CO/index.dic",
621
+ aff_url: "#{BASE_URL}/es-CO/index.aff",
622
+ metadata: { source: "wooorm/dictionaries" } },
623
+
624
+ { code: "es-CR", name: "Spanish (Costa Rica)", language: "es", region: "CR",
625
+ format: :hunspell, source: "LibreOffice", license: "GPL",
626
+ word_count: 350_000,
627
+ dic_url: "#{BASE_URL}/es-CR/index.dic",
628
+ aff_url: "#{BASE_URL}/es-CR/index.aff",
629
+ metadata: { source: "wooorm/dictionaries" } },
630
+
631
+ { code: "es-CU", name: "Spanish (Cuba)", language: "es", region: "CU",
632
+ format: :hunspell, source: "LibreOffice", license: "GPL",
633
+ word_count: 350_000,
634
+ dic_url: "#{BASE_URL}/es-CU/index.dic",
635
+ aff_url: "#{BASE_URL}/es-CU/index.aff",
636
+ metadata: { source: "wooorm/dictionaries" } },
637
+
638
+ { code: "es-DO", name: "Spanish (Dominican Republic)", language: "es", region: "DO",
639
+ format: :hunspell, source: "LibreOffice", license: "GPL",
640
+ word_count: 350_000,
641
+ dic_url: "#{BASE_URL}/es-DO/index.dic",
642
+ aff_url: "#{BASE_URL}/es-DO/index.aff",
643
+ metadata: { source: "wooorm/dictionaries" } },
644
+
645
+ { code: "es-EC", name: "Spanish (Ecuador)", language: "es", region: "EC",
646
+ format: :hunspell, source: "LibreOffice", license: "GPL",
647
+ word_count: 350_000,
648
+ dic_url: "#{BASE_URL}/es-EC/index.dic",
649
+ aff_url: "#{BASE_URL}/es-EC/index.aff",
650
+ metadata: { source: "wooorm/dictionaries" } },
651
+
652
+ { code: "es-GT", name: "Spanish (Guatemala)", language: "es", region: "GT",
653
+ format: :hunspell, source: "LibreOffice", license: "GPL",
654
+ word_count: 350_000,
655
+ dic_url: "#{BASE_URL}/es-GT/index.dic",
656
+ aff_url: "#{BASE_URL}/es-GT/index.aff",
657
+ metadata: { source: "wooorm/dictionaries" } },
658
+
659
+ { code: "es-HN", name: "Spanish (Honduras)", language: "es", region: "HN",
660
+ format: :hunspell, source: "LibreOffice", license: "GPL",
661
+ word_count: 350_000,
662
+ dic_url: "#{BASE_URL}/es-HN/index.dic",
663
+ aff_url: "#{BASE_URL}/es-HN/index.aff",
664
+ metadata: { source: "wooorm/dictionaries" } },
665
+
666
+ { code: "es-NI", name: "Spanish (Nicaragua)", language: "es", region: "NI",
667
+ format: :hunspell, source: "LibreOffice", license: "GPL",
668
+ word_count: 350_000,
669
+ dic_url: "#{BASE_URL}/es-NI/index.dic",
670
+ aff_url: "#{BASE_URL}/es-NI/index.aff",
671
+ metadata: { source: "wooorm/dictionaries" } },
672
+
673
+ { code: "es-PA", name: "Spanish (Panama)", language: "es", region: "PA",
674
+ format: :hunspell, source: "LibreOffice", license: "GPL",
675
+ word_count: 350_000,
676
+ dic_url: "#{BASE_URL}/es-PA/index.dic",
677
+ aff_url: "#{BASE_URL}/es-PA/index.aff",
678
+ metadata: { source: "wooorm/dictionaries" } },
679
+
680
+ { code: "es-PE", name: "Spanish (Peru)", language: "es", region: "PE",
681
+ format: :hunspell, source: "LibreOffice", license: "GPL",
682
+ word_count: 350_000,
683
+ dic_url: "#{BASE_URL}/es-PE/index.dic",
684
+ aff_url: "#{BASE_URL}/es-PE/index.aff",
685
+ metadata: { source: "wooorm/dictionaries" } },
686
+
687
+ { code: "es-PH", name: "Spanish (Philippines)", language: "es", region: "PH",
688
+ format: :hunspell, source: "LibreOffice", license: "GPL",
689
+ word_count: 350_000,
690
+ dic_url: "#{BASE_URL}/es-PH/index.dic",
691
+ aff_url: "#{BASE_URL}/es-PH/index.aff",
692
+ metadata: { source: "wooorm/dictionaries" } },
693
+
694
+ { code: "es-PR", name: "Spanish (Puerto Rico)", language: "es", region: "PR",
695
+ format: :hunspell, source: "LibreOffice", license: "GPL",
696
+ word_count: 350_000,
697
+ dic_url: "#{BASE_URL}/es-PR/index.dic",
698
+ aff_url: "#{BASE_URL}/es-PR/index.aff",
699
+ metadata: { source: "wooorm/dictionaries" } },
700
+
701
+ { code: "es-PY", name: "Spanish (Paraguay)", language: "es", region: "PY",
702
+ format: :hunspell, source: "LibreOffice", license: "GPL",
703
+ word_count: 350_000,
704
+ dic_url: "#{BASE_URL}/es-PY/index.dic",
705
+ aff_url: "#{BASE_URL}/es-PY/index.aff",
706
+ metadata: { source: "wooorm/dictionaries" } },
707
+
708
+ { code: "es-SV", name: "Spanish (El Salvador)", language: "es", region: "SV",
709
+ format: :hunspell, source: "LibreOffice", license: "GPL",
710
+ word_count: 350_000,
711
+ dic_url: "#{BASE_URL}/es-SV/index.dic",
712
+ aff_url: "#{BASE_URL}/es-SV/index.aff",
713
+ metadata: { source: "wooorm/dictionaries" } },
714
+
715
+ { code: "es-US", name: "Spanish (United States)", language: "es", region: "US",
716
+ format: :hunspell, source: "LibreOffice", license: "GPL",
717
+ word_count: 350_000,
718
+ dic_url: "#{BASE_URL}/es-US/index.dic",
719
+ aff_url: "#{BASE_URL}/es-US/index.aff",
720
+ metadata: { source: "wooorm/dictionaries" } },
721
+
722
+ { code: "es-UY", name: "Spanish (Uruguay)", language: "es", region: "UY",
723
+ format: :hunspell, source: "LibreOffice", license: "GPL",
724
+ word_count: 350_000,
725
+ dic_url: "#{BASE_URL}/es-UY/index.dic",
726
+ aff_url: "#{BASE_URL}/es-UY/index.aff",
727
+ metadata: { source: "wooorm/dictionaries" } },
728
+
729
+ { code: "es-VE", name: "Spanish (Venezuela)", language: "es", region: "VE",
730
+ format: :hunspell, source: "LibreOffice", license: "GPL",
731
+ word_count: 350_000,
732
+ dic_url: "#{BASE_URL}/es-VE/index.dic",
733
+ aff_url: "#{BASE_URL}/es-VE/index.aff",
734
+ metadata: { source: "wooorm/dictionaries" } }
735
+ ].freeze
736
+
737
# Build the catalog once at load time: each raw attribute hash in
# ALL_DICTIONARIES becomes a DictionaryEntry, and the resulting list is frozen.
@entries = ALL_DICTIONARIES.map { |attributes| DictionaryEntry.new(**attributes) }.freeze
741
+
742
# List every entry in the catalog.
#
# @return [Array<DictionaryEntry>] All dictionary entries
def self.all
  @entries
end
746
+
747
# Look up a single dictionary by its code.
#
# Underscores in the given code are treated as hyphens and the comparison
# ignores case, so "en-GB", "en_gb" and :en_GB all address the same entry.
#
# @param code [String, Symbol] Dictionary code (e.g., "en-GB", :en_GB)
# @return [DictionaryEntry, nil] The dictionary entry or nil if not found
def self.find(code)
  wanted = code.to_s.tr("_", "-")
  all.find do |entry|
    entry.code.casecmp(wanted).zero?
  end
end
754
+
755
# Select every dictionary whose language matches the given code.
#
# The argument is downcased before it is compared with each entry's language.
#
# @param lang [String, Symbol] Language code (e.g., "en", :de)
# @return [Array<DictionaryEntry>] Dictionaries for the language
def self.by_language(lang)
  wanted = lang.to_s.downcase
  all.select do |entry|
    entry.language == wanted
  end
end
762
+
763
# Find dictionaries by format.
#
# Formats are compared by name, so a Symbol and a String naming the same
# format are interchangeable (:hunspell and "hunspell" select the same
# entries). This mirrors the other finders in this catalog, which also accept
# either a String or a Symbol.
#
# @param format [Symbol, String] Format type (:hunspell or :plain_text)
# @return [Array<DictionaryEntry>] Dictionaries with the format
def self.by_format(format)
  wanted = format.to_s
  all.select { |entry| entry.format.to_s == wanted }
end
769
+
770
# Select every dictionary whose license text contains the given string.
#
# This is a case-sensitive substring match against each entry's license.
#
# @param license [String, Symbol] License type (e.g., "GPL", "Public Domain")
# @return [Array<DictionaryEntry>] Dictionaries with the license
def self.by_license(license)
  needle = license.to_s
  all.select do |entry|
    entry.license.include?(needle)
  end
end
777
+
778
# Shortcut for the entries whose format is :hunspell.
#
# @return [Array<DictionaryEntry>] All Hunspell dictionaries
def self.hunspell
  by_format(:hunspell)
end
783
+
784
# Shortcut for the entries whose format is :plain_text.
#
# @return [Array<DictionaryEntry>] All plain text dictionaries
def self.plain_text
  by_format(:plain_text)
end
789
+
790
# Summarise the catalog.
#
# The returned hash has these keys:
# - :total       number of entries
# - :hunspell    number of Hunspell entries
# - :plain_text  number of plain text entries
# - :languages   number of distinct language codes
# - :total_words sum of every entry's word_count
# - :formats     entry count per format
# - :licenses    entry count per first word of the license string
#
# @return [Hash] Statistics hash
def self.statistics
  entries = all
  # Turns a { key => [entries] } grouping into { key => count }.
  count_groups = ->(groups) { groups.transform_values(&:size) }

  {
    total: entries.size,
    hunspell: hunspell.size,
    plain_text: plain_text.size,
    languages: entries.map(&:language).uniq.size,
    total_words: entries.sum(&:word_count),
    formats: count_groups.call(entries.group_by(&:format)),
    licenses: count_groups.call(entries.group_by { |entry| entry.license.split.first })
  }
end
803
+
804
# List the distinct language codes present in the catalog, sorted.
#
# @return [Array<String>] Unique language codes
def self.languages
  one_per_language = all.uniq(&:language)
  one_per_language.map(&:language).sort
end
809
+
810
# List the distinct license strings present in the catalog, in the order
# they first appear.
#
# @return [Array<String>] Unique license types
def self.licenses
  one_per_license = all.uniq(&:license)
  one_per_license.map(&:license)
end
815
+ end
816
+ end
817
+ end