kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,278 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Example 7: Multi-Language Dictionaries
#
# This example demonstrates how to use Kotoshu with multiple languages
# by loading dictionaries from the kotoshu/dictionaries repository.
#
# Every call to `entry.load` is wrapped in a `rescue StandardError`, so a
# dictionary that fails to load is reported and the remaining sections of
# the walkthrough still run.

require "benchmark"
require_relative "../lib/kotoshu"

# Width of the horizontal rules printed around each section.
RULE_WIDTH = 40

# Prints a section title followed by a dashed underline.
#
# @param title [String] the section title
# @return [void]
def heading(title)
  puts title
  puts "-" * RULE_WIDTH
end

# Prints the separator shown between sections: blank line, "=" rule, blank line.
#
# @return [void]
def divider
  puts
  puts "=" * RULE_WIDTH
  puts
end

puts "=== Example 7: Multi-Language Dictionaries ==="
puts

# Example 1: Load a specific dictionary by code
heading "1. Loading a Dictionary by Code"

catalog = Kotoshu::Dictionaries::Catalog

# Find and load British English dictionary
en_gb_entry = catalog.find("en-GB")
if en_gb_entry
  puts "Found: #{en_gb_entry.description}"
  puts "Source: #{en_gb_entry.source}"
  puts "License: #{en_gb_entry.license}"
  puts "URL: #{en_gb_entry.dic_url}"
  puts

  begin
    en_gb_dict = en_gb_entry.load
    puts "Loaded #{en_gb_dict.size} words"
    puts "Has 'colour': #{en_gb_dict.lookup?("colour")}"
    puts "Has 'color': #{en_gb_dict.lookup?("color")}"
  rescue StandardError => e
    # Same handling as the other load sites below: report and carry on.
    puts "✗ Failed to load #{en_gb_entry.name}: #{e.message}"
  end
else
  puts "Dictionary not found"
end

divider

# Example 2: List all dictionaries for a language
heading "2. All English Dictionaries"

english_dicts = catalog.by_language("en")
puts "Found #{english_dicts.size} English dictionaries:"
english_dicts.each do |entry|
  puts " #{entry.code}: #{entry.name} (#{entry.word_count} words)"
end

divider

# Example 3: List all available languages
heading "3. All Available Languages"

languages = catalog.languages
puts "Supported languages (#{languages.size}):"
puts languages.join(", ")

divider

# Example 4: Create spellcheckers for different languages
heading "4. Multi-Language Spellcheckers"

# Load dictionaries for multiple languages
languages_to_test = %w[en de es fr]

spellcheckers = {}
languages_to_test.each do |lang|
  entry = catalog.find(lang)
  next unless entry

  begin
    dict = entry.load
    spellcheckers[lang] = Kotoshu::Spellchecker.new(dictionary: dict)
    puts "✓ Loaded #{entry.name}: #{dict.size} words"
  rescue StandardError => e
    puts "✗ Failed to load #{entry.name}: #{e.message}"
  end
end

puts
puts "Testing multi-language spellchecking:"
puts

# Test words in different languages
test_cases = {
  "en" => { correct: "hello", incorrect: "helo" },
  "de" => { correct: "hallo", incorrect: "hllo" },
  "es" => { correct: "hola", incorrect: "hla" },
  "fr" => { correct: "bonjour", incorrect: "bnjour" }
}

test_cases.each do |lang, words|
  # Languages whose dictionary failed to load above have no checker; skip them.
  checker = spellcheckers[lang]
  next unless checker

  correct_result = checker.correct?(words[:correct])
  incorrect_result = checker.check_word(words[:incorrect])

  status = correct_result ? "✓" : "✗"
  puts "#{status} #{lang.upcase} '#{words[:correct]}': #{correct_result}"

  next unless incorrect_result.has_suggestions?

  suggestions = incorrect_result.top_suggestions(3).join(", ")
  puts " Suggestions for '#{words[:incorrect]}': #{suggestions}"
end

divider

# Example 5: Hunspell vs Plain Text formats
heading "5. Dictionary Formats"

hunspell_dicts = catalog.hunspell
plain_text_dicts = catalog.plain_text

puts "Hunspell dictionaries: #{hunspell_dicts.size}"
puts "Plain text dictionaries: #{plain_text_dicts.size}"
puts

# Show some examples of each
puts "Hunspell examples:"
hunspell_dicts.first(5).each do |entry|
  puts " #{entry.code}: #{entry.description}"
end

puts
puts "Plain text examples:"
plain_text_dicts.each do |entry|
  puts " #{entry.code}: #{entry.description}"
end

divider

# Example 6: Filter by license
heading "6. Dictionaries by License"

public_domain = catalog.by_license("Public Domain")
gpl = catalog.by_license("GPL")

puts "Public Domain dictionaries: #{public_domain.size}"
public_domain.each do |entry|
  puts " #{entry.code}: #{entry.name}"
end

puts
puts "GPL dictionaries: #{gpl.size}"
gpl.first(5).each do |entry|
  puts " #{entry.code}: #{entry.name}"
end

divider

# Example 7: Catalog statistics
heading "7. Catalog Statistics"

stats = catalog.statistics

puts "Total dictionaries: #{stats[:total]}"
puts " Hunspell: #{stats[:hunspell]}"
puts " Plain text: #{stats[:plain_text]}"
puts
puts "Languages: #{stats[:languages]}"
puts "Total words: #{stats[:total_words].round}"
puts
puts "By format:"
stats[:formats].each do |format, count|
  puts " #{format}: #{count}"
end

puts
puts "By license:"
stats[:licenses].each do |license, count|
  puts " #{license}: #{count}"
end

divider

# Example 8: Create spellcheckers with regional variants
heading "8. English Regional Variants"

english_variants = %w[en en-GB en-CA en-AU en-ZA]

english_variants.each do |code|
  entry = catalog.find(code)
  next unless entry

  begin
    dict = entry.load
    checker = Kotoshu::Spellchecker.new(dictionary: dict)

    # Test a word with different spellings
    colour_result = checker.correct?("colour")
    color_result = checker.correct?("color")

    puts "#{entry.name}:"
    puts " 'colour': #{colour_result ? "✓" : "✗"}"
    puts " 'color': #{color_result ? "✓" : "✗"}"
  rescue StandardError => e
    puts "#{entry.name}: ✗ Error - #{e.message}"
  end
end

divider

# Example 9: Timing dictionary load plus a single lookup
heading "9. Large Dictionary Performance"

large_dicts = %w[en de es fr ru]

large_dicts.each do |lang|
  entry = catalog.find(lang)
  next unless entry

  begin
    # The measured time covers loading the dictionary, building the
    # spellchecker and one word check.
    load_time = Benchmark.realtime do
      dict = entry.load
      checker = Kotoshu::Spellchecker.new(dictionary: dict)
      checker.correct?("hello")
    end

    puts "#{entry.name}: #{(load_time * 1000).round(1)}ms (load + check)"
  rescue StandardError => e
    puts "#{entry.name}: ✗ Error - #{e.message}"
  end
end

divider

# Example 10: Dictionary metadata
heading "10. Dictionary Metadata"

entry = catalog.find("ru")
if entry
  puts "Code: #{entry.code}"
  puts "Name: #{entry.name}"
  puts "Language: #{entry.language}"
  puts "Region: #{entry.region || "N/A"}"
  puts "Format: #{entry.format}"
  puts "Source: #{entry.source}"
  puts "License: #{entry.license}"
  puts "Word count: #{entry.word_count}"
  puts "Dictionary URL: #{entry.dic_url}"
  puts "Affix URL: #{entry.aff_url}" if entry.aff_url
  puts "Metadata: #{entry.metadata.inspect}"
end

divider

puts "For more information, see:"
puts " https://github.com/kotoshu/dictionaries"
puts
data/exe/kotoshu ADDED
@@ -0,0 +1,6 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Executable entry point for the `kotoshu` command: loads the CLI
# definition and passes it the raw command-line arguments.
require_relative "../lib/kotoshu/cli"

Kotoshu::Cli::Cli.start(ARGV)
@@ -0,0 +1,276 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Algorithms
    # Capitalization handling for different languages.
    #
    # Ported from Spylls (Python) capitalization.py
    #
    # This module provides capitalization detection and conversion for different
    # language casing rules, including special handling for Turkic and German languages.
    module Capitalization
      # Type of capitalization detected by Casing#guess.
      #
      # NO:: all lowercase ("foo")
      # INIT:: titlecase, only initial letter is capitalized ("Foo")
      # ALL:: all uppercase ("FOO")
      # HUH:: mixed capitalization ("fooBar")
      # HUHINIT:: mixed capitalization, first letter is capitalized ("FooBar")
      module Type
        NO = :no
        INIT = :init
        ALL = :all
        HUH = :huh
        HUHINIT = :huhinit
      end

      # Base class for casing-related algorithms specific for dictionary's language.
      #
      # This is a class (not a set of functions) because it needs to have
      # subclasses for specific language casing, which have only some aspects
      # different from generic one.
      class Casing
        # Guess word's capitalization. Redefined in GermanCasing.
        #
        # An empty string (and any string without cased characters) is
        # reported as Type::NO because it equals its own downcased form.
        #
        # @param word [String] The word to analyze
        # @return [Symbol] One of the Type constants
        def guess(word)
          return Type::NO if word.downcase == word
          return Type::ALL if word.upcase == word

          initial_upper = word[0].upcase == word[0]
          return Type::INIT if initial_upper && word[1..].downcase == word[1..]

          initial_upper ? Type::HUHINIT : Type::HUH
        end

        # Lowercases the word. Returns list of possible lowercasings for all
        # casing classes to behave consistently.
        #
        # In GermanCasing (and only there), lowercasing word like "STRASSE"
        # produces two possibilities: "strasse" and "straße" (ß is most of the
        # time upcased to SS, so we can't decide which of downcased words is
        # "right" and need to check both).
        #
        # Also redefined in TurkicCasing, because in Turkic languages lowercase
        # "i" is uppercased as "İ", and uppercase "I" is downcased as "ı".
        #
        # @param word [String, nil] The word to lowercase
        # @return [Array<String>] List of possible lowercasings; empty when
        #   the word is nil, empty, or starts with "İ"
        def lower(word)
          # Can't be properly lowercased in non-Turkic collation
          return [] if word.nil? || word.empty? || word[0] == 'İ'

          # Downcasing "İ" yields "i" followed by U+0307 (combining dot above);
          # fold that sequence to a plain latin "i", just in case.
          [word.downcase.gsub("i\u0307", 'i')]
        end

        # Uppercase the word. Redefined in TurkicCasing, because in Turkic
        # languages lowercase "i" is uppercased as "İ", and uppercase "I"
        # is downcased as "ı".
        #
        # @param word [String] The word to uppercase
        # @return [String] Uppercased word
        def upper(word)
          word.upcase
        end

        # Capitalize (convert word to all lowercase and first letter uppercase).
        # Yields each result (several are possible) for same reasons as lower.
        #
        # Nothing is yielded for a nil or empty word, or when the tail of the
        # word cannot be lowercased (see #lower).
        #
        # @param word [String, nil] The word to capitalize
        # @yieldparam variant [String] one capitalized variant
        # @return [Enumerator<String>] Enum of capitalized variants when no
        #   block is given
        def capitalize(word)
          return enum_for(:capitalize, word) unless block_given?
          return if word.nil? || word.empty?

          head = upper(word[0])
          if word.length == 1
            yield head
          else
            lower(word[1..]).each { |tail| yield head + tail }
          end
        end

        # Just change the case of the first letter to lower.
        # Yields each result (several are possible) for same reasons as lower.
        #
        # Nothing is yielded for a nil or empty word.
        #
        # @param word [String, nil] The word to process
        # @yieldparam variant [String] one variant with lowercased first letter
        # @return [Enumerator<String>] Enum of variants with lowercased first
        #   letter when no block is given
        def lowerfirst(word)
          return enum_for(:lowerfirst, word) unless block_given?
          return if word.nil? || word.empty?

          rest = word[1..]
          lower(word[0]).each { |head| yield head + rest }
        end

        # Returns hypotheses of how the word might have been cased (in dictionary),
        # if we consider it is spelled correctly.
        #
        # Example: If word is "Kitten", hypotheses are "Kitten", "kitten".
        #
        # @param word [String] The word to analyze
        # @return [Array(Symbol, Array<String>)] Pair of [captype, variants]
        def variants(word)
          captype = guess(word)

          result = case captype
                   when Type::NO, Type::HUH
                     [word]
                   when Type::INIT
                     [word, *lower(word)]
                   when Type::HUHINIT
                     [word, *lowerfirst(word)]
                   when Type::ALL
                     [word, *lower(word), *capitalize(word)]
                   end

          [captype, result]
        end

        # Returns hypotheses of how the word might have been cased if it is a
        # misspelling.
        #
        # Example: "DiCtionary" (HUHINIT capitalization) produces hypotheses
        # "DiCtionary", "diCtionary", "dictionary", "Dictionary", and all of
        # them are checked by Suggest.
        #
        # @param word [String] The word to analyze
        # @return [Array(Symbol, Array<String>)] Pair of [captype, variants]
        def corrections(word)
          captype = guess(word)

          result = case captype
                   when Type::NO
                     [word]
                   when Type::INIT, Type::HUH
                     [word, *lower(word)]
                   when Type::HUHINIT
                     [word, *lowerfirst(word), *lower(word), *capitalize(word)]
                   when Type::ALL
                     [word, *lower(word), *capitalize(word)]
                   end

          [captype, result]
        end

        # Used by suggest: by known (valid) suggestion, and initial word's
        # capitalization, produce proper suggestion capitalization.
        #
        # Example: If misspelling was "Kiten" (INIT capitalization),
        # found suggestion "kitten", then this method makes it "Kitten".
        #
        # A nil or empty word is returned unchanged, since there is no first
        # letter to uppercase.
        #
        # @param word [String] The valid suggestion word
        # @param cap [Symbol] Original word's capitalization type
        # @return [String] Properly capitalized suggestion
        def coerce(word, cap)
          return word if word.nil? || word.empty?

          case cap
          when Type::INIT, Type::HUHINIT
            upper(word[0]) + word[1..]
          when Type::ALL
            upper(word)
          else
            word
          end
        end
      end

      # Redefines upper and lower, because in Turkic languages lowercase "i"
      # is uppercased as "İ", and uppercase "I" is downcased as "ı".
      #
      # Example:
      #   turkic = Kotoshu::Algorithms::Capitalization::TurkicCasing.new
      #   turkic.lower('Izmir') # => ['ızmir']
      #   turkic.upper('Izmir') # => 'IZMİR'
      class TurkicCasing < Casing
        # Uppercase-to-lowercase overrides applied before generic downcasing.
        U2L = {
          'İ' => 'i',
          'I' => 'ı'
        }.freeze

        # Lowercase-to-uppercase overrides applied before generic upcasing.
        L2U = {
          'i' => 'İ',
          'ı' => 'I'
        }.freeze

        # Translate uppercase Turkic characters to lowercase, then apply the
        # generic lowercasing.
        #
        # @param word [String, nil] The word to lowercase
        # @return [Array<String>] List of lowercased variants; empty for a
        #   nil or empty word
        def lower(word)
          # Keep the base-class contract for nil instead of failing on nil.each_char.
          return [] if word.nil?

          super(word.each_char.map { |char| U2L.fetch(char, char) }.join)
        end

        # Translate lowercase Turkic characters to uppercase, then apply the
        # generic uppercasing.
        #
        # @param word [String] The word to uppercase
        # @return [String] Uppercased word
        def upper(word)
          super(word.each_char.map { |char| L2U.fetch(char, char) }.join)
        end
      end

      # Redefines lower because in German "SS" can be lowercased both as "ss" and "ß".
      #
      # Example:
      #   german = Kotoshu::Algorithms::Capitalization::GermanCasing.new
      #   german.lower('STRASSE') # => ['straße', 'strasse']
      class GermanCasing < Casing
        # Generate sharp S (ß) variants for all "ss" occurrences.
        #
        # For each "ss" found at or after +start+, emits the text with that
        # occurrence replaced by "ß", then recurses both on the replaced text
        # (to combine with later occurrences) and on the untouched text.
        # The unmodified text itself is not part of the result.
        #
        # @param text [String] The text to process
        # @param start [Integer] Starting position for search
        # @return [Array<String>] All variants with ß replacements
        def sharp_s_variants(text, start = 0)
          pos = text.index('ss', start)
          return [] unless pos

          replaced = "#{text[0...pos]}ß#{text[(pos + 2)..]}"
          [replaced,
           *sharp_s_variants(replaced, pos + 1),
           *sharp_s_variants(text, pos + 2)]
        end

        # Lowercase word, generating both "ss" and "ß" variants where applicable.
        #
        # @param word [String, nil] The word to lowercase
        # @return [Array<String>] List of lowercased variants; empty when the
        #   generic lowercasing yields nothing (nil, empty or "İ"-initial word)
        def lower(word)
          lowered = super.first
          # The generic implementation found nothing to lowercase: propagate
          # the empty result instead of returning [nil].
          return [] unless lowered
          return [lowered] unless word.include?('SS')

          [*sharp_s_variants(lowered), lowered]
        end

        # Guess word's capitalization, accounting for German ß handling.
        #
        # In German uppercased words, ß (which is lowercase, and usually uppercased
        # as SS) is allowed: "straße" => "STRAßE"
        #
        # @param word [String] The word to analyze
        # @return [Symbol] One of the Type constants
        def guess(word)
          result = super

          # Check if removing ß makes it ALL caps
          if word.include?('ß')
            word_without_ss = word.gsub('ß', '')
            return Type::ALL if super(word_without_ss) == Type::ALL
          end

          result
        end
      end
    end
  end
end