kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
data/README.adoc ADDED
@@ -0,0 +1,955 @@
1
+ = Kotoshu: Semantic Spell Checking for Ruby
2
+
3
+ image:https://img.shields.io/gem/v/kotoshu.svg[RubyGems Version]
4
+ image:https://img.shields.io/github/license/kotoshu/kotoshu.svg[License]
5
+ image:https://github.com/kotoshu/kotoshu/actions/workflows/main.yml/badge.svg["CI", link="https://github.com/kotoshu/kotoshu/actions/workflows/main.yml"]
6
+
7
+ == Status
8
+
9
+ Kotoshu is **v0.3.0** — building on the 0.2 cut, this release adds a strict
10
+ two-stage resource model (explicit `setup`, cache-only hot path), XDG base
11
+ directory layout, SHA-256 integrity verification, SARIF output, an
12
+ `--interactive` review loop, and ONNX model pipeline wiring.
13
+
14
+ === What works in 0.3
15
+
16
+ * Two-stage resource model — `Kotoshu.setup(:en)` then `Kotoshu.correct?("hello")`. The hot path is cache-only and raises `ResourceNotSetupError` on miss; downloads are never implicit.
17
+ * `Kotoshu.check(text, language: "en")` / `Kotoshu.suggest("helo")` — full document check and suggestions
18
+ * `Kotoshu.spellchecker_for(lang, strict: true)` — re-raise on optional-resource failures
19
+ * `kotoshu check FILE` CLI with these flags:
20
+ ** `--language en|de|es|fr|pt|ru|auto` (default: `auto`)
21
+ ** `--format text|json|sarif`
22
+ ** `--offline` — use only cached resources, never download
23
+ ** `--strict` — exit 3 if any optional resource (frequency, model) can't load
24
+ ** `--interactive` — review each error after the check
25
+ ** `--verbose`
26
+ * `kotoshu setup LANGUAGE [LANGUAGE ...]` — pre-warm spelling + frequency + ONNX caches for offline use (`fetch` is kept as a hidden deprecated alias)
27
+ * Local-source setup: `kotoshu setup en --aff path/to.en.aff --dic path/to.en.dic` or `kotoshu setup en --from /path/to/dict/dir/`
28
+ * Exit codes: `0` clean, `1` errors found, `2` usage error, `3` resource setup failed
29
+ * SHA-256 integrity verification (manifest-based, with graceful degradation when manifest is absent)
30
+ * Offline mode via `KOTOSHU_OFFLINE=1` or `--offline`
31
+ * XDG base directory layout — caches in `$XDG_CACHE_HOME/kotoshu/`, config in `$XDG_CONFIG_HOME/kotoshu/`, data in `$XDG_DATA_HOME/kotoshu/` (overridable via `KOTOSHU_CACHE_PATH`, `KOTOSHU_CONFIG_PATH`, `KOTOSHU_DATA_PATH`)
32
+
33
+ === Planned for 0.4+
34
+
35
+ * `--output` (file output redirection)
36
+ * ONNX semantic reranking as default path
37
+ * ≥30 language modules wired
38
+ * Grammar rule packs
39
+ * CJK and RTL language support
40
+
41
+ See link:TODO.impl/00-cut-0.2.md[the 0.2 cut plan], the 0.3 tasks under
42
+ `TODO.impl/`, and link:TODO.impl/00-vision.md[the vision] for the path to 1.0.
43
+
44
+ == Purpose
45
+
46
+ Kotoshu 「言修」 is a pure-Ruby spell checker that aims to work for every
47
+ language by dynamically downloading the right combination of dictionary,
48
+ frequency data, and embedding model on demand.
49
+
50
+ The current release pairs a Ruby port of the Hunspell algorithm
51
+ (traditional morphological lookup + affix rules) with optional FastText
52
+ word embeddings converted to ONNX for context-aware reranking.
53
+
54
+ NOTE: The semantic (ONNX) path is an optional feature. `gem install kotoshu`
55
+ works without `onnxruntime`; install it separately (`gem install onnxruntime`)
56
+ to enable context-aware reranking. Set `KOTOSHU_NO_ONNX=1` to opt back out.
57
+
58
+ == Features
59
+
60
+ NOTE: The list below describes the design vision. See <<_status,Status>> for
61
+ exactly what works in 0.3 and what is planned for 0.4+.
62
+
63
+ * <<multi-language, Multi-language support with automatic detection>>
64
+ * <<semantic-analysis, Semantic error detection using word embeddings>> (opt-in via Ruby API in 0.3)
65
+ * <<interactive-mode, Interactive review mode with full navigation>> (shipped in 0.3; navigation-only)
66
+ * <<batch-processing, Batch processing for CI/CD>> (JSON and SARIF in 0.3; `--output` planned for 0.4+)
67
+ * <<onnx-models, Fast ONNX inference via ONNX Runtime>>
68
+ * <<document-formats, Support for Markdown, AsciiDoc, and plain text>>
69
+ * <<multiple-models, Multiple analysis models (Hunspell, FastText, Hybrid)>> (CLI runs the Hunspell path only in 0.3)
70
+
71
+ == Architecture
72
+
73
+ Kotoshu is built on a modern, semantic architecture:
74
+
75
+ .Architecture overview
76
+ [source]
77
+ ----
78
+ ╔═══════════════════════════════════════════════════════════════════╗
79
+ ║ Kotoshu Semantic Architecture ║
80
+ ╠═══════════════════════════════════════════════════════════════════╣
81
+ ║ ║
82
+ ║ ┌─────────────────────────────────────────────────────────────┐ ║
83
+ ║ │ Interface Layer │ ║
84
+ ║ │ ┌─────────────────────┐ ┌─────────────────────────────┐ │ ║
85
+ ║ │ │ CLI (Thor) │ │ Ruby API │ │ ║
86
+ ║ │ │ lib/kotoshu/cli/ │ │ Kotoshu module methods │ │ ║
87
+ ║ │ └──────────┬──────────┘ └───────────┬─────────────────┘ │ ║
88
+ ║ │ │ Auto Language Detect │ │ ║
89
+ ║ └─────────────┼──────────────────────────┼─────────────────────┘ ║
90
+ ║ │ │ ║
91
+ ║ ▼ ▼ ║
92
+ ║ ┌─────────────────────────────────────────────────────────────┐ ║
93
+ ║ │ Analysis Layer │ ║
94
+ ║ │ ┌──────────────┐ ┌─────────────┐ ┌───────────────────┐ │ ║
95
+ ║ │ │ Hunspell │ │ FastText │ │ Hybrid (Best!) │ │ ║
96
+ ║ │ │ Dictionary │ │ Embeddings │ │ Combined │ │ ║
97
+ ║ │ │ (Traditional)│ │ (ONNX) │ │ Approach │ │ ║
98
+ ║ │ └──────────────┘ └─────────────┘ └───────────────────┘ │ ║
99
+ ║ └───────────────────────────┬─────────────────────────────────┘ ║
100
+ ║ │ ║
101
+ ║ ┌───────────────────────────▼─────────────────────────────────┐ ║
102
+ ║ │ Model Layer (ONNX) │ ║
103
+ ║ │ ┌──────────────────────────────────────────────────────┐ │ ║
104
+ ║ │ │ ONNX Runtime → Fast Embedding Lookup │ │ ║
105
+ ║ │ │ Semantic Similarity → Context-Aware Suggestions │ │ ║
106
+ ║ │ │ Nearest Neighbor Search → Smart Corrections │ │ ║
107
+ ║ │ └──────────────────────────────────────────────────────┘ │ ║
108
+ ║ └─────────────────────────────────────────────────────────────┘ ║
109
+ ║ ║
110
+ ╚═══════════════════════════════════════════════════════════════════╝
111
+ ----
112
+
113
+ === Key Components
114
+
115
+ * `Kotoshu::Models::OnnxModel`: ONNX-based word embedding model for fast
116
+ semantic similarity and nearest neighbor search.
117
+
118
+ * `Kotoshu::Analyzers::SemanticAnalyzer`: Unified semantic error detection
119
+ using word embeddings (no artificial spelling/grammar split).
120
+
121
+ * `Kotoshu::Language::LanguageIdentifier`: Automatic language detection
122
+ using FastText LID model (127 languages).
123
+
124
+ * `Kotoshu::Cli::InteractiveReviewer`: Interactive CLI for error review
125
+ with full navigation (forward, backward, jump, skip, accept).
126
+
127
+ * `Kotoshu::Dictionary::Hunspell`: Traditional Hunspell dictionary backend
128
+ for morphological analysis and affix rules.
129
+
130
+ == Why ONNX?
131
+
132
+ ONNX Runtime provides:
133
+
134
+ * **Performance**: C++ implementation, 10-100x faster than pure Ruby
135
+ * **Portability**: Works on CPU, GPU, TPU, mobile devices
136
+ * **Optimization**: Automatic graph optimization and quantization
137
+ * **Interoperability**: Models can be trained in Python, deployed in Ruby
138
+
139
+ Kotoshu uses FastText models converted to ONNX format for semantic spell checking.
140
+
141
+ [[semantic-analysis]]
142
+ == Semantic Analysis
143
+
144
+ Unlike traditional spell checkers that only check dictionary membership and
145
+ edit distance, Kotoshu uses semantic similarity to:
146
+
147
+ * Detect contextually appropriate corrections ("desert" vs "dessert")
148
+ * Handle out-of-vocabulary words via subword embeddings
149
+ * Provide ranked suggestions based on semantic similarity
150
+ * Support compound words and morphological variations
151
+
152
+ .Usage example
153
+ [example]
154
+ ====
155
+ [source,ruby]
156
+ ----
157
+ Kotoshu.setup(:en, want: %i[spelling model]) # one-time per language
158
+
159
+ # Traditional: knows "helo" is wrong and lists edit-distance candidates
160
+ Kotoshu.suggest("helo").to_words
161
+ # => ["hello", "help", "held", "hell", "hole"]
162
+
163
+ # Semantic: reranks candidates by context similarity
164
+ model = Kotoshu::Models::OnnxModel.from_github("en")
165
+ analyzer = Kotoshu::Analyzers::SemanticAnalyzer.new(model)
166
+ analyzer.suggest_corrections("helo", context: "I said helo to the world").map(&:word)
167
+ # => ["hello"] # "hello" makes more sense in greeting context
168
+ ----
169
+ ====
170
+
171
+ NOTE: The semantic path requires the optional `onnxruntime` gem. See
172
+ <<_requirements,Requirements>>.
173
+
174
+ [[multi-language]]
175
+ == Multi-Language Support
176
+
177
+ Kotoshu supports 6 languages with full semantic analysis:
178
+
179
+ * *de* - German (Deutsch)
180
+ * *en* - English
181
+ * *es* - Spanish (Español)
182
+ * *fr* - French (Français)
183
+ * *pt* - Portuguese (Português)
184
+ * *ru* - Russian (Русский)
185
+
186
+ Automatic language detection is enabled by default:
187
+
188
+ .Usage example
189
+ [example]
190
+ ====
191
+ [source,bash]
192
+ ----
193
+ # Language auto-detected from document content
194
+ kotoshu check document.txt
195
+ # Detected: en (95% confidence)
196
+ # Analyzing document.txt (language: en)...
197
+
198
+ # Explicit language specification
199
+ kotoshu check document.txt --language de
200
+ ----
201
+ ====
202
+
203
+ [[onnx-models]]
204
+ == ONNX Models
205
+
206
+ Kotoshu uses FastText crawl vectors converted to ONNX format:
207
+
208
+ * Source: https://fasttext.cc/docs/en/crawl-vectors.html[FastText Crawl Vectors]
209
+ * Format: ONNX with optimized runtime
210
+ * Vocabulary: 2 million words per language (full coverage)
211
+ * Dimension: 300-dimensional word vectors
212
+ * Size: ~2.4GB per language
213
+
214
+ === FastText File Formats
215
+
216
+ FastText provides two file formats. Kotoshu uses the `.vec` format for ONNX conversion.
217
+
218
+ [cols="1,1,2"]
219
+ |===
220
+ |Aspect |`.vec` (Text) |`.bin` (Binary)
221
+
222
+ |Content
223
+ |Word vectors only (pre-computed embeddings)
224
+ |Full FastText model (trained model)
225
+
226
+ |Structure
227
+ |Text: one word + 300 floats per line
228
+ |Binary: complete model with matrices
229
+
230
+ |File Size
231
+ |~1.3GB compressed (~2.4GB uncompressed)
232
+ |~1.8GB compressed (~4.8GB uncompressed)
233
+
234
+ |Train New Words
235
+ |✗ No (static lookup only)
236
+ |✓ Yes (supports further training and OOV words via subword information)
237
+
238
+ |Subword Embeddings
239
+ |✗ No
240
+ |✓ Yes (n-gram character embeddings)
241
+
242
+ |ONNX Converter
243
+ |✓ Supported (what we use)
244
+ |✗ Not supported
245
+
246
+ |Use Case
247
+ |Simple word vector lookup for spell checking
248
+ |Full FastText functionality (training, OOV)
249
+ |===
250
+
251
+ Kotoshu uses `.vec` files because:
252
+
253
+ * Simpler extraction: Just word → vector mapping
254
+ * No subword complexity needed: Dictionary-based spell checking doesn't require OOV generation
255
+ * Smaller ONNX models: ~2.4GB vs ~4.8GB
256
+ * Faster conversion: Direct serialization to ONNX
257
+
258
+ .Model management
259
+ [example]
260
+ ====
261
+ [source,bash]
262
+ ----
263
+ # Set up a language with spelling + ONNX semantic model
264
+ kotoshu setup en --want spelling,model
265
+
266
+ # List what's set up in the cache
267
+ kotoshu setup --list
268
+
269
+ # Re-validate cached resources
270
+ kotoshu cache validate
271
+ ----
272
+ ====
273
+
274
+ NOTE: FastText `.vec` → ONNX conversion is done upstream in the
275
+ https://github.com/kotoshu/models-fasttext-onnx[`kotoshu/models-fasttext-onnx`]
276
+ repo. The CLI downloads pre-converted artifacts; users do not run
277
+ conversion locally.
278
+
279
+ [[interactive-mode]]
281
+ == Interactive Mode
282
+
283
+ NOTE: Interactive mode shipped in 0.3.0. It is navigation-only — the
284
+ session records which suggestions the user accepted but does not rewrite
285
+ the source file yet.
286
+
287
+ [source,bash]
288
+ ----
289
+ kotoshu check README.md --interactive
290
+ ----
291
+
292
+ Features in 0.3:
293
+
294
+ * **Navigate**: [n] / Enter next, [p] previous, [l] list
295
+ * **Accept**: [1-9] record suggestion N for the current error
296
+ * **Skip**: [s] skip the current error
297
+ * **Quit**: [q] exit the review loop
298
+
299
+ [[batch-processing]]
300
+ == Batch Processing
301
+
302
+ For CI/CD and automation, Kotoshu supports JSON and SARIF output in 0.3;
303
+ `--output` file redirection is planned for 0.4+.
304
+
305
+ .JSON and SARIF output for CI/CD
306
+ [example]
307
+ ====
308
+ [source,bash]
309
+ ----
310
+ # JSON output to stdout (supported in 0.3)
311
+ kotoshu check README.md --format json
312
+
313
+ # SARIF 2.1.0 output (supported in 0.3)
314
+ kotoshu check README.md --format sarif
315
+
316
+ # Exit code for CI
317
+ kotoshu check README.md
318
+ echo $? # 0 if no errors, 1 if errors found
319
+ ----
320
+ ====
321
+
322
+ [[document-formats]]
323
+ == Document Formats
324
+
325
+ Kotoshu supports structured documents with AST parsing:
326
+
327
+ * *Plain text*: Line-based error detection
328
+ * *Markdown*: AST-based using Kramdown parser
329
+ * *AsciiDoc*: AST-based using Asciidoctor parser
330
+
331
+ Structured documents preserve node paths for precise error location.
332
+
333
+ [[multiple-models]]
334
+ == Analysis Models
335
+
336
+ NOTE: In 0.3, the CLI still runs the Hunspell traditional path only.
337
+ The `--model` flag and FastText/Hybrid paths are planned for 0.4+.
338
+ The Ruby API can opt into the semantic path today via
339
+ `Kotoshu::Models::OnnxModel` (auto-available when `onnxruntime` is installed).
340
+
341
+ Kotoshu is designed to support three analysis models:
342
+
343
+ .Analysis model comparison
344
+ [cols="2,2,2"]
345
+ |===
346
+ | Model | Description | Best For
347
+
348
+ | *hunspell* | Traditional dictionary-based with morphological rules | Fast checking, compound words, languages with complex morphology
349
+
350
+ | *fasttext* | Pure semantic embeddings via ONNX | Context awareness, out-of-vocabulary words, semantic similarity
351
+
352
+ | *hybrid* | Hunspell candidates + FastText reranking (recommended) | Maximum accuracy, best of both worlds
353
+ |===
354
+
355
+ .Intended usage (0.4+)
356
+ [example]
357
+ ====
358
+ [source,bash]
359
+ ----
360
+ # Fast dictionary-based checking (default in 0.3)
361
+ kotoshu check document.txt # 0.3: Hunspell path
362
+
363
+ # Semantic / hybrid paths: planned for 0.4+
364
+ # kotoshu check document.txt --model fasttext
365
+ # kotoshu check document.txt --model hybrid
366
+ ----
367
+ ====
368
+
369
+ == Installation
370
+
371
+ Add this line to your application's Gemfile:
372
+
373
+ [source,ruby]
374
+ ----
375
+ gem 'kotoshu'
376
+ ----
377
+
378
+ And then execute:
379
+
380
+ [source,sh]
381
+ ----
382
+ bundle install
383
+ ----
384
+
385
+ Or install it yourself as:
386
+
387
+ [source,sh]
388
+ ----
389
+ gem install kotoshu
390
+ ----
391
+
392
+ NOTE: `onnxruntime` is an optional dependency. Install it separately
393
+ (`gem install onnxruntime`) to enable semantic analysis; the
394
+ traditional Hunspell path works without it.
395
+
396
+ == Quick Start
397
+
398
+ [source,sh]
399
+ ----
400
+ # One-time per language: download spelling dictionary from
401
+ # github.com/kotoshu/dictionaries (idempotent, ~5 MB)
402
+ kotoshu setup en
403
+
404
+ # Then check files instantly, cache-only
405
+ kotoshu check README.md
406
+ ----
407
+
408
+ Or skip the explicit setup — the CLI will prompt interactively the
409
+ first time you check a file in a non-cached language (TTY only; in
410
+ non-TTY or `KOTOSHU_OFFLINE=1` mode it exits with code 3).
411
+
412
+ .Command-line usage
413
+ [source,bash]
414
+ ----
415
+ # Check a file (uses --language, or auto-detects from content)
416
+ kotoshu check README.md
417
+
418
+ # Explicit language
419
+ kotoshu check README.md --language en
420
+
421
+ # JSON output for programmatic use
422
+ kotoshu check README.md --format json
423
+
424
+ # Offline mode — use only cached dictionaries, never download
425
+ kotoshu check README.md --offline
426
+
427
+ # Check stdin
428
+ echo "helo wrld" | kotoshu check
429
+ ----
430
+
431
+ Exit codes: `0` (no errors), `1` (errors found), `2` (usage error),
432
+ `3` (language not set up — run `kotoshu setup LANG`, or run `kotoshu check`
433
+ in a TTY to be prompted).
434
+
435
+ .Ruby API usage
436
+ [source,ruby]
437
+ ----
438
+ require 'kotoshu'
439
+
440
+ # Stage 1: set up the language once (downloads from github.com/kotoshu/dictionaries)
441
+ Kotoshu.setup(:en)
442
+
443
+ # Stage 2: hot-path checks are cache-only and never touch the network
444
+ Kotoshu.correct?("hello") # => true
445
+ Kotoshu.correct?("helo") # => false
446
+
447
+ # Suggestions return a SuggestionSet; call #to_words for an Array
448
+ Kotoshu.suggest("helo").to_words # => ["hello", "help", "held", ...]
449
+
450
+ # Check a document
451
+ result = Kotoshu.check("Hello wrold")
452
+ result.errors.map(&:word) # => ["wrold"]
453
+
454
+ # Each error carries position + suggestions
455
+ result = Kotoshu.check_file("README.md")
456
+ result.errors.each do |error|
457
+ puts "#{error.word} at offset #{error.position}: #{error.top_suggestions(3).join(', ')}"
458
+ end
459
+
460
+ # Semantic analysis is optional — requires the onnxruntime gem
461
+ # (gem install onnxruntime). Skip this block if you only want Hunspell.
462
+ if Kotoshu::Models::OnnxModel::ONNX_LOADED
463
+ Kotoshu.setup(:en, want: %i[spelling model])
464
+ model = Kotoshu::Models::OnnxModel.from_github('en')
465
+ analyzer = Kotoshu::Analyzers::SemanticAnalyzer.new(model)
466
+ analyzer.analyze(Kotoshu.check("Hello wrold"))
467
+ end
468
+ ----
469
+
470
+ NOTE: The library API is strict: calls like `Kotoshu.correct?` raise
471
+ `Kotoshu::ResourceNotSetupError` until you've run `Kotoshu.setup`. This
472
+ prevents surprise downloads on metered networks. The CLI (`kotoshu check`)
473
+ intercepts the error and prompts to download interactively.
474
+
475
+ == Requirements
476
+
477
+ * Ruby 3.1+
478
+ * `onnxruntime` gem (optional — enables semantic spell checking; install separately with `gem install onnxruntime`)
479
+ * Python 3 + fasttext (optional, only if you want to convert `.vec` → `.onnx` upstream)
480
+
481
+ == Resource Caching and Language Support
482
+
483
+ Kotoshu uses a sophisticated multi-layer caching system to manage dictionaries,
484
+ frequency lists, and embedding models. Resources are downloaded explicitly via
485
+ `Kotoshu.setup` (or `kotoshu setup`) and cached under the XDG base directory
486
+ layout (`~/.cache/kotoshu/` by default; override via `KOTOSHU_CACHE_PATH`,
487
+ `KOTOSHU_CONFIG_PATH`, `KOTOSHU_DATA_PATH`, or the `XDG_*_HOME` vars).
488
+
489
+ === Cache Architecture
490
+
491
+ .Cache System Class Diagram
492
+ [source]
493
+ ----
494
+ ┌────────────────────────────────────────────────────────────────────────────┐
495
+ │ BaseCache (Abstract) │
496
+ │ ┌────────────────────────────────────────────────────────────────────┐ │
497
+ │ │ Common: download, metadata, validation, stats, TTL management │ │
498
+ │ └────────────────────────────────────────────────────────────────────┘ │
499
+ └────────────────────┬───────────────────┬────────────────────┬──────────────┘
500
+ │ │ │
501
+ ┌────────────▼────────┐ ┌──────▼──────┐ ┌───────▼─────────┐
502
+ │ LanguageCache │ │ModelCache │ │ FrequencyCache │
503
+ │ (Dictionaries) │ │ (Embeddings)│ │ (Kelly Lists) │
504
+ └─────────────────────┘ └─────────────┘ └─────────────────┘
505
+ │ │ │
506
+ ┌────────────▼────────┐ ┌──────▼──────┐ ┌───────▼─────────┐
507
+ │ ~/.cache/kotoshu/ │ │~/.cache/ │ │ ~/.cache/kotoshu/│
508
+ │ languages/ │ │ kotoshu/ │ │frequency-lists/ │
509
+ │ │ │ models/ │ │ │
510
+ └─────────────────────┘ └─────────────┘ └─────────────────┘
511
+ ----
512
+
513
+ === Cache Types
514
+
515
+ ==== LanguageCache (Dictionaries)
516
+
517
+ Manages Hunspell dictionaries and grammar rules for spell checking.
518
+
519
+ * **Cache Path**: `~/.cache/kotoshu/languages/{code}/`
520
+ * **TTL**: 7 days (604,800 seconds)
521
+ * **Source**: https://github.com/kotoshu/dictionaries[kotoshu/dictionaries]
522
+ * **Resources per language**:
523
+ ** `spelling/`: Hunspell dictionary (`index.dic`, `index.aff`)
523
+ ** `grammar/`: Grammar rules (`rules.yaml`) - *future*
524
+ ** `frequency/`: Frequency data - *deprecated, use FrequencyCache*
526
+
527
+ .Usage
528
+ [source,ruby]
529
+ ----
530
+ # Access via cache
531
+ cache = Kotoshu::Cache::LanguageCache.new
532
+ dict = cache.get_spelling('en')
533
+
534
+ # Result:
535
+ # {
536
+ # aff_path: "~/.cache/kotoshu/languages/en/spelling/index.aff",
537
+ # dic_path: "~/.cache/kotoshu/languages/en/spelling/index.dic",
538
+ # cached: true,
539
+ # metadata: { ... }
540
+ # }
541
+ ----
542
+
543
+ ==== FrequencyCache (Kelly Project)
544
+
545
+ Manages Kelly Project frequency lists for intelligent suggestion ranking.
546
+
547
+ * **Cache Path**: `~/.cache/kotoshu/frequency-lists/{code}/`
548
+ * **TTL**: 7 days (604,800 seconds)
549
+ * **Source**: https://github.com/kotoshu/frequency-list-kelly[kotoshu/frequency-list-kelly]
550
+ * **Format**: JSON with tiered word frequency data
551
+
552
+ .Kelly Frequency Data Structure
553
+ [source,json]
554
+ ----
555
+ {
556
+ "metadata": {
557
+ "language": "en",
558
+ "source": "Kelly Project (University of Leeds)",
559
+ "total_words_analyzed": 1500000
560
+ },
561
+ "tiers": {
562
+ "top_50": {
563
+ "words": ["the", "be", "to", "of", "and", ...],
564
+ "info": "Most common 50 words"
565
+ },
566
+ "top_200": {
567
+ "words": ["will", "my", "one", "all", ...],
568
+ "info": "Most common 200 words"
569
+ },
570
+ "top_1000": {
571
+ "words": ["however", "although", ...],
572
+ "info": "Most common 1000 words"
573
+ }
574
+ }
575
+ }
576
+ ----
577
+
578
+ .Usage
579
+ [source,ruby]
580
+ ----
581
+ # Access via cache
582
+ cache = Kotoshu::Cache::FrequencyCache.new
583
+ freq_data = cache.get('en', force_download: true)
584
+
585
+ # Result:
586
+ # {
587
+ # frequency_path: "~/.cache/kotoshu/frequency-lists/en/frequency.json",
588
+ # tiers: {
589
+ # top_50: Set<...>,
590
+ # top_200: Set<...>,
591
+ # top_1000: Set<...>
592
+ # },
593
+ # metadata: { ... }
594
+ # }
595
+
596
+ # Integrated into EditDistanceStrategy
597
+ strategy = Kotoshu::Suggestions::Strategies::EditDistanceStrategy.new(
598
+ language_code: 'en'
599
+ )
600
+ strategy.frequency_bonus('the') # => 200 (top 50)
601
+ strategy.frequency_bonus('hello') # => 100 (top 200)
602
+ strategy.frequency_bonus('xyz') # => 0 (not in lists)
603
+ ----
604
+
605
+ ==== ModelCache (Embedding Models)
606
+
607
+ Manages FastText and ONNX embedding models for semantic spell checking.
608
+
609
+ * **Cache Path**: `~/.cache/kotoshu/models/{code}/models/{type}/`
610
+ * **TTL**: 30 days (2,592,000 seconds)
611
+ * **Sources**:
612
+ ** FastText (.vec): Facebook CDN (https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/[dl.fbaipublicfiles.com])
613
+ ** ONNX (.onnx): Converted locally from FastText models
614
+ * **Supported Types**:
615
+ ** `fasttext`: FastText word vectors (.vec files, 300D) - Downloaded from Facebook CDN
616
+ ** `onnx`: ONNX-converted models (.onnx files) - Auto-converted from FastText
617
+
618
+ NOTE: ONNX models are automatically converted from FastText models on first use.
619
+ The conversion uses `lib/kotoshu/scripts/fasttext_to_onnx.py` and requires Python 3 with
620
+ `numpy` and `onnx` packages installed.
621
+
622
+ .Model Files by Language
623
+ [width="100%",cols="1,1m,1m",options="header"]
624
+ |=========================================
625
+ | Language | FastText File | ONNX File
626
+
627
+ | de (German) | cc.de.300.vec | fasttext.de.onnx
628
+ | en (English) | cc.en.300.vec | fasttext.en.onnx
629
+ | es (Spanish) | cc.es.300.vec | fasttext.es.onnx
630
+ | fr (French) | cc.fr.300.vec | fasttext.fr.onnx
631
+ | pt (Portuguese) | cc.pt.300.vec | fasttext.pt.onnx
632
+ | ru (Russian) | cc.ru.300.vec | fasttext.ru.onnx
633
+ |=========================================
634
+
635
+ === CLI Cache Management
636
+
637
+ Kotoshu provides CLI commands for managing cached resources:
638
+
639
+ [source,bash]
640
+ ----
641
+ # List all cached resources
642
+ kotoshu cache list
643
+
644
+ # List specific cache type
645
+ kotoshu cache list language
646
+ kotoshu cache list model
647
+ kotoshu cache list frequency
648
+
649
+ # Show cache statistics
650
+ kotoshu cache status
651
+
652
+ # Show detailed status (verbose)
653
+ kotoshu cache status --verbose
654
+
655
+ # Download a resource
656
+ kotoshu cache download language en
657
+ kotoshu cache download model en:fasttext
658
+ kotoshu cache download frequency en
659
+
660
+ # Get information about a resource
661
+ kotoshu cache info language en
662
+ kotoshu cache info model en:fasttext
663
+ kotoshu cache info frequency en
664
+
665
+ # Purge cached data
666
+ kotoshu cache purge all
667
+ kotoshu cache purge language en
668
+ kotoshu cache purge frequency
669
+
670
+ # Clean expired entries
671
+ kotoshu cache clean
672
+ ----
673
+
674
+ === Cache Statistics
675
+
676
+ Each cache type tracks statistics:
677
+
678
+ * **Hits**: Number of cache hits (resource found locally)
679
+ * **Misses**: Number of cache misses (had to download)
680
+ * **Hit Rate**: Percentage of cache hits
681
+ * **Size**: Total disk space used
682
+ * **Cached Resources**: Number of resources cached
683
+
684
+ [source,bash]
685
+ ----
686
+ $ kotoshu cache status
687
+ ======================================================================
688
+ Kotoshu Cache Status
689
+ ======================================================================
690
+
691
+ Language Cache:
692
+ Directory: /Users/username/.cache/kotoshu/languages
693
+ Resources cached: 2
694
+ Size: 2.45 MB
695
+ Hits: 15, Misses: 2
696
+ Hit rate: 88.2%
697
+
698
+ Frequency Cache:
699
+ Directory: /Users/username/.cache/kotoshu/frequency-lists
700
+ Resources cached: 1
701
+ Size: 815.84 KB
702
+ Hits: 42, Misses: 1
703
+ Hit rate: 97.7%
704
+
705
+ Model Cache:
706
+ Directory: /Users/username/.cache/kotoshu/models
707
+ Resources cached: 0
708
+ Size: 0 B
709
+ Hits: 0, Misses: 0
710
+ Hit rate: 0.0%
711
+
712
+ Total:
713
+ Total size: 3.25 MB
714
+ Overall hit rate: 95.0%
715
+ ======================================================================
716
+ ----
717
+
718
+ === Language Support Matrix
719
+
720
+ Kotoshu provides multi-language support with varying feature availability.
721
+
722
+ .Complete Language Support Matrix
723
+ [width="100%",cols="^1,^1,^1,^1,^1,^1,^1",options="header"]
724
+ |=========================================
725
+ | Language | Dictionary | Hunspell Affix Rules | Kelly Frequency | FastText Model | ONNX Model | Notes
726
+
727
+ | de (German) | ✓ (75,873 words) | ✓ | ✗ | ✓ (2.5 GB) | ✓ (~230 MB) | QWERTZ keyboard support
728
+ | en (English) | ✓ (49,568 words) | ✓ | ✓ (815 KB) | ✓ (4.3 GB) | ✓ (~460 MB) | QWERTY keyboard support
729
+ | es (Spanish) | ✓ (57,344 words) | ✓ | ✗ | ✓ (2.5 GB) | ✓ (~230 MB) | QWERTY keyboard support
730
+ | fr (French) | ✓ (84,310 words) | ✓ | ✗ | ✓ (2.5 GB) | ✓ (~230 MB) | AZERTY keyboard support
731
+ | pt (Portuguese) | ✓ (312,368 words) | ✓ | ✗ | ✓ (2.5 GB) | ✓ (~230 MB) | QWERTY keyboard support
732
+ | ru (Russian) | ✓ (146,269 words) | ✓ | ✓ (780 KB) | ✓ (2.5 GB) | ✓ (~230 MB) | JCUKEN keyboard support
733
+ | ar (Arabic) | ✗ | ✗ | ✓ | ✗ | ✗ | Kelly frequency only
734
+ | zh (Chinese) | ✗ | ✗ | ✓ | ✗ | ✗ | Kelly frequency only
735
+ | el (Greek) | ✗ | ✗ | ✓ | ✗ | ✗ | Kelly frequency only
736
+ | it (Italian) | ✗ | ✗ | ✓ | ✗ | ✗ | Kelly frequency only
737
+ | no (Norwegian) | ✗ | ✗ | ✓ | ✗ | ✗ | Kelly frequency only
738
+ | sv (Swedish) | ✗ | ✗ | ✓ | ✗ | ✗ | Kelly frequency only
739
+ |=========================================
740
+
741
+ .Dictionary Sources
742
+ [width="100%",cols="^1,^1,^1,^1",options="header"]
743
+ |=========================================
744
+ | Language | Word Count | License | Source
745
+
746
+ | de (German) | 75,873 | GPL | igerman98
747
+ | en (English) | 49,568 | LGPL/MPL/GPL | SCOWL
748
+ | es (Spanish) | 57,344 | GPL | LibreOffice
749
+ | fr (French) | 84,310 | MPL 2.0 | Grammalecte
750
+ | pt (Portuguese) | 312,368 | LGPLv3 + MPL | VERO
751
+ | ru (Russian) | 146,269 | BSD-style | Alexander Lebedev
752
+ |=========================================
753
+
754
+ .Kelly Frequency Lists
755
+ [width="100%",cols="^1,^1,^1",options="header"]
756
+ |=========================================
757
+ | Language | Size | Coverage
758
+
759
+ | ar (Arabic) | ~750 KB | Top 1000 words
760
+ | zh (Chinese) | ~800 KB | Top 1000 words
761
+ | en (English) | 815 KB | Top 1000 words
762
+ | el (Greek) | ~780 KB | Top 1000 words
763
+ | it (Italian) | ~790 KB | Top 1000 words
764
+ | no (Norwegian) | ~770 KB | Top 1000 words
765
+ | ru (Russian) | 780 KB | Top 1000 words
766
+ | sv (Swedish) | ~775 KB | Top 1000 words
767
+ |=========================================
768
+
769
+ NOTE: Kelly frequency lists provide the top 1000 most common words from
770
+ the Kelly Project (University of Leeds & University of Gothenburg).
771
+ Languages not listed here require external frequency data sources.
772
+
773
+ === Programmatic Usage
774
+
775
+ ==== Using Language Cache
776
+
777
+ [source,ruby]
778
+ ----
779
+ require 'kotoshu/cache/language_cache'
780
+
781
+ cache = Kotoshu::Cache::LanguageCache.new
782
+
783
+ # Get spelling dictionary
784
+ dict = cache.get_spelling('en')
785
+ puts "Dictionary: #{dict[:dic_path]}"
786
+ puts "Words: #{File.readlines(dict[:dic_path]).count}"
787
+
788
+ # Get available languages
789
+ cache.available_languages # => ["de", "en", "es", "fr", "pt", "ru"]
790
+
791
+ # Check if resource is cached
792
+ cache.available?('en:spelling') # => true
793
+
794
+ # Get language info
795
+ info = cache.language_info('en')
796
+ puts "Language: #{info[:name]}"
797
+ puts "Words: #{info[:word_count]}"
798
+ puts "License: #{info[:license]}"
799
+ ----
800
+
801
+ ==== Using Frequency Cache
802
+
803
+ [source,ruby]
804
+ ----
805
+ require 'kotoshu/cache/frequency_cache'
806
+
807
+ cache = Kotoshu::Cache::FrequencyCache.new
808
+
809
+ # Get frequency data
810
+ freq_data = cache.get('en')
811
+
812
+ # Access frequency tiers
813
+ top_50 = freq_data[:tiers][:top_50]
814
+ top_50.include?('the') # => true
815
+ top_50.include?('hello') # => false (it is in top_200, not top_50)
816
+
817
+ # Get available languages
818
+ cache.available_languages # => ["ar", "zh", "en", "el", "it", "no", "ru", "sv"]
819
+ ----
820
+
821
+ ==== Integration with Suggestion Strategies
822
+
823
+ [source,ruby]
824
+ ----
825
+ require 'kotoshu/suggestions/strategies/edit_distance_strategy'
826
+
827
+ # Frequency bonuses automatically applied
828
+ strategy = Kotoshu::Suggestions::Strategies::EditDistanceStrategy.new(
829
+ language_code: 'en'
830
+ )
831
+
832
+ # Suggestions are ranked by frequency
833
+ suggestions = strategy.suggest('helo', max_results: 5)
834
+ # => [
835
+ # { word: "hello", score: 1200 }, # High frequency word
836
+ # { word: "help", score: 1150 }, # Medium frequency word
837
+ # ...
838
+ # ]
839
+ ----
840
+
841
+ === Cache TTL and Expiration
842
+
843
+ All cached resources have a Time-To-Live (TTL) and automatically expire:
844
+
845
+ * **LanguageCache**: 7 days (dictionaries change infrequently)
846
+ * **FrequencyCache**: 7 days (frequency lists are stable)
847
+ * **ModelCache**: 30 days (models are large and change rarely)
848
+
849
+ Expired resources are automatically re-downloaded on next access.
850
+
851
+ [source,ruby]
852
+ ----
853
+ cache = Kotoshu::Cache::FrequencyCache.new
854
+
855
+ # Force re-download (ignores cache)
856
+ freq_data = cache.get('en', force_download: true)
857
+
858
+ # Clean expired entries manually
859
+ cache.clean
860
+ ----
861
+
862
+ === Manual Cache Management
863
+
864
+ [source,ruby]
865
+ ----
866
+ cache = Kotoshu::Cache::LanguageCache.new
867
+
868
+ # Clear specific resource
869
+ cache.clear('en:spelling')
870
+
871
+ # Clear all resources
872
+ cache.clear_all
873
+
874
+ # Check if resource exists
875
+ cache.available?('en:spelling') # => true after download
876
+
877
+ # Get statistics
878
+ stats = cache.stats
879
+ puts "Hit rate: #{stats[:hit_rate] * 100}%"
880
+ puts "Size: #{stats[:size_bytes]} bytes"
881
+ ----
882
+
883
+ === GitHub Repository Structure
884
+
885
+ The kotoshu/dictionaries repository follows this structure:
886
+
887
+ [source]
888
+ ----
889
+ kotoshu/dictionaries/
890
+ ├── en/
891
+ │ ├── spelling/
892
+ │ │ ├── index.dic # Hunspell dictionary
893
+ │ │ ├── index.aff # Hunspell affix rules
894
+ │ │ └── metadata.json # Version info
895
+ │ ├── grammar/
896
+ │ │ └── rules.yaml # Grammar rules (future)
897
+ │ └── models/
898
+ │ ├── fasttext/
899
+ │ │ └── cc.en.300.vec # FastText vectors
900
+ │ └── onnx/
901
+ │ └── fasttext.en.onnx # ONNX model
902
+ ├── de/
903
+ │ └── ... (same structure)
904
+ └── README.md
905
+
906
+ kotoshu/frequency-list-kelly/
907
+ ├── data/
908
+ │ ├── en.json # Kelly frequency data
909
+ │ ├── ru.json
910
+ │ └── ...
911
+ └── README.md
912
+ ----
913
+
914
+ === Adding New Languages
915
+
916
+ To add support for a new language:
917
+
918
+ 1. **Dictionary**: Add Hunspell dictionary to `kotoshu/dictionaries/{code}/spelling/`
919
+ 2. **Frequency**: Add Kelly frequency data to `kotoshu/frequency-list-kelly/data/{code}.json`
920
+ 3. **Register**: Add to `AVAILABLE_LANGUAGES` in `LanguageCache`
921
+ 4. **Test**: Run integration tests to verify
922
+
923
+ See CONTRIBUTING.adoc for detailed guidelines.
924
+
925
+ == Model Repository
926
+
927
+ ONNX models are hosted at: https://github.com/kotoshu/dictionaries[kotoshu/dictionaries]
928
+
929
+ Download and setup:
930
+
931
+ [source,bash]
932
+ ----
933
+ # Preferred: let kotoshu fetch and verify the model
934
+ kotoshu setup en --want spelling,model
935
+
936
+ # Manual clone (advanced; bypasses manifest verification)
937
+ git clone https://github.com/kotoshu/dictionaries.git ~/src/kotoshu/dictionaries
938
+ ----
939
+
940
+ == License
941
+
942
+ BSD 2-Clause — see the link:LICENSE[LICENSE] file for details.
943
+
944
+ Bundled dictionaries and frequency lists carry their own licenses; see the
945
+ per-language `license` files in https://github.com/kotoshu/dictionaries[kotoshu/dictionaries].
946
+
947
+ == Contributing
948
+
949
+ Contributions are welcome! Please see CONTRIBUTING.adoc for guidelines.
950
+
951
+ == Acknowledgments
952
+
953
+ * FastText: https://fasttext.cc/[Facebook Research]
954
+ * ONNX Runtime: https://onnxruntime.ai/[Microsoft]
955
+ * Hunspell: https://hunspell.github.io/[László Németh]