kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 496e00727251ee935375c0a8013132917384d1847fe99ed3752a414f3a73e5d6
4
+ data.tar.gz: 6998e4c7879ecca1888a1bedbfe41af5e5516c8a2497457f887c06f310f53678
5
+ SHA512:
6
+ metadata.gz: 9bafd7e06458c93a4a3b817fdddd50beac4a74cbcc22c3cf0de805dd1f2211d023c5f7dea2e187b1c0fdbe1772901c7e9c22d2956c3b9d887a2f20dbdb267b35
7
+ data.tar.gz: 4e37cee72f98cf5a171c1f167c52720213ab93d99919f50268bba998836233d09a3e4fb8b0d1756bacc52fb554e42541a7ba8918dba99067dc2da146dd5e3e51
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,18 @@
1
+ inherit_from:
2
+ - https://raw.githubusercontent.com/riboseinc/oss-guides/main/ci/rubocop.yml
3
+ - .rubocop_todo.yml
4
+
5
+ plugins:
6
+ - rubocop-performance
7
+ - rubocop-rake
8
+ - rubocop-rspec
9
+
10
+ AllCops:
11
+ TargetRubyVersion: 3.0
12
+ NewCops: enable
13
+ SuggestExtensions: false
14
+ Exclude:
15
+ - 'debug_*.rb'
16
+ - 'test_*.rb'
17
+ - 'vendor/**/*'
18
+ - 'tmp/**/*'
data/CHANGELOG.md ADDED
@@ -0,0 +1,182 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.3.0] — 2026-06-27
11
+
12
+ The two-stage release. Resources are now downloaded explicitly via
13
+ `Kotoshu.setup(:en)`; the hot path (`correct?`, `suggest`, `check`) reads
14
+ only from cache and raises a typed error when a language is missing instead
15
+ of triggering a network download. The CLI adds `setup`, `status`, language
16
+ auto-detection, SARIF/JSON output, and an interactive auto-setup prompt.
17
+ `onnxruntime` is now a soft dependency, so `gem install kotoshu` succeeds on
18
+ hosts that can't load the native ONNX runtime.
19
+
20
+ ### Added
21
+
22
+ - **Two-stage resource model** (`Kotoshu::ResourceManager`):
23
+ `Kotoshu.setup(:en, want: %i[spelling frequency model])` writes into the
24
+ cache; `Kotoshu::ResourceManager.resolve(language:, want:)` is instant and
25
+ cache-only, raising `ResourceNotSetupError` on miss. `Kotoshu.setup?` is
26
+ the predicate for "is this language already cached?". The library never
27
+ triggers a surprise download; the CLI prompts the user via `AutoSetup`.
28
+ - **`SourceRegistry`** — single source of truth for the three content repos'
29
+ URLs and per-repo pins. `kotoshu/dictionaries` is pinned to the `v1`
30
+ branch; `frequency-list-kelly` and `models-fasttext-onnx` are on `main`.
31
+ Override at runtime via `KOTOSHU_REPOS_BASE_URL`, `KOTOSHU_DICTIONARIES_PIN`,
32
+ `KOTOSHU_FREQUENCY_PIN`, `KOTOSHU_MODELS_PIN`.
33
+ - **XDG Base Directory layout** (`Kotoshu::Paths`): dictionaries, frequency
34
+ lists, ONNX models under `$XDG_CACHE_HOME/kotoshu/`; personal dictionary
35
+ and `kotoshu.cfg` under `$XDG_CONFIG_HOME/kotoshu/`; audit log under
36
+ `$XDG_DATA_HOME/kotoshu/audit.log`. Override per-axis with
37
+ `KOTOSHU_CACHE_PATH`, `KOTOSHU_CONFIG_PATH`, `KOTOSHU_DATA_PATH`.
38
+ - **Integrity verification** — `Kotoshu::Integrity::Manifest` (SHA-256) is
39
+ fetched per content repo and matched against every download. Mismatches
40
+ raise `Kotoshu::IntegrityError`. Outcomes (verified / unverified / mismatch)
41
+ are written to the audit log. Missing manifests degrade gracefully.
42
+ - **CLI `setup` command** — `kotoshu setup LANG [--force] [--no-frequency]
43
+ [--no-model]` writes the requested resources into the cache with progress
44
+ reporting.
45
+ - **CLI `status` command** — `kotoshu status [--json]` summarises installed
46
+ resources, sizes, mtimes, and ONNX runtime availability.
47
+ - **CLI `check --language auto`** — auto-detects document language via
48
+ FastText LID; falls back to the configured default language when detection
49
+ is unavailable or the detected language is not set up.
50
+ - **CLI `check --format json|sarif`** — machine-readable output. SARIF
51
+ follows v2.1.0 with the `kotoshu/spelling` rule id; JSON exposes
52
+ `success`/`wordCount`/`errorCount`/`uniqueErrorCount`/`errors`/`source`.
53
+ - **CLI auto-setup prompt** — when the hot path raises
54
+ `ResourceNotSetupError` in an interactive session, the user is prompted to
55
+ run setup now and the original command is retried on success. Non-TTY,
56
+ offline (`--offline`), and `--no-prompt` invocations skip the prompt and
57
+ surface the error as before.
58
+ - **Download progress reporting** (`Kotoshu::Cli::ProgressReporter`) — TTY
59
+ mode renders a determinate/indeterminate progress bar; non-TTY mode prints
60
+ a periodic line every 10 MiB. `Kotoshu.configuration.download_reporter=`
61
+ exposes the reporter for programmatic use.
62
+ - **End-to-end smoke spec** (`spec/integration/end_to_end_spec.rb`) covers
63
+ install → setup → `correct?` → `suggest.to_words` → `check` →
64
+ `setup?` predicate → `ResourceNotSetupError` → idempotent re-setup.
65
+ Tagged `:network`, opted into via `NETWORK_TESTS=1`.
66
+ - **CLI format spec** (`spec/kotoshu/cli/check_format_spec.rb`) shells out to
67
+ the real `kotoshu` CLI and asserts JSON / SARIF structure and exit codes.
68
+
69
+ ### Changed
70
+
71
+ - **`onnxruntime` is a soft dependency.** Removed from `kotoshu.gemspec`.
72
+ `Kotoshu::Models::OnnxModel` soft-requires it at load time and exposes
73
+ `ONNX_LOADED`. When false, semantic methods raise
74
+ `Kotoshu::Models::OnnxModel::OnnxUnavailable` with a caller-friendly
75
+ message. `KOTOSHU_NO_ONNX=1` forces semantic off even when the gem is
76
+ present. The traditional spell-checking path never touches `onnxruntime`.
77
+ - **Loading strategy** — `lib/kotoshu.rb` eagerly loads only the facade
78
+ dependencies; heavier or optional pieces (ONNX models, interactive CLI,
79
+ caches, language detection) are wired through Ruby `autoload` registered
80
+ in their immediate parent namespace.
81
+ - **Public API** — `suggest` returns a `SuggestionSet`; call `.to_words` for
82
+ an `Array<String>`. `Kotoshu.check` returns a `DocumentResult`; iterate
83
+ `errors` for `WordResult` instances with `word`, `position`, `line`,
84
+ `column`, `suggestions`.
85
+ - **README quickstart** — reflects the two-stage API; documents XDG paths;
86
+ marks `onnxruntime` as optional.
87
+
88
+ ### Fixed
89
+
90
+ - `gem install kotoshu` no longer requires `onnxruntime` or its native
91
+ toolchain.
92
+ - Resource resolution no longer triggers downloads from inside the hot path.
93
+ - Per-repo pins are honoured — the `v1` branch of `kotoshu/dictionaries` is
94
+ fetched instead of `main`.
95
+
96
+ ### Known limitations (carried from 0.1.0, scope reduced)
97
+
98
+ - **Hunspell correctness**: compound rules, circumfix, ICONV/OCONV, German ß,
99
+ Turkish dotless-i remain partial. See `TODO.impl/01-hunspell-correctness.md`.
100
+ - **CJK and RTL**: tokenizer, normalizer, and keyboard layouts exist for
101
+ supported languages; full CJK/RTL support deferred past 0.3.
102
+ See `TODO.impl/06-cjk-support.md` and `TODO.impl/07-rtl-support.md`.
103
+ - **Grammar rules**: the rule engine exists; no rule packs are shipped.
104
+ See `TODO.impl/08-grammar-engine.md`.
105
+ - **Audit log rotation, cache eviction policy, and shell completion** are
106
+ deferred past 0.3 (T3 TODOs).
107
+
108
+ ### Internal
109
+
110
+ - 9 logical commits on `release-0.3` cover the T1 (architectural) and T2
111
+ (user-facing) work for this release.
112
+ - `SourceRegistry`, `Paths`, `ResourceManager`, `ResourceBundle`,
113
+ `SetupResult`, `Integrity::Manifest`, `Integrity::AuditLog`,
114
+ `Cli::AutoSetup`, `Cli::StatusReport`, `Cli::LanguageResolver`,
115
+ `Cli::ProgressReporter` are new model-driven types.
116
+ - 73 new specs added (source_registry, end_to_end, check_format,
117
+ progress_reporter, language_resolver, status_report, auto_setup).
118
+
119
+ ### Contributors
120
+
121
+ - Ribose Inc.
122
+
123
+ ## [0.1.0] — 2026-06-25
124
+
125
+ First public release. Kotoshu is a pure-Ruby spellchecker that combines a
126
+ Ruby port of the Hunspell algorithm with optional FastText ONNX embeddings
127
+ for semantic reranking. This release establishes the public Ruby API, the
128
+ basic CLI, and the cache layer.
129
+
130
+ ### Working
131
+
132
+ - **Ruby API**: `Kotoshu.correct?`, `Kotoshu.suggest`, `Kotoshu.check`,
133
+ `Kotoshu.check_file`, `Kotoshu.detect_language`
134
+ - **CLI**: `kotoshu check TARGET`, `kotoshu dict SUBCOMMAND`, `kotoshu cache
135
+ SUBCOMMAND`, `kotoshu version`
136
+ - **Dictionary backends**: Hunspell (`.aff`/`.dic`), CSpell, UnixWords
137
+ (`/usr/share/dict/words`), PlainText, Custom
138
+ - **Suggestion strategies**: edit distance, phonetic (Phonet), keyboard
139
+ proximity, n-gram, symspell, composite pipeline
140
+ - **Configuration**: `Kotoshu.configure`, CLI > ENV (`KOTOSHU_*`) >
141
+ programmatic > defaults via `Configuration::Resolver`
142
+ - **Cache layer**: `LanguageCache`, `FrequencyCache`, `ModelCache` with TTLs
143
+ and download from `kotoshu/dictionaries`, `kotoshu/frequency-list-kelly`,
144
+ `kotoshu/models-fasttext-onnx`
145
+ - **Language detection**: FastText LID, 127 languages
146
+ - **Documents**: Plain text, Markdown (Kramdown), AsciiDoc (Asciidoctor)
147
+ - **Test suite**: 803 of 866 examples passing (92.7%), 6 pending
148
+
149
+ ### Known limitations (not blocking 0.1)
150
+
151
+ - **Hunspell correctness**: compound rules, circumfix, ICONV/OCONV, German
152
+ ß, Turkish dotless-i are not fully implemented. Single-word lookup and
153
+ basic affixes work. See `TODO.impl/01-hunspell-correctness.md`.
154
+ - **CLI surface**: `--interactive`, `--format sarif|json|yaml|csv`,
155
+ `--model fasttext|hybrid`, `--language auto` exist in
156
+ `lib/kotoshu/commands/check_command.rb` but are not wired through
157
+ `exe/kotoshu`. See `TODO.impl/02-cli-unification.md`.
158
+ - **Semantic path**: gated behind `ENV['KOTOSHU_REQUIRE_ONNX']` because
159
+ `onnxruntime` loads eagerly otherwise. Hybrid mode is not the default.
160
+ See `TODO.impl/05-semantic-path.md`.
161
+ - **Dynamic resolution**: the three caches exist independently; there is
162
+ no unified `ResourceManager` that takes arbitrary text and yields the
163
+ full resource bundle. See `TODO.impl/03-dynamic-download.md`.
164
+ - **Languages**: code is wired for English by default. The
165
+ `dictionaries` repo has 98 language directories but the gem's
166
+ `lib/kotoshu/languages/` has only 7 modules (de, en, es, fr, ja, pt,
167
+ ru). See `TODO.impl/04-language-modules.md`.
168
+ - **CJK, RTL**: not implemented. See `TODO.impl/06-cjk-support.md`
169
+ and `TODO.impl/07-rtl-support.md`.
170
+ - **Grammar rules**: the rule engine exists; no rule packs are shipped.
171
+ See `TODO.impl/08-grammar-engine.md`.
172
+ - **Integrity verification**: downloaded resources are not currently
173
+ checksummed. See `TODO.impl/09-integrity-security.md`.
174
+
175
+ ### Internal
176
+
177
+ - 12 plans under `TODO.impl/` define the path to 1.0
178
+ - Architecture documentation consolidated under `docs/`
179
+
180
+ ### Contributors
181
+
182
+ - Ribose Inc.
data/CLAUDE.md ADDED
@@ -0,0 +1,172 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## What Kotoshu Is
6
+
7
+ Kotoshu 「言修」 is a **semantic** spell checker for Ruby. It pairs a traditional
8
+ dictionary/affix backend (Hunspell-style) with ONNX-converted FastText word
9
+ embeddings for context-aware suggestions. The README.adoc is the authoritative
10
+ user-facing description; this file is the contributor-facing map.
11
+
12
+ Key dependencies (`kotoshu.gemspec`): `thor` (CLI), `suika` (tokenizer),
13
+ `onnxruntime` (semantic inference — an optional soft dependency that is not declared in the gemspec). Ruby 3.1+.
14
+
15
+ ## Development Commands
16
+
17
+ ```bash
18
+ bundle exec rspec # Run the full test suite
19
+ bundle exec rspec spec/path/to_spec.rb # Run one file
20
+ bundle exec rspec -e "matches a word" # Run examples matching a name
21
+ bundle exec rspec --only-failures # Rerun just failing examples (uses .rspec_status)
22
+
23
+ NETWORK_TESTS=1 bundle exec rspec # Opt INTO tests that download dictionaries
24
+ bundle exec rubocop # Lint
25
+ bundle exec rubocop -A # Lint with safe auto-fix
26
+ bundle exec rake # default task = spec + rubocop
27
+
28
+ bundle exec bin/console # IRB with Kotoshu loaded
29
+ bundle exec exe/kotoshu check FILE # Run the CLI locally
30
+ gem build kotoshu.gemspec && gem install kotoshu-*.gem
31
+ ```
32
+
33
+ Notes that aren't obvious from the Rakefile:
34
+ - `spec/spec_helper.rb` excludes anything tagged `:network` unless `NETWORK_TESTS=1` is set — those specs download dictionaries from GitHub and are slow/flaky.
35
+ - SimpleCov runs on every `rspec` invocation (configured in `spec_helper.rb`).
36
+ - `spec/spylls_test_helper.rb` is mixed into every spec. It ports Hunspell's reference test fixtures from [Spylls](https://github.com/zverok/spylls) (the Python Hunspell port); many specs assert behavior against those fixtures.
37
+
38
+ ## Architecture
39
+
40
+ Kotoshu has **two parallel checking paths** that share infrastructure:
41
+
42
+ 1. **Traditional path** — `Kotoshu::Spellchecker` (facade) → `Suggestions::Generator` → pluggable `Dictionary::*` backends + `Suggestions::Strategies::*` algorithms. This is what `Kotoshu.correct?` / `Kotoshu.suggest` / `Kotoshu.check` use.
43
+ 2. **Semantic path** — `Analyzers::SemanticAnalyzer` driven by an `Models::EmbeddingModel` (`FastTextModel` or `OnnxModel`). Used for context-aware reranking and OOV handling. This path is **opt-in** and only loads when needed.
44
+
45
+ ### Layer map
46
+
47
+ ```
48
+ exe/kotoshu ─► lib/kotoshu/cli.rb (Kotoshu::Cli::Cli < Thor)
49
+ subcommands: check, dict (DictCommand), cache (CacheCommand)
50
+ helpers: cli/interactive_reviewer, cli/batch_reporter,
51
+ cli/navigation_manager, cli/display_formatter
52
+
53
+ Kotoshu module (lib/kotoshu.rb) ─► public facade methods
54
+ .correct? .suggest .check .check_file .detect_language ...
55
+ all delegate to a singleton Spellchecker
56
+
57
+ Spellchecker ─► Configuration ─► Dictionary::Repository ─► Dictionary::*
58
+
59
+ └─► Suggestions::Generator
60
+ └─► Strategies::CompositeStrategy
61
+ (edit_distance, phonetic,
62
+ keyboard_proximity, ngram,
63
+ symspell, semantic)
64
+
65
+ SemanticAnalyzer ─► Models::OnnxModel | Models::FastTextModel
66
+ └─► Embeddings::* (vocabulary, similarity search, LRU cache)
67
+ ```
68
+
69
+ ### Loading strategy
70
+
71
+ `lib/kotoshu.rb` eagerly `require_relative`s the traditional path (core models, dictionaries, strategies, configuration, spellchecker) and `autoload`s the heavier / optional pieces (ONNX models, documents, interactive CLI, caches, language detection, debug/metrics). When adding a new top-level component, follow the existing split: eager-load only what the facade needs at boot; autoload the rest.
72
+
73
+ **ONNX is a soft dependency.** `onnxruntime` is NOT in `kotoshu.gemspec` — `gem install kotoshu` succeeds without it. `Models::OnnxModel` soft-requires it at load time and exposes `ONNX_LOADED` (true/false). When false, semantic methods raise `Models::OnnxModel::OnnxUnavailable` with a caller-friendly message. `KOTOSHU_NO_ONNX=1` forces semantic off even when the gem is present. The traditional spell-checking path never touches onnxruntime.
74
+
75
+ ### Resource lifecycle — two-stage model
76
+
77
+ Resources (dictionaries, frequency lists, ONNX models) flow through a strict two-stage API in `ResourceManager`:
78
+
79
+ 1. **Setup** (`Kotoshu.setup(:en, want: %i[spelling frequency model])`, or `Kotoshu.setup(:en, aff:, dic:)` / `from:` for local sources). Slow, network-required, explicit. Writes into the cache.
80
+ 2. **Resolve** (`Kotoshu::ResourceManager.resolve(language:, want:)`). Instant, cache-only, raises `ResourceNotSetupError` on miss.
81
+
82
+ The hot path (`Kotoshu.correct?`, `.check`, `.suggest`, `.spellchecker_for`) calls `resolve` and lets the error propagate — **setup is never implicit**. This is intentional: users on metered networks or air-gapped hosts must not get a surprise download. `Kotoshu.setup?(:en, resource: :spelling|:frequency|:model)` is the predicate for "is this already in cache?".
83
+
84
+ `ResourceBundle` (the resolve result) carries `dictionary`, `frequency`, `model`, and `rules`. `SetupResult` (the setup result) reports per-resource status (`:downloaded | :local | :cached | :unavailable`).
85
+
86
+ ### Paths — XDG Base Directory
87
+
88
+ All on-disk locations are resolved through `Kotoshu::Paths`, which honors `XDG_CACHE_HOME`, `XDG_CONFIG_HOME`, `XDG_DATA_HOME` and the override envs `KOTOSHU_CACHE_PATH`, `KOTOSHU_CONFIG_PATH`, `KOTOSHU_DATA_PATH`. Defaults:
89
+
90
+ | Concern | Default path |
91
+ |---|---|
92
+ | Language dictionaries, frequency lists, ONNX models | `~/.cache/kotoshu/` |
93
+ | Personal dictionary, kotoshu.cfg | `~/.config/kotoshu/` |
94
+ | Audit log | `~/.local/share/kotoshu/audit.log` |
95
+
96
+ ### Resource caching
97
+
98
+ Three caches under `~/.cache/kotoshu/` (see `CACHE_ARCHITECTURE.md` for detail, README.adoc for the user-facing version):
99
+
100
+ | Cache | Path | Source | TTL |
101
+ |---|---|---|---|
102
+ | `Cache::LanguageCache` | `~/.cache/kotoshu/languages/{code}/spelling/` | `github.com/kotoshu/dictionaries` | 7 days |
103
+ | `Cache::FrequencyCache` | `~/.cache/kotoshu/frequency-lists/{code}/` | `github.com/kotoshu/frequency-list-kelly` | 7 days |
104
+ | `Cache::ModelCache` | `~/.cache/kotoshu/models/{code}/...` | `github.com/kotoshu/models-fasttext-onnx` (FastText `.vec` → ONNX converted upstream) | 30 days |
105
+
106
+ `FrequencyCache` feeds `frequency_bonus` in `Suggestions::Strategies::EditDistanceStrategy` — high-frequency words get a ranking boost. The `kotoshu cache` subcommand exposes list/status/download/info/purge/clean operations.
107
+
108
+ ### Configuration
109
+
110
+ `Configuration` (singleton via `.instance`) is built from a `SCHEMA` hash that declares each option's ENV var, default, type, and description. The `Configuration::Resolver` enforces the priority chain: **CLI flags > ENV (`KOTOSHU_*`) > programmatic > defaults**. When adding a config option, add it to `SCHEMA` (and probably `DEFAULTS`) rather than sprinkling `attr_accessor`s — that's how it picks up ENV support automatically.
111
+
112
+ `dictionary_type` selects the backend: `:unix_words | :plain_text | :custom | :hunspell | :cspell`. The dictionary is lazy-loaded through `Configuration#dictionary` (cached on the instance; call `reset_dictionary` to reload).
113
+
114
+ ### Language support
115
+
116
+ Full features (dictionary + affixes + FastText + ONNX + keyboard layout): `de, en, es, fr, pt, ru`.
117
+ Kelly frequency only: `ar, zh, el, it, no, sv` (and `ru`).
118
+ `Language::Identifier` does automatic detection (FastText LID model, 127 languages). Per-language behavior (tokenizer, normalizer) lives in `languages/{code}/language.rb` and `language/tokenizer/*`. Keyboard layouts (`keyboard/layouts/*`) feed `KeyboardProximityStrategy`.
119
+
120
+ ### Suggestion strategies
121
+
122
+ `Suggestions::Generator::DEFAULT_ALGORITHMS` = `[EditDistanceStrategy, PhoneticStrategy, KeyboardProximityStrategy, NgramStrategy]`, composed via `Strategies::CompositeStrategy`. Also available: `SymspellStrategy`, `SemanticStrategy`. Register new algorithms via `Kotoshu.register_suggestion_algorithm(:name, Klass)` (uses `BaseStrategy.register_type`).
123
+
124
+ ## Code Layout (lib/kotoshu/)
125
+
126
+ | Path | Responsibility |
127
+ |---|---|
128
+ | `kotoshu.rb` | Public facade + eager/autoload wiring |
129
+ | `spellchecker.rb`, `spellchecker/parallel_checker.rb` | Traditional check facade |
130
+ | `paths.rb` | XDG path resolution (cache, config, data, audit log, personal dict) |
131
+ | `resource_manager.rb`, `resource_bundle.rb` | Two-stage setup/resolve flow + result structs |
132
+ | `configuration.rb`, `configuration/{builder,resolver}.rb` | Config + priority resolution |
133
+ | `core/` | Domain models (`Word`, `AffixRule`, `result/*`), `IndexedDictionary`, `Trie/*`, `exceptions` |
134
+ | `dictionary/` | Backends: `base`, `hunspell`, `cspell`, `unix_words`, `plain_text`, `custom`, `unified`, `repository` |
135
+ | `readers/` | Parsers for Hunspell `.aff` / `.dic` (aff_data, aff_reader, dic_reader, condition_checker, lookup_builder) |
136
+ | `suggestions/` | `generator`, `context`, `suggestion{,_set}`, `pipeline`, `strategies/*` |
137
+ | `algorithms/` | Lower-level Hunspell-style suggestion primitives (ported from Spylls): `ngram_suggest`, `phonet_suggest`, `suggest`, `lookup`, `permutations`, `capitalization` |
138
+ | `analyzers/` | `semantic_analyzer` — the embedding-based checker |
139
+ | `models/` | `embedding_model` (abstract), `fasttext_model`, `onnx_model`, `word_embedding`, `nearest_neighbor`, `semantic_error`, `context`, `suggestion` |
140
+ | `embeddings/` | ONNX runtime glue: `onnx_runtime_model`, `vocabulary`, `similarity_engine`, `similarity_search`, `search`, `embedding_pipeline`, `protocols{,_registry}`, `lru_cache` |
141
+ | `cache/` | `base_cache`, `language_cache`, `model_cache`, `frequency_cache`, plus `lookup_cache` / `suggestion_cache` runtime caches |
142
+ | `language/`, `languages/` | Detection (`identifier`, `detector`), registry, per-language modules, tokenizers, normalizers |
143
+ | `documents/` | Document abstraction: `plain_text_document`, `markdown_document`, `asciidoc_document`, `location` |
144
+ | `cli/` | CLI helpers (interactive reviewer, batch reporter, navigation, display) |
145
+ | `commands/` | Thor subcommands: `check_command`, `cache_command`, `model_command` |
146
+ | `grammar/` | Rule engine + pattern matchers (`rule`, `rule_engine`, `rule_loader`, `pattern_matchers/*`) |
147
+ | `keyboard/` | Layout registry + per-layout files (qwerty, qwertz, azerty, jcuken, dvorak) |
148
+ | `components/`, `plugins/`, `data_structures/`, `results/`, `data/` | Tokenizer/POS/synthesizer components, plugin registry, bloom filter, result base, common-words loader |
149
+
150
+ The exe uses `Kotoshu::Cli::Cli` (in `cli.rb`), which registers `dict` → `DictCommand` and `cache` → `CacheCommand` as subcommands. A richer `Kotoshu::CheckCommand` exists in `commands/check_command.rb` (with `--interactive`, `--format sarif/json`, `--model`, `--language auto`) — check which one is actually wired before assuming a CLI flag exists.
151
+
152
+ ## Specs
153
+
154
+ Spec layout mirrors lib: `spec/kotoshu/...`, plus `spec/integration/`, `spec/integrational/`, `spec/performance/`, `spec/benchmark/`, `spec/properties/`, `spec/unit/`, `spec/hunspell_tests/` (Spylls-ported fixtures), `spec/fixtures/`, `spec/support/`.
155
+
156
+ Global rules that apply here (see `~/.claude/CLAUDE.md`): **no `double()` in specs** — use real instances or `Struct.new`; **no hand-rolled serialization** (`to_h`/`from_h` on models).
157
+
158
+ ## Reference Implementations (read-only, on disk)
159
+
160
+ When implementing features, study these alongside Kotoshu:
161
+
162
+ - `/Users/mulgogi/src/external/hunspell/` — morphological rules, affix processing, suggestions (C++ reference).
163
+ - `/Users/mulgogi/src/external/cspell/` — trie/DAFSA dictionaries, code-aware checking (TypeScript reference).
164
+ - `/Users/mulgogi/src/external/languagetool/` — rule-based grammar, multi-interface (library + HTTP), caching patterns (Java reference).
165
+ - Spylls (Python Hunspell port) — the algorithms in `algorithms/` and the fixtures in `spec/hunspell_tests/` derive from here.
166
+
167
+ ## Other Notes
168
+
169
+ - License is **BSD-2-Clause** (not MIT — the README's "License" section is wrong).
170
+ - RBS signatures live in `sig/kotoshu.rbs` (the `sig/kotoshu/` subdirectory is empty). Update signatures when changing public APIs.
171
+ - `scripts/` contains one-off utilities (FastText→ONNX conversion in Python, Kelly frequency parsing, diagnostics). `examples/` has numbered walkthrough scripts (`01_*.rb` … `07_*.rb`).
172
+ - Design history and superseded planning docs live in `docs/` (`architecture.md`, `cache-architecture.md`, `performance.md`, `plugins.md`, `getting-started.md`, plus integrated planning docs like `KOTOSHU_SOLIDIFICATION_PLAN.md`, `ARCHITECTURE_IMPROVEMENTS.md`, `TDD_ITERATION_STRATEGY.md`). Treat them as historical context; verify against current code before relying on them. `TODO.impl/` is the current source of truth for execution plans.
@@ -0,0 +1,132 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, caste, color, religion, or sexual
10
+ identity and orientation.
11
+
12
+ We pledge to act and interact in ways that contribute to an open, welcoming,
13
+ diverse, inclusive, and healthy community.
14
+
15
+ ## Our Standards
16
+
17
+ Examples of behavior that contributes to a positive environment for our
18
+ community include:
19
+
20
+ * Demonstrating empathy and kindness toward other people
21
+ * Being respectful of differing opinions, viewpoints, and experiences
22
+ * Giving and gracefully accepting constructive feedback
23
+ * Accepting responsibility and apologizing to those affected by our mistakes,
24
+ and learning from the experience
25
+ * Focusing on what is best not just for us as individuals, but for the overall
26
+ community
27
+
28
+ Examples of unacceptable behavior include:
29
+
30
+ * The use of sexualized language or imagery, and sexual attention or advances of
31
+ any kind
32
+ * Trolling, insulting or derogatory comments, and personal or political attacks
33
+ * Public or private harassment
34
+ * Publishing others' private information, such as a physical or email address,
35
+ without their explicit permission
36
+ * Other conduct which could reasonably be considered inappropriate in a
37
+ professional setting
38
+
39
+ ## Enforcement Responsibilities
40
+
41
+ Community leaders are responsible for clarifying and enforcing our standards of
42
+ acceptable behavior and will take appropriate and fair corrective action in
43
+ response to any behavior that they deem inappropriate, threatening, offensive,
44
+ or harmful.
45
+
46
+ Community leaders have the right and responsibility to remove, edit, or reject
47
+ comments, commits, code, wiki edits, issues, and other contributions that are
48
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
49
+ decisions when appropriate.
50
+
51
+ ## Scope
52
+
53
+ This Code of Conduct applies within all community spaces, and also applies when
54
+ an individual is officially representing the community in public spaces.
55
+ Examples of representing our community include using an official email address,
56
+ posting via an official social media account, or acting as an appointed
57
+ representative at an online or offline event.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported to the community leaders responsible for enforcement at
63
+ [INSERT CONTACT METHOD].
64
+ All complaints will be reviewed and investigated promptly and fairly.
65
+
66
+ All community leaders are obligated to respect the privacy and security of the
67
+ reporter of any incident.
68
+
69
+ ## Enforcement Guidelines
70
+
71
+ Community leaders will follow these Community Impact Guidelines in determining
72
+ the consequences for any action they deem in violation of this Code of Conduct:
73
+
74
+ ### 1. Correction
75
+
76
+ **Community Impact**: Use of inappropriate language or other behavior deemed
77
+ unprofessional or unwelcome in the community.
78
+
79
+ **Consequence**: A private, written warning from community leaders, providing
80
+ clarity around the nature of the violation and an explanation of why the
81
+ behavior was inappropriate. A public apology may be requested.
82
+
83
+ ### 2. Warning
84
+
85
+ **Community Impact**: A violation through a single incident or series of
86
+ actions.
87
+
88
+ **Consequence**: A warning with consequences for continued behavior. No
89
+ interaction with the people involved, including unsolicited interaction with
90
+ those enforcing the Code of Conduct, for a specified period of time. This
91
+ includes avoiding interactions in community spaces as well as external channels
92
+ like social media. Violating these terms may lead to a temporary or permanent
93
+ ban.
94
+
95
+ ### 3. Temporary Ban
96
+
97
+ **Community Impact**: A serious violation of community standards, including
98
+ sustained inappropriate behavior.
99
+
100
+ **Consequence**: A temporary ban from any sort of interaction or public
101
+ communication with the community for a specified period of time. No public or
102
+ private interaction with the people involved, including unsolicited interaction
103
+ with those enforcing the Code of Conduct, is allowed during this period.
104
+ Violating these terms may lead to a permanent ban.
105
+
106
+ ### 4. Permanent Ban
107
+
108
+ **Community Impact**: Demonstrating a pattern of violation of community
109
+ standards, including sustained inappropriate behavior, harassment of an
110
+ individual, or aggression toward or disparagement of classes of individuals.
111
+
112
+ **Consequence**: A permanent ban from any sort of public interaction within the
113
+ community.
114
+
115
+ ## Attribution
116
+
117
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118
+ version 2.1, available at
119
+ [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
120
+
121
+ Community Impact Guidelines were inspired by
122
+ [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
123
+
124
+ For answers to common questions about this code of conduct, see the FAQ at
125
+ [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
126
+ [https://www.contributor-covenant.org/translations][translations].
127
+
128
+ [homepage]: https://www.contributor-covenant.org
129
+ [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
130
+ [Mozilla CoC]: https://github.com/mozilla/diversity
131
+ [FAQ]: https://www.contributor-covenant.org/faq
132
+ [translations]: https://www.contributor-covenant.org/translations
data/LICENSE ADDED
@@ -0,0 +1,31 @@
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2025-2026, Kotoshu contributors
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ ------------------------------------------------------------------------------
28
+
29
+ Bundled dictionaries and frequency lists carry their own licenses — see the
30
+ per-language `license` files in https://github.com/kotoshu/dictionaries and
31
+ the attribution file at the root of that repository.