kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 496e00727251ee935375c0a8013132917384d1847fe99ed3752a414f3a73e5d6
|
|
4
|
+
data.tar.gz: 6998e4c7879ecca1888a1bedbfe41af5e5516c8a2497457f887c06f310f53678
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 9bafd7e06458c93a4a3b817fdddd50beac4a74cbcc22c3cf0de805dd1f2211d023c5f7dea2e187b1c0fdbe1772901c7e9c22d2956c3b9d887a2f20dbdb267b35
|
|
7
|
+
data.tar.gz: 4e37cee72f98cf5a171c1f167c52720213ab93d99919f50268bba998836233d09a3e4fb8b0d1756bacc52fb554e42541a7ba8918dba99067dc2da146dd5e3e51
|
data/.rspec
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
inherit_from:
|
|
2
|
+
- https://raw.githubusercontent.com/riboseinc/oss-guides/main/ci/rubocop.yml
|
|
3
|
+
- .rubocop_todo.yml
|
|
4
|
+
|
|
5
|
+
plugins:
|
|
6
|
+
- rubocop-performance
|
|
7
|
+
- rubocop-rake
|
|
8
|
+
- rubocop-rspec
|
|
9
|
+
|
|
10
|
+
AllCops:
|
|
11
|
+
TargetRubyVersion: 3.0
|
|
12
|
+
NewCops: enable
|
|
13
|
+
SuggestExtensions: false
|
|
14
|
+
Exclude:
|
|
15
|
+
- 'debug_*.rb'
|
|
16
|
+
- 'test_*.rb'
|
|
17
|
+
- 'vendor/**/*'
|
|
18
|
+
- 'tmp/**/*'
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.3.0] — 2026-06-27
|
|
11
|
+
|
|
12
|
+
The two-stage release. Resources are now downloaded explicitly via
|
|
13
|
+
`Kotoshu.setup(:en)`; the hot path (`correct?`, `suggest`, `check`) reads
|
|
14
|
+
only from cache and raises a typed error when a language is missing instead
|
|
15
|
+
of triggering a network download. The CLI adds `setup`, `status`, language
|
|
16
|
+
auto-detection, SARIF/JSON output, and an interactive auto-setup prompt.
|
|
17
|
+
`onnxruntime` is now a soft dependency, so `gem install kotoshu` succeeds on
|
|
18
|
+
hosts that can't load the native ONNX runtime.
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
|
|
22
|
+
- **Two-stage resource model** (`Kotoshu::ResourceManager`):
|
|
23
|
+
`Kotoshu.setup(:en, want: %i[spelling frequency model])` writes into the
|
|
24
|
+
cache; `Kotoshu::ResourceManager.resolve(language:, want:)` is instant and
|
|
25
|
+
cache-only, raising `ResourceNotSetupError` on miss. `Kotoshu.setup?` is
|
|
26
|
+
the predicate for "is this language already cached?". The library never
|
|
27
|
+
triggers a surprise download; the CLI prompts the user via `AutoSetup`.
|
|
28
|
+
- **`SourceRegistry`** — single source of truth for the three content repos'
|
|
29
|
+
URLs and per-repo pins. `kotoshu/dictionaries` is pinned to the `v1`
|
|
30
|
+
branch; `frequency-list-kelly` and `models-fasttext-onnx` are on `main`.
|
|
31
|
+
Override at runtime via `KOTOSHU_REPOS_BASE_URL`, `KOTOSHU_DICTIONARIES_PIN`,
|
|
32
|
+
`KOTOSHU_FREQUENCY_PIN`, `KOTOSHU_MODELS_PIN`.
|
|
33
|
+
- **XDG Base Directory layout** (`Kotoshu::Paths`): dictionaries, frequency
|
|
34
|
+
lists, ONNX models under `$XDG_CACHE_HOME/kotoshu/`; personal dictionary
|
|
35
|
+
and `kotoshu.cfg` under `$XDG_CONFIG_HOME/kotoshu/`; audit log under
|
|
36
|
+
`$XDG_DATA_HOME/kotoshu/audit.log`. Override per-axis with
|
|
37
|
+
`KOTOSHU_CACHE_PATH`, `KOTOSHU_CONFIG_PATH`, `KOTOSHU_DATA_PATH`.
|
|
38
|
+
- **Integrity verification** — `Kotoshu::Integrity::Manifest` (SHA-256) is
|
|
39
|
+
fetched per content repo and matched against every download. Mismatches
|
|
40
|
+
raise `Kotoshu::IntegrityError`. Outcomes (verified / unverified / mismatch)
|
|
41
|
+
are written to the audit log. Missing manifests degrade gracefully.
|
|
42
|
+
- **CLI `setup` command** — `kotoshu setup LANG [--force] [--no-frequency]
|
|
43
|
+
[--no-model]` writes the requested resources into the cache with progress
|
|
44
|
+
reporting.
|
|
45
|
+
- **CLI `status` command** — `kotoshu status [--json]` summarises installed
|
|
46
|
+
resources, sizes, mtimes, and ONNX runtime availability.
|
|
47
|
+
- **CLI `check --language auto`** — auto-detects document language via
|
|
48
|
+
FastText LID; falls back to the configured default language when detection
|
|
49
|
+
is unavailable or the detected language is not set up.
|
|
50
|
+
- **CLI `check --format json|sarif`** — machine-readable output. SARIF
|
|
51
|
+
follows v2.1.0 with the `kotoshu/spelling` rule id; JSON exposes
|
|
52
|
+
`success`/`wordCount`/`errorCount`/`uniqueErrorCount`/`errors`/`source`.
|
|
53
|
+
- **CLI auto-setup prompt** — when the hot path raises
|
|
54
|
+
`ResourceNotSetupError` in an interactive session, the user is prompted to
|
|
55
|
+
run setup now and the original command is retried on success. Non-TTY,
|
|
56
|
+
offline (`--offline`), and `--no-prompt` invocations skip the prompt and
|
|
57
|
+
surface the error as before.
|
|
58
|
+
- **Download progress reporting** (`Kotoshu::Cli::ProgressReporter`) — TTY
|
|
59
|
+
mode renders a determinate/indeterminate progress bar; non-TTY mode prints
|
|
60
|
+
a periodic line every 10 MiB. `Kotoshu.configuration.download_reporter=`
|
|
61
|
+
exposes the reporter for programmatic use.
|
|
62
|
+
- **End-to-end smoke spec** (`spec/integration/end_to_end_spec.rb`) covers
|
|
63
|
+
install → setup → `correct?` → `suggest.to_words` → `check` →
|
|
64
|
+
`setup?` predicate → `ResourceNotSetupError` → idempotent re-setup.
|
|
65
|
+
Tagged `:network`, opted into via `NETWORK_TESTS=1`.
|
|
66
|
+
- **CLI format spec** (`spec/kotoshu/cli/check_format_spec.rb`) shells out to
|
|
67
|
+
the real `kotoshu` CLI and asserts JSON / SARIF structure and exit codes.
|
|
68
|
+
|
|
69
|
+
### Changed
|
|
70
|
+
|
|
71
|
+
- **`onnxruntime` is a soft dependency.** Removed from `kotoshu.gemspec`.
|
|
72
|
+
`Kotoshu::Models::OnnxModel` soft-requires it at load time and exposes
|
|
73
|
+
`ONNX_LOADED`. When false, semantic methods raise
|
|
74
|
+
`Kotoshu::Models::OnnxModel::OnnxUnavailable` with a caller-friendly
|
|
75
|
+
message. `KOTOSHU_NO_ONNX=1` forces semantic off even when the gem is
|
|
76
|
+
present. The traditional spell-checking path never touches `onnxruntime`.
|
|
77
|
+
- **Loading strategy** — `lib/kotoshu.rb` eagerly loads only the facade
|
|
78
|
+
dependencies; heavier or optional pieces (ONNX models, interactive CLI,
|
|
79
|
+
caches, language detection) are wired through Ruby `autoload` registered
|
|
80
|
+
in their immediate parent namespace.
|
|
81
|
+
- **Public API** — `suggest` returns a `SuggestionSet`; call `.to_words` for
|
|
82
|
+
an `Array<String>`. `Kotoshu.check` returns a `DocumentResult`; iterate
|
|
83
|
+
`errors` for `WordResult` instances with `word`, `position`, `line`,
|
|
84
|
+
`column`, `suggestions`.
|
|
85
|
+
- **README quickstart** — reflects the two-stage API; documents XDG paths;
|
|
86
|
+
marks `onnxruntime` as optional.
|
|
87
|
+
|
|
88
|
+
### Fixed
|
|
89
|
+
|
|
90
|
+
- `gem install kotoshu` no longer requires `onnxruntime` or its native
|
|
91
|
+
toolchain.
|
|
92
|
+
- Resource resolution no longer triggers downloads from inside the hot path.
|
|
93
|
+
- Per-repo pins are honoured — the `v1` branch of `kotoshu/dictionaries` is
|
|
94
|
+
fetched instead of `main`.
|
|
95
|
+
|
|
96
|
+
### Known limitations (carried from 0.1.0, scope reduced)
|
|
97
|
+
|
|
98
|
+
- **Hunspell correctness**: compound rules, circumfix, ICONV/OCONV, German ß,
|
|
99
|
+
Turkish dotless-i remain partial. See `TODO.impl/01-hunspell-correctness.md`.
|
|
100
|
+
- **CJK and RTL**: tokenizer, normalizer, and keyboard layouts exist for
|
|
101
|
+
supported languages; full CJK/RTL support deferred past 0.3.
|
|
102
|
+
See `TODO.impl/06-cjk-support.md` and `TODO.impl/07-rtl-support.md`.
|
|
103
|
+
- **Grammar rules**: the rule engine exists; no rule packs are shipped.
|
|
104
|
+
See `TODO.impl/08-grammar-engine.md`.
|
|
105
|
+
- **Audit log rotation, cache eviction policy, and shell completion** are
|
|
106
|
+
deferred past 0.3 (T3 TODOs).
|
|
107
|
+
|
|
108
|
+
### Internal
|
|
109
|
+
|
|
110
|
+
- 9 logical commits on `release-0.3` cover the T1 (architectural) and T2
|
|
111
|
+
(user-facing) work for this release.
|
|
112
|
+
- `SourceRegistry`, `Paths`, `ResourceManager`, `ResourceBundle`,
|
|
113
|
+
`SetupResult`, `Integrity::Manifest`, `Integrity::AuditLog`,
|
|
114
|
+
`Cli::AutoSetup`, `Cli::StatusReport`, `Cli::LanguageResolver`,
|
|
115
|
+
`Cli::ProgressReporter` are new model-driven types.
|
|
116
|
+
- 73 new specs added (source_registry, end_to_end, check_format,
|
|
117
|
+
progress_reporter, language_resolver, status_report, auto_setup).
|
|
118
|
+
|
|
119
|
+
### Contributors
|
|
120
|
+
|
|
121
|
+
- Ribose Inc.
|
|
122
|
+
|
|
123
|
+
## [0.1.0] — 2026-06-25
|
|
124
|
+
|
|
125
|
+
First public release. Kotoshu is a pure-Ruby spellchecker that combines a
|
|
126
|
+
Ruby port of the Hunspell algorithm with optional FastText ONNX embeddings
|
|
127
|
+
for semantic reranking. This release establishes the public Ruby API, the
|
|
128
|
+
basic CLI, and the cache layer.
|
|
129
|
+
|
|
130
|
+
### Working
|
|
131
|
+
|
|
132
|
+
- **Ruby API**: `Kotoshu.correct?`, `Kotoshu.suggest`, `Kotoshu.check`,
|
|
133
|
+
`Kotoshu.check_file`, `Kotoshu.detect_language`
|
|
134
|
+
- **CLI**: `kotoshu check TARGET`, `kotoshu dict SUBCOMMAND`, `kotoshu cache
|
|
135
|
+
SUBCOMMAND`, `kotoshu version`
|
|
136
|
+
- **Dictionary backends**: Hunspell (`.aff`/`.dic`), CSpell, UnixWords
|
|
137
|
+
(`/usr/share/dict/words`), PlainText, Custom
|
|
138
|
+
- **Suggestion strategies**: edit distance, phonetic (Phonet), keyboard
|
|
139
|
+
proximity, n-gram, symspell, composite pipeline
|
|
140
|
+
- **Configuration**: `Kotoshu.configure`, CLI > ENV (`KOTOSHU_*`) >
|
|
141
|
+
programmatic > defaults via `Configuration::Resolver`
|
|
142
|
+
- **Cache layer**: `LanguageCache`, `FrequencyCache`, `ModelCache` with TTLs
|
|
143
|
+
and download from `kotoshu/dictionaries`, `kotoshu/frequency-list-kelly`,
|
|
144
|
+
`kotoshu/models-fasttext-onnx`
|
|
145
|
+
- **Language detection**: FastText LID, 127 languages
|
|
146
|
+
- **Documents**: Plain text, Markdown (Kramdown), AsciiDoc (Asciidoctor)
|
|
147
|
+
- **Test suite**: 803 of 866 examples passing (92.7%), 6 pending
|
|
148
|
+
|
|
149
|
+
### Known limitations (not blocking 0.1)
|
|
150
|
+
|
|
151
|
+
- **Hunspell correctness**: compound rules, circumfix, ICONV/OCONV, German
|
|
152
|
+
ß, Turkish dotless-i are not fully implemented. Single-word lookup and
|
|
153
|
+
basic affixes work. See `TODO.impl/01-hunspell-correctness.md`.
|
|
154
|
+
- **CLI surface**: `--interactive`, `--format sarif|json|yaml|csv`,
|
|
155
|
+
`--model fasttext|hybrid`, `--language auto` exist in
|
|
156
|
+
`lib/kotoshu/commands/check_command.rb` but are not wired through
|
|
157
|
+
`exe/kotoshu`. See `TODO.impl/02-cli-unification.md`.
|
|
158
|
+
- **Semantic path**: gated behind `ENV['KOTOSHU_REQUIRE_ONNX']` because
|
|
159
|
+
`onnxruntime` loads eagerly otherwise. Hybrid mode is not the default.
|
|
160
|
+
See `TODO.impl/05-semantic-path.md`.
|
|
161
|
+
- **Dynamic resolution**: the three caches exist independently; there is
|
|
162
|
+
no unified `ResourceManager` that takes arbitrary text and yields the
|
|
163
|
+
full resource bundle. See `TODO.impl/03-dynamic-download.md`.
|
|
164
|
+
- **Languages**: code is wired for English by default. The
|
|
165
|
+
`dictionaries` repo has 98 language directories but the gem's
|
|
166
|
+
`lib/kotoshu/languages/` has only 7 modules (de, en, es, fr, ja, pt,
|
|
167
|
+
ru). See `TODO.impl/04-language-modules.md`.
|
|
168
|
+
- **CJK, RTL**: not implemented. See `TODO.impl/06-cjk-support.md`
|
|
169
|
+
and `TODO.impl/07-rtl-support.md`.
|
|
170
|
+
- **Grammar rules**: the rule engine exists; no rule packs are shipped.
|
|
171
|
+
See `TODO.impl/08-grammar-engine.md`.
|
|
172
|
+
- **Integrity verification**: downloaded resources are not currently
|
|
173
|
+
checksummed. See `TODO.impl/09-integrity-security.md`.
|
|
174
|
+
|
|
175
|
+
### Internal
|
|
176
|
+
|
|
177
|
+
- 12 plans under `TODO.impl/` define the path to 1.0
|
|
178
|
+
- Architecture documentation consolidated under `docs/`
|
|
179
|
+
|
|
180
|
+
### Contributors
|
|
181
|
+
|
|
182
|
+
- Ribose Inc.
|
data/CLAUDE.md
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## What Kotoshu Is
|
|
6
|
+
|
|
7
|
+
Kotoshu 「言修」 is a **semantic** spell checker for Ruby. It pairs a traditional
|
|
8
|
+
dictionary/affix backend (Hunspell-style) with ONNX-converted FastText word
|
|
9
|
+
embeddings for context-aware suggestions. The README.adoc is the authoritative
|
|
10
|
+
user-facing description; this file is the contributor-facing map.
|
|
11
|
+
|
|
12
|
+
Key dependencies (`kotoshu.gemspec`): `thor` (CLI), `suika` (tokenizer),
|
|
13
|
+
`onnxruntime` (semantic inference; optional soft dependency that is soft-required at load time and NOT declared in the gemspec). Ruby 3.1+.
|
|
14
|
+
|
|
15
|
+
## Development Commands
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
bundle exec rspec # Run the full test suite
|
|
19
|
+
bundle exec rspec spec/path/to_spec.rb # Run one file
|
|
20
|
+
bundle exec rspec -e "matches a word" # Run examples matching a name
|
|
21
|
+
bundle exec rspec --only-failures # Rerun just failing examples (uses .rspec_status)
|
|
22
|
+
|
|
23
|
+
NETWORK_TESTS=1 bundle exec rspec # Opt INTO tests that download dictionaries
|
|
24
|
+
bundle exec rubocop # Lint
|
|
25
|
+
bundle exec rubocop -A # Lint with safe auto-fix
|
|
26
|
+
bundle exec rake # default task = spec + rubocop
|
|
27
|
+
|
|
28
|
+
bundle exec bin/console # IRB with Kotoshu loaded
|
|
29
|
+
bundle exec exe/kotoshu check FILE # Run the CLI locally
|
|
30
|
+
gem build kotoshu.gemspec && gem install kotoshu-*.gem
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Notes that aren't obvious from the Rakefile:
|
|
34
|
+
- `spec/spec_helper.rb` excludes anything tagged `:network` unless `NETWORK_TESTS=1` is set — those specs download dictionaries from GitHub and are slow/flaky.
|
|
35
|
+
- SimpleCov runs on every `rspec` invocation (configured in `spec_helper.rb`).
|
|
36
|
+
- `spec/spylls_test_helper.rb` is mixed into every spec. It ports Hunspell's reference test fixtures from [Spylls](https://github.com/zverok/spylls) (the Python Hunspell port); many specs assert behavior against those fixtures.
|
|
37
|
+
|
|
38
|
+
## Architecture
|
|
39
|
+
|
|
40
|
+
Kotoshu has **two parallel checking paths** that share infrastructure:
|
|
41
|
+
|
|
42
|
+
1. **Traditional path** — `Kotoshu::Spellchecker` (facade) → `Suggestions::Generator` → pluggable `Dictionary::*` backends + `Suggestions::Strategies::*` algorithms. This is what `Kotoshu.correct?` / `Kotoshu.suggest` / `Kotoshu.check` use.
|
|
43
|
+
2. **Semantic path** — `Analyzers::SemanticAnalyzer` driven by an `Models::EmbeddingModel` (`FastTextModel` or `OnnxModel`). Used for context-aware reranking and OOV handling. This path is **opt-in** and only loads when needed.
|
|
44
|
+
|
|
45
|
+
### Layer map
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
exe/kotoshu ─► lib/kotoshu/cli.rb (Kotoshu::Cli::Cli < Thor)
|
|
49
|
+
subcommands: check, setup, status, dict (DictCommand), cache (CacheCommand)
|
|
50
|
+
helpers: cli/interactive_reviewer, cli/batch_reporter,
|
|
51
|
+
cli/navigation_manager, cli/display_formatter
|
|
52
|
+
|
|
53
|
+
Kotoshu module (lib/kotoshu.rb) ─► public facade methods
|
|
54
|
+
.correct? .suggest .check .check_file .detect_language ...
|
|
55
|
+
all delegate to a singleton Spellchecker
|
|
56
|
+
|
|
57
|
+
Spellchecker ─► Configuration ─► Dictionary::Repository ─► Dictionary::*
|
|
58
|
+
│
|
|
59
|
+
└─► Suggestions::Generator
|
|
60
|
+
└─► Strategies::CompositeStrategy
|
|
61
|
+
(edit_distance, phonetic,
|
|
62
|
+
keyboard_proximity, ngram,
|
|
63
|
+
symspell, semantic)
|
|
64
|
+
|
|
65
|
+
SemanticAnalyzer ─► Models::OnnxModel | Models::FastTextModel
|
|
66
|
+
└─► Embeddings::* (vocabulary, similarity search, LRU cache)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Loading strategy
|
|
70
|
+
|
|
71
|
+
`lib/kotoshu.rb` eagerly `require_relative`s the traditional path (core models, dictionaries, strategies, configuration, spellchecker) and `autoload`s the heavier / optional pieces (ONNX models, documents, interactive CLI, caches, language detection, debug/metrics). When adding a new top-level component, follow the existing split: eager-load only what the facade needs at boot; autoload the rest.
|
|
72
|
+
|
|
73
|
+
**ONNX is a soft dependency.** `onnxruntime` is NOT in `kotoshu.gemspec` — `gem install kotoshu` succeeds without it. `Models::OnnxModel` soft-requires it at load time and exposes `ONNX_LOADED` (true/false). When false, semantic methods raise `Models::OnnxModel::OnnxUnavailable` with a caller-friendly message. `KOTOSHU_NO_ONNX=1` forces semantic off even when the gem is present. The traditional spell-checking path never touches onnxruntime.
|
|
74
|
+
|
|
75
|
+
### Resource lifecycle — two-stage model
|
|
76
|
+
|
|
77
|
+
Resources (dictionaries, frequency lists, ONNX models) flow through a strict two-stage API in `ResourceManager`:
|
|
78
|
+
|
|
79
|
+
1. **Setup** (`Kotoshu.setup(:en, want: %i[spelling frequency model])`, or `Kotoshu.setup(:en, aff:, dic:)` / `from:` for local sources). Slow, network-required, explicit. Writes into the cache.
|
|
80
|
+
2. **Resolve** (`Kotoshu::ResourceManager.resolve(language:, want:)`). Instant, cache-only, raises `ResourceNotSetupError` on miss.
|
|
81
|
+
|
|
82
|
+
The hot path (`Kotoshu.correct?`, `.check`, `.suggest`, `.spellchecker_for`) calls `resolve` and lets the error propagate — **setup is never implicit**. This is intentional: users on metered networks or air-gapped hosts must not get a surprise download. `Kotoshu.setup?(:en, resource: :spelling|:frequency|:model)` is the predicate for "is this already in cache?".
|
|
83
|
+
|
|
84
|
+
`ResourceBundle` (the resolve result) carries `dictionary`, `frequency`, `model`, and `rules`. `SetupResult` (the setup result) reports per-resource status (`:downloaded | :local | :cached | :unavailable`).
|
|
85
|
+
|
|
86
|
+
### Paths — XDG Base Directory
|
|
87
|
+
|
|
88
|
+
All on-disk locations are resolved through `Kotoshu::Paths`, which honors `XDG_CACHE_HOME`, `XDG_CONFIG_HOME`, `XDG_DATA_HOME` and the override envs `KOTOSHU_CACHE_PATH`, `KOTOSHU_CONFIG_PATH`, `KOTOSHU_DATA_PATH`. Defaults:
|
|
89
|
+
|
|
90
|
+
| Concern | Default path |
|
|
91
|
+
|---|---|
|
|
92
|
+
| Language dictionaries, frequency lists, ONNX models | `~/.cache/kotoshu/` |
|
|
93
|
+
| Personal dictionary, kotoshu.cfg | `~/.config/kotoshu/` |
|
|
94
|
+
| Audit log | `~/.local/share/kotoshu/audit.log` |
|
|
95
|
+
|
|
96
|
+
### Resource caching
|
|
97
|
+
|
|
98
|
+
Three caches under `~/.cache/kotoshu/` (see `CACHE_ARCHITECTURE.md` for detail, README.adoc for the user-facing version):
|
|
99
|
+
|
|
100
|
+
| Cache | Path | Source | TTL |
|
|
101
|
+
|---|---|---|---|
|
|
102
|
+
| `Cache::LanguageCache` | `~/.cache/kotoshu/languages/{code}/spelling/` | `github.com/kotoshu/dictionaries` | 7 days |
|
|
103
|
+
| `Cache::FrequencyCache` | `~/.cache/kotoshu/frequency-lists/{code}/` | `github.com/kotoshu/frequency-list-kelly` | 7 days |
|
|
104
|
+
| `Cache::ModelCache` | `~/.cache/kotoshu/models/{code}/...` | `github.com/kotoshu/models-fasttext-onnx` (FastText `.vec` → ONNX converted upstream) | 30 days |
|
|
105
|
+
|
|
106
|
+
`FrequencyCache` feeds `frequency_bonus` in `Suggestions::Strategies::EditDistanceStrategy` — high-frequency words get a ranking boost. The `kotoshu cache` subcommand exposes list/status/download/info/purge/clean operations.
|
|
107
|
+
|
|
108
|
+
### Configuration
|
|
109
|
+
|
|
110
|
+
`Configuration` (singleton via `.instance`) is built from a `SCHEMA` hash that declares each option's ENV var, default, type, and description. The `Configuration::Resolver` enforces the priority chain: **CLI flags > ENV (`KOTOSHU_*`) > programmatic > defaults**. When adding a config option, add it to `SCHEMA` (and probably `DEFAULTS`) rather than sprinkling `attr_accessor`s — that's how it picks up ENV support automatically.
|
|
111
|
+
|
|
112
|
+
`dictionary_type` selects the backend: `:unix_words | :plain_text | :custom | :hunspell | :cspell`. The dictionary is lazy-loaded through `Configuration#dictionary` (cached on the instance; call `reset_dictionary` to reload).
|
|
113
|
+
|
|
114
|
+
### Language support
|
|
115
|
+
|
|
116
|
+
Full features (dictionary + affixes + FastText + ONNX + keyboard layout): `de, en, es, fr, pt, ru`.
|
|
117
|
+
Kelly frequency only: `ar, zh, el, it, no, sv` (and `ru`).
|
|
118
|
+
`Language::Identifier` does automatic detection (FastText LID model, 127 languages). Per-language behavior (tokenizer, normalizer) lives in `languages/{code}/language.rb` and `language/tokenizer/*`. Keyboard layouts (`keyboard/layouts/*`) feed `KeyboardProximityStrategy`.
|
|
119
|
+
|
|
120
|
+
### Suggestion strategies
|
|
121
|
+
|
|
122
|
+
`Suggestions::Generator::DEFAULT_ALGORITHMS` = `[EditDistanceStrategy, PhoneticStrategy, KeyboardProximityStrategy, NgramStrategy]`, composed via `Strategies::CompositeStrategy`. Also available: `SymspellStrategy`, `SemanticStrategy`. Register new algorithms via `Kotoshu.register_suggestion_algorithm(:name, Klass)` (uses `BaseStrategy.register_type`).
|
|
123
|
+
|
|
124
|
+
## Code Layout (lib/kotoshu/)
|
|
125
|
+
|
|
126
|
+
| Path | Responsibility |
|
|
127
|
+
|---|---|
|
|
128
|
+
| `kotoshu.rb` | Public facade + eager/autoload wiring |
|
|
129
|
+
| `spellchecker.rb`, `spellchecker/parallel_checker.rb` | Traditional check facade |
|
|
130
|
+
| `paths.rb` | XDG path resolution (cache, config, data, audit log, personal dict) |
|
|
131
|
+
| `resource_manager.rb`, `resource_bundle.rb` | Two-stage setup/resolve flow + result structs |
|
|
132
|
+
| `configuration.rb`, `configuration/{builder,resolver}.rb` | Config + priority resolution |
|
|
133
|
+
| `core/` | Domain models (`Word`, `AffixRule`, `result/*`), `IndexedDictionary`, `Trie/*`, `exceptions` |
|
|
134
|
+
| `dictionary/` | Backends: `base`, `hunspell`, `cspell`, `unix_words`, `plain_text`, `custom`, `unified`, `repository` |
|
|
135
|
+
| `readers/` | Parsers for Hunspell `.aff` / `.dic` (aff_data, aff_reader, dic_reader, condition_checker, lookup_builder) |
|
|
136
|
+
| `suggestions/` | `generator`, `context`, `suggestion{,_set}`, `pipeline`, `strategies/*` |
|
|
137
|
+
| `algorithms/` | Lower-level Hunspell-style suggestion primitives (ported from Spylls): `ngram_suggest`, `phonet_suggest`, `suggest`, `lookup`, `permutations`, `capitalization` |
|
|
138
|
+
| `analyzers/` | `semantic_analyzer` — the embedding-based checker |
|
|
139
|
+
| `models/` | `embedding_model` (abstract), `fasttext_model`, `onnx_model`, `word_embedding`, `nearest_neighbor`, `semantic_error`, `context`, `suggestion` |
|
|
140
|
+
| `embeddings/` | ONNX runtime glue: `onnx_runtime_model`, `vocabulary`, `similarity_engine`, `similarity_search`, `search`, `embedding_pipeline`, `protocols{,_registry}`, `lru_cache` |
|
|
141
|
+
| `cache/` | `base_cache`, `language_cache`, `model_cache`, `frequency_cache`, plus `lookup_cache` / `suggestion_cache` runtime caches |
|
|
142
|
+
| `language/`, `languages/` | Detection (`identifier`, `detector`), registry, per-language modules, tokenizers, normalizers |
|
|
143
|
+
| `documents/` | Document abstraction: `plain_text_document`, `markdown_document`, `asciidoc_document`, `location` |
|
|
144
|
+
| `cli/` | CLI helpers (interactive reviewer, batch reporter, navigation, display) |
|
|
145
|
+
| `commands/` | Thor subcommands: `check_command`, `cache_command`, `model_command` |
|
|
146
|
+
| `grammar/` | Rule engine + pattern matchers (`rule`, `rule_engine`, `rule_loader`, `pattern_matchers/*`) |
|
|
147
|
+
| `keyboard/` | Layout registry + per-layout files (qwerty, qwertz, azerty, jcuken, dvorak) |
|
|
148
|
+
| `components/`, `plugins/`, `data_structures/`, `results/`, `data/` | Tokenizer/POS/synthesizer components, plugin registry, bloom filter, result base, common-words loader |
|
|
149
|
+
|
|
150
|
+
The exe uses `Kotoshu::Cli::Cli` (in `cli.rb`), which registers `dict` → `DictCommand` and `cache` → `CacheCommand` as subcommands. A richer `Kotoshu::CheckCommand` exists in `commands/check_command.rb` (with `--interactive`, `--format sarif/json`, `--model`, `--language auto`) — check which one is actually wired before assuming a CLI flag exists.
|
|
151
|
+
|
|
152
|
+
## Specs
|
|
153
|
+
|
|
154
|
+
Spec layout mirrors lib: `spec/kotoshu/...`, plus `spec/integration/`, `spec/integrational/`, `spec/performance/`, `spec/benchmark/`, `spec/properties/`, `spec/unit/`, `spec/hunspell_tests/` (Spylls-ported fixtures), `spec/fixtures/`, `spec/support/`.
|
|
155
|
+
|
|
156
|
+
Global rules that apply here (see `~/.claude/CLAUDE.md`): **no `double()` in specs** — use real instances or `Struct.new`; **no hand-rolled serialization** (`to_h`/`from_h` on models).
|
|
157
|
+
|
|
158
|
+
## Reference Implementations (read-only, on disk)
|
|
159
|
+
|
|
160
|
+
When implementing features, study these alongside Kotoshu:
|
|
161
|
+
|
|
162
|
+
- `/Users/mulgogi/src/external/hunspell/` — morphological rules, affix processing, suggestions (C++ reference).
|
|
163
|
+
- `/Users/mulgogi/src/external/cspell/` — trie/DAFSA dictionaries, code-aware checking (TypeScript reference).
|
|
164
|
+
- `/Users/mulgogi/src/external/languagetool/` — rule-based grammar, multi-interface (library + HTTP), caching patterns (Java reference).
|
|
165
|
+
- Spylls (Python Hunspell port) — the algorithms in `algorithms/` and the fixtures in `spec/hunspell_tests/` derive from here.
|
|
166
|
+
|
|
167
|
+
## Other Notes
|
|
168
|
+
|
|
169
|
+
- License is **BSD-2-Clause** (not MIT — the README's "License" section is wrong).
|
|
170
|
+
- RBS signatures live in `sig/kotoshu.rbs` (the `sig/kotoshu/` subdirectory is empty). Update signatures when changing public APIs.
|
|
171
|
+
- `scripts/` contains one-off utilities (FastText→ONNX conversion in Python, Kelly frequency parsing, diagnostics). `examples/` has numbered walkthrough scripts (`01_*.rb` … `07_*.rb`).
|
|
172
|
+
- Design history and superseded planning docs live in `docs/` (`architecture.md`, `cache-architecture.md`, `performance.md`, `plugins.md`, `getting-started.md`, plus integrated planning docs like `KOTOSHU_SOLIDIFICATION_PLAN.md`, `ARCHITECTURE_IMPROVEMENTS.md`, `TDD_ITERATION_STRATEGY.md`). Treat them as historical context; verify against current code before relying on them. `TODO.impl/` is the current source of truth for execution plans.
|
data/CODE_OF_CONDUCT.md
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
|
2
|
+
|
|
3
|
+
## Our Pledge
|
|
4
|
+
|
|
5
|
+
We as members, contributors, and leaders pledge to make participation in our
|
|
6
|
+
community a harassment-free experience for everyone, regardless of age, body
|
|
7
|
+
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
|
8
|
+
identity and expression, level of experience, education, socio-economic status,
|
|
9
|
+
nationality, personal appearance, race, caste, color, religion, or sexual
|
|
10
|
+
identity and orientation.
|
|
11
|
+
|
|
12
|
+
We pledge to act and interact in ways that contribute to an open, welcoming,
|
|
13
|
+
diverse, inclusive, and healthy community.
|
|
14
|
+
|
|
15
|
+
## Our Standards
|
|
16
|
+
|
|
17
|
+
Examples of behavior that contributes to a positive environment for our
|
|
18
|
+
community include:
|
|
19
|
+
|
|
20
|
+
* Demonstrating empathy and kindness toward other people
|
|
21
|
+
* Being respectful of differing opinions, viewpoints, and experiences
|
|
22
|
+
* Giving and gracefully accepting constructive feedback
|
|
23
|
+
* Accepting responsibility and apologizing to those affected by our mistakes,
|
|
24
|
+
and learning from the experience
|
|
25
|
+
* Focusing on what is best not just for us as individuals, but for the overall
|
|
26
|
+
community
|
|
27
|
+
|
|
28
|
+
Examples of unacceptable behavior include:
|
|
29
|
+
|
|
30
|
+
* The use of sexualized language or imagery, and sexual attention or advances of
|
|
31
|
+
any kind
|
|
32
|
+
* Trolling, insulting or derogatory comments, and personal or political attacks
|
|
33
|
+
* Public or private harassment
|
|
34
|
+
* Publishing others' private information, such as a physical or email address,
|
|
35
|
+
without their explicit permission
|
|
36
|
+
* Other conduct which could reasonably be considered inappropriate in a
|
|
37
|
+
professional setting
|
|
38
|
+
|
|
39
|
+
## Enforcement Responsibilities
|
|
40
|
+
|
|
41
|
+
Community leaders are responsible for clarifying and enforcing our standards of
|
|
42
|
+
acceptable behavior and will take appropriate and fair corrective action in
|
|
43
|
+
response to any behavior that they deem inappropriate, threatening, offensive,
|
|
44
|
+
or harmful.
|
|
45
|
+
|
|
46
|
+
Community leaders have the right and responsibility to remove, edit, or reject
|
|
47
|
+
comments, commits, code, wiki edits, issues, and other contributions that are
|
|
48
|
+
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
|
49
|
+
decisions when appropriate.
|
|
50
|
+
|
|
51
|
+
## Scope
|
|
52
|
+
|
|
53
|
+
This Code of Conduct applies within all community spaces, and also applies when
|
|
54
|
+
an individual is officially representing the community in public spaces.
|
|
55
|
+
Examples of representing our community include using an official email address,
|
|
56
|
+
posting via an official social media account, or acting as an appointed
|
|
57
|
+
representative at an online or offline event.
|
|
58
|
+
|
|
59
|
+
## Enforcement
|
|
60
|
+
|
|
61
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
|
62
|
+
reported to the community leaders responsible for enforcement at
|
|
63
|
+
[INSERT CONTACT METHOD].
|
|
64
|
+
All complaints will be reviewed and investigated promptly and fairly.
|
|
65
|
+
|
|
66
|
+
All community leaders are obligated to respect the privacy and security of the
|
|
67
|
+
reporter of any incident.
|
|
68
|
+
|
|
69
|
+
## Enforcement Guidelines
|
|
70
|
+
|
|
71
|
+
Community leaders will follow these Community Impact Guidelines in determining
|
|
72
|
+
the consequences for any action they deem in violation of this Code of Conduct:
|
|
73
|
+
|
|
74
|
+
### 1. Correction
|
|
75
|
+
|
|
76
|
+
**Community Impact**: Use of inappropriate language or other behavior deemed
|
|
77
|
+
unprofessional or unwelcome in the community.
|
|
78
|
+
|
|
79
|
+
**Consequence**: A private, written warning from community leaders, providing
|
|
80
|
+
clarity around the nature of the violation and an explanation of why the
|
|
81
|
+
behavior was inappropriate. A public apology may be requested.
|
|
82
|
+
|
|
83
|
+
### 2. Warning
|
|
84
|
+
|
|
85
|
+
**Community Impact**: A violation through a single incident or series of
|
|
86
|
+
actions.
|
|
87
|
+
|
|
88
|
+
**Consequence**: A warning with consequences for continued behavior. No
|
|
89
|
+
interaction with the people involved, including unsolicited interaction with
|
|
90
|
+
those enforcing the Code of Conduct, for a specified period of time. This
|
|
91
|
+
includes avoiding interactions in community spaces as well as external channels
|
|
92
|
+
like social media. Violating these terms may lead to a temporary or permanent
|
|
93
|
+
ban.
|
|
94
|
+
|
|
95
|
+
### 3. Temporary Ban
|
|
96
|
+
|
|
97
|
+
**Community Impact**: A serious violation of community standards, including
|
|
98
|
+
sustained inappropriate behavior.
|
|
99
|
+
|
|
100
|
+
**Consequence**: A temporary ban from any sort of interaction or public
|
|
101
|
+
communication with the community for a specified period of time. No public or
|
|
102
|
+
private interaction with the people involved, including unsolicited interaction
|
|
103
|
+
with those enforcing the Code of Conduct, is allowed during this period.
|
|
104
|
+
Violating these terms may lead to a permanent ban.
|
|
105
|
+
|
|
106
|
+
### 4. Permanent Ban
|
|
107
|
+
|
|
108
|
+
**Community Impact**: Demonstrating a pattern of violation of community
|
|
109
|
+
standards, including sustained inappropriate behavior, harassment of an
|
|
110
|
+
individual, or aggression toward or disparagement of classes of individuals.
|
|
111
|
+
|
|
112
|
+
**Consequence**: A permanent ban from any sort of public interaction within the
|
|
113
|
+
community.
|
|
114
|
+
|
|
115
|
+
## Attribution
|
|
116
|
+
|
|
117
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
|
118
|
+
version 2.1, available at
|
|
119
|
+
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
|
|
120
|
+
|
|
121
|
+
Community Impact Guidelines were inspired by
|
|
122
|
+
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
|
|
123
|
+
|
|
124
|
+
For answers to common questions about this code of conduct, see the FAQ at
|
|
125
|
+
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
|
|
126
|
+
[https://www.contributor-covenant.org/translations][translations].
|
|
127
|
+
|
|
128
|
+
[homepage]: https://www.contributor-covenant.org
|
|
129
|
+
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
|
|
130
|
+
[Mozilla CoC]: https://github.com/mozilla/diversity
|
|
131
|
+
[FAQ]: https://www.contributor-covenant.org/faq
|
|
132
|
+
[translations]: https://www.contributor-covenant.org/translations
|
data/LICENSE
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
BSD 2-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025-2026, Kotoshu contributors
|
|
4
|
+
All rights reserved.
|
|
5
|
+
|
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
|
8
|
+
|
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
10
|
+
list of conditions and the following disclaimer.
|
|
11
|
+
|
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
and/or other materials provided with the distribution.
|
|
15
|
+
|
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
17
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
18
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
20
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
21
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
22
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
23
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
24
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
25
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
+
|
|
27
|
+
------------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
Bundled dictionaries and frequency lists carry their own licenses — see the
|
|
30
|
+
per-language `license` files in https://github.com/kotoshu/dictionaries and
|
|
31
|
+
the attribution file at the root of that repository.
|