kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
data/Rakefile
ADDED
data/SECURITY.md
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# Security Policy
|
|
2
|
+
|
|
3
|
+
## Reporting a Vulnerability
|
|
4
|
+
|
|
5
|
+
Email security@kotoshu.org with details and a reproduction. You will
|
|
6
|
+
receive an acknowledgement within 72 hours. Please do not file public
|
|
7
|
+
issues for security reports.
|
|
8
|
+
|
|
9
|
+
## Supported Versions
|
|
10
|
+
|
|
11
|
+
The latest 0.x minor is supported. Older minors are not maintained.
|
|
12
|
+
|
|
13
|
+
## Threat Model
|
|
14
|
+
|
|
15
|
+
Kotoshu downloads dictionaries, frequency lists, and (optionally)
|
|
16
|
+
embedding models from public GitHub repositories on first use:
|
|
17
|
+
|
|
18
|
+
| Resource | Source repo | Cached at |
|
|
19
|
+
|---|---|---|
|
|
20
|
+
| Spelling dictionaries | `kotoshu/dictionaries` | `~/.kotoshu/languages/{code}/` |
|
|
21
|
+
| Kelly frequency lists | `kotoshu/frequency-list-kelly` | `~/.kotoshu/frequency-lists/{code}/` |
|
|
22
|
+
| FastText / ONNX models | `kotoshu/models-fasttext-onnx` | `~/.kotoshu/models/{code}/` |
|
|
23
|
+
|
|
24
|
+
Downloads flow over HTTPS from `raw.githubusercontent.com`. The threat
|
|
25
|
+
model assumes GitHub serves the bytes the repository owner committed,
|
|
26
|
+
and treats anyone with push access to those repos as trusted.
|
|
27
|
+
|
|
28
|
+
## Integrity Verification
|
|
29
|
+
|
|
30
|
+
Each content repo may ship a `manifest.json` at its root listing every
|
|
31
|
+
file with its SHA-256 hash, size, and language/type tags. When a
|
|
32
|
+
manifest is present, every download is verified against it:
|
|
33
|
+
|
|
34
|
+
1. The manifest is fetched once per cache session.
|
|
35
|
+
2. Each downloaded file's SHA-256 is computed locally and compared to
|
|
36
|
+
the manifest entry.
|
|
37
|
+
3. On mismatch, `Kotoshu::IntegrityError` is raised with the expected
|
|
38
|
+
and actual hashes. The download is rejected and the cache is left
|
|
39
|
+
untouched.
|
|
40
|
+
4. Every verification outcome (verified / unverified / mismatch /
|
|
41
|
+
missing) is appended to `~/.kotoshu/audit.log` as one JSON object
|
|
42
|
+
per line.
|
|
43
|
+
|
|
44
|
+
### Graceful Degradation
|
|
45
|
+
|
|
46
|
+
When a manifest is **absent** (HTTP 404), verification silently
|
|
47
|
+
downgrades to `"unverified"` status and the download proceeds. This
|
|
48
|
+
preserves forward compatibility with repos that have not yet shipped a
|
|
49
|
+
manifest. The audit log records the difference.
|
|
50
|
+
|
|
51
|
+
### Strict Mode
|
|
52
|
+
|
|
53
|
+
`Kotoshu.spellchecker_for(lang, strict: true)` (and the CLI's
|
|
54
|
+
`--strict` flag) re-raise on any optional-resource failure — including
|
|
55
|
+
integrity mismatches on frequency data — instead of silently
|
|
56
|
+
degrading. Spelling-dictionary integrity is always enforced.
|
|
57
|
+
|
|
58
|
+
## Cache Layout
|
|
59
|
+
|
|
60
|
+
The cache is written under `$KOTOSHU_HOME` (default `~/.kotoshu/`).
|
|
61
|
+
Files are created with the user's default umask. Cache contents are
|
|
62
|
+
not encrypted at rest.
|
|
63
|
+
|
|
64
|
+
## Audit Log
|
|
65
|
+
|
|
66
|
+
`~/.kotoshu/audit.log` is append-only, JSON-per-line, and never
|
|
67
|
+
auto-rotated. Operators in multi-user environments should rotate it
|
|
68
|
+
via logrotate or equivalent. To inspect:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
cat ~/.kotoshu/audit.log | jq .
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
To clear the audit log:
|
|
76
|
+
|
|
77
|
+
```ruby
|
|
78
|
+
Kotoshu::Integrity::AuditLog.new(path: "#{ENV['HOME']}/.kotoshu/audit.log").clear!
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
## Network Egress
|
|
83
|
+
|
|
84
|
+
`offline: true` (or `KOTOSHU_OFFLINE=1` or `--offline`) disables all
|
|
85
|
+
network egress and only reads from the on-disk cache. If a required
|
|
86
|
+
resource is not cached, the call raises `Kotoshu::ResourceNotCachedError`
|
|
87
|
+
(CLI exits 3). Use `kotoshu fetch LANGUAGE` to pre-warm the cache in
|
|
88
|
+
environments without outbound network access.
|
|
89
|
+
|
|
90
|
+
## Scope
|
|
91
|
+
|
|
92
|
+
This policy covers the kotoshu gem itself. Vulnerabilities in
|
|
93
|
+
dependencies (Thor, suika, onnxruntime) should be reported upstream.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Example 1: Basic Word Checking
|
|
5
|
+
#
|
|
6
|
+
# This example demonstrates the simplest way to use Kotoshu
|
|
7
|
+
# to check if words are spelled correctly.
|
|
8
|
+
|
|
9
|
+
require_relative "../lib/kotoshu"
|
|
10
|
+
|
|
11
|
+
puts "=== Example 1: Basic Word Checking ==="
|
|
12
|
+
puts
|
|
13
|
+
|
|
14
|
+
# Check if words are correct
|
|
15
|
+
puts "Is 'hello' correct? #{Kotoshu.correct?("hello")}"
|
|
16
|
+
puts "Is 'world' correct? #{Kotoshu.correct?("world")}"
|
|
17
|
+
puts "Is 'helo' correct? #{Kotoshu.correct?("helo")}"
|
|
18
|
+
puts "Is 'Kotoshu' correct? #{Kotoshu.correct?("Kotoshu")}"
|
|
19
|
+
puts
|
|
20
|
+
|
|
21
|
+
# Get suggestions for misspelled words
|
|
22
|
+
puts "Suggestions for 'helo':"
|
|
23
|
+
suggestions = Kotoshu.suggest("helo")
|
|
24
|
+
puts suggestions.to_words.join(", ")
|
|
25
|
+
puts
|
|
26
|
+
|
|
27
|
+
puts "Suggestions for 'wrold':"
|
|
28
|
+
suggestions = Kotoshu.suggest("wrold")
|
|
29
|
+
puts suggestions.to_words.join(", ")
|
|
30
|
+
puts
|
|
31
|
+
|
|
32
|
+
# Check multiple words
|
|
33
|
+
words = %w[hello world test helo wrold]
|
|
34
|
+
puts "Checking multiple words:"
|
|
35
|
+
words.each do |word|
|
|
36
|
+
status = Kotoshu.correct?(word) ? "✓" : "✗"
|
|
37
|
+
puts " #{status} #{word}"
|
|
38
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Example 2: Text and Document Checking
|
|
5
|
+
#
|
|
6
|
+
# This example demonstrates how to check paragraphs and documents
|
|
7
|
+
# for spelling errors and get detailed results.
|
|
8
|
+
|
|
9
|
+
require_relative "../lib/kotoshu"
|
|
10
|
+
|
|
11
|
+
puts "=== Example 2: Text and Document Checking ==="
|
|
12
|
+
puts
|
|
13
|
+
|
|
14
|
+
# Check a paragraph of text
|
|
15
|
+
text = <<~TEXT
|
|
16
|
+
Hello wrold!
|
|
17
|
+
|
|
18
|
+
This is a test document with some misspelled words.
|
|
19
|
+
We want to see if the spellchcker can find them al.
|
|
20
|
+
|
|
21
|
+
Teh quick brown fox jumps over the lazy dog.
|
|
22
|
+
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
|
|
23
|
+
TEXT
|
|
24
|
+
|
|
25
|
+
puts "Checking text:"
|
|
26
|
+
puts "-" * 40
|
|
27
|
+
result = Kotoshu.check(text)
|
|
28
|
+
|
|
29
|
+
puts result
|
|
30
|
+
unless result.success?
|
|
31
|
+
puts
|
|
32
|
+
puts "Errors found:"
|
|
33
|
+
result.each_error do |error|
|
|
34
|
+
suggestions_str = if error.has_suggestions?
|
|
35
|
+
" (did you mean #{error.top_suggestions(3).join(", ")}?)"
|
|
36
|
+
else
|
|
37
|
+
""
|
|
38
|
+
end
|
|
39
|
+
puts " • #{error.word}#{suggestions_str}"
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
puts
|
|
44
|
+
puts "=" * 40
|
|
45
|
+
puts
|
|
46
|
+
|
|
47
|
+
# Check a file
|
|
48
|
+
file_path = "spec/fixtures/documents/with_errors.txt"
|
|
49
|
+
if File.exist?(file_path)
|
|
50
|
+
puts "Checking file: #{file_path}"
|
|
51
|
+
puts "-" * 40
|
|
52
|
+
|
|
53
|
+
file_result = Kotoshu.check_file(file_path)
|
|
54
|
+
|
|
55
|
+
if file_result.success?
|
|
56
|
+
puts "✓ No errors found (#{file_result.word_count} words checked)"
|
|
57
|
+
else
|
|
58
|
+
puts "✗ #{file_result.error_count} error(s) found:"
|
|
59
|
+
puts
|
|
60
|
+
file_result.each_unique_error do |word, errors|
|
|
61
|
+
puts " • #{word} (appears #{errors.size}x)"
|
|
62
|
+
first_error = errors.first
|
|
63
|
+
puts " Suggestions: #{first_error.top_suggestions(3).join(", ")}" if first_error.has_suggestions?
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
puts
|
|
69
|
+
puts "=" * 40
|
|
70
|
+
puts
|
|
71
|
+
|
|
72
|
+
# Document result statistics
|
|
73
|
+
puts "Document Statistics:"
|
|
74
|
+
puts " Word count: #{result.word_count}"
|
|
75
|
+
puts " Error count: #{result.error_count}"
|
|
76
|
+
puts " Unique errors: #{result.unique_error_count}"
|
|
77
|
+
puts " Error summary: #{result.error_summary.inspect}"
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Example 3: Using Different Dictionary Backends
|
|
5
|
+
#
|
|
6
|
+
# This example demonstrates how to use different dictionary backends
|
|
7
|
+
# including UnixWords, PlainText, Custom, and Hunspell.
|
|
8
|
+
|
|
9
|
+
require_relative "../lib/kotoshu"
|
|
10
|
+
|
|
11
|
+
puts "=== Example 3: Dictionary Backends ==="
|
|
12
|
+
puts
|
|
13
|
+
|
|
14
|
+
# Example 1: UnixWords dictionary
|
|
15
|
+
puts "1. UnixWords Dictionary (System Dictionary)"
|
|
16
|
+
puts "-" * 40
|
|
17
|
+
|
|
18
|
+
unix_dict = Kotoshu::Dictionary::UnixWords.detect(language_code: "en-US")
|
|
19
|
+
if unix_dict
|
|
20
|
+
puts "Loaded: #{unix_dict.path}"
|
|
21
|
+
puts "Words: #{unix_dict.size}"
|
|
22
|
+
puts "Has 'hello': #{unix_dict.lookup?("hello")}"
|
|
23
|
+
puts "Has 'Kotoshu': #{unix_dict.lookup?("Kotoshu")}"
|
|
24
|
+
suggestions = unix_dict.suggest("helo", max_suggestions: 5)
|
|
25
|
+
puts "Suggestions for 'helo': #{suggestions.join(", ")}"
|
|
26
|
+
else
|
|
27
|
+
puts "No system dictionary found"
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
puts
|
|
31
|
+
puts "=" * 40
|
|
32
|
+
puts
|
|
33
|
+
|
|
34
|
+
# Example 2: PlainText dictionary
|
|
35
|
+
puts "2. PlainText Dictionary"
|
|
36
|
+
puts "-" * 40
|
|
37
|
+
|
|
38
|
+
plain_dict = Kotoshu::Dictionary::PlainText.from_words(
|
|
39
|
+
%w[hello world kotoshu ruby spellchecker],
|
|
40
|
+
language_code: "en"
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
puts "Created dictionary with #{plain_dict.size} words"
|
|
44
|
+
puts "Has 'hello': #{plain_dict.lookup?("hello")}"
|
|
45
|
+
puts "Has 'ruby': #{plain_dict.lookup?("ruby")}"
|
|
46
|
+
puts "Has 'python': #{plain_dict.lookup?("python")}"
|
|
47
|
+
|
|
48
|
+
# Add a word dynamically
|
|
49
|
+
plain_dict.add_word("python")
|
|
50
|
+
puts "After adding 'python': #{plain_dict.lookup?("python")}"
|
|
51
|
+
plain_dict.add_word("Kotoshu")
|
|
52
|
+
puts "After adding 'Kotoshu': #{plain_dict.lookup?("Kotoshu")}"
|
|
53
|
+
|
|
54
|
+
puts
|
|
55
|
+
puts "=" * 40
|
|
56
|
+
puts
|
|
57
|
+
|
|
58
|
+
# Example 3: Custom dictionary
|
|
59
|
+
puts "3. Custom Dictionary (In-Memory)"
|
|
60
|
+
puts "-" * 40
|
|
61
|
+
|
|
62
|
+
custom_dict = Kotoshu::Dictionary::Custom.new(
|
|
63
|
+
words: %w[Kotoshu spellchecker ruby],
|
|
64
|
+
language_code: "en"
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
puts "Created custom dictionary"
|
|
68
|
+
puts "Words: #{custom_dict.words.inspect}"
|
|
69
|
+
puts "Size: #{custom_dict.size}"
|
|
70
|
+
puts "Has 'Kotoshu': #{custom_dict.lookup?("Kotoshu")}"
|
|
71
|
+
|
|
72
|
+
# Merge with another array
|
|
73
|
+
custom_dict.merge(%w[gem library code])
|
|
74
|
+
puts "After merging: #{custom_dict.words.inspect}"
|
|
75
|
+
|
|
76
|
+
puts
|
|
77
|
+
puts "=" * 40
|
|
78
|
+
puts
|
|
79
|
+
|
|
80
|
+
# Example 4: Hunspell dictionary (if available)
|
|
81
|
+
puts "4. Hunspell Dictionary"
|
|
82
|
+
puts "-" * 40
|
|
83
|
+
|
|
84
|
+
hunspell_dic = "dictionaries/hunspell/test/en_US_test.dic"
|
|
85
|
+
hunspell_aff = "dictionaries/hunspell/test/en_US_test.aff"
|
|
86
|
+
|
|
87
|
+
if File.exist?(hunspell_dic) && File.exist?(hunspell_aff)
|
|
88
|
+
hunspell_dict = Kotoshu::Dictionary::Hunspell.new(
|
|
89
|
+
dic_path: hunspell_dic,
|
|
90
|
+
aff_path: hunspell_aff,
|
|
91
|
+
language_code: "en-US"
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
puts "Loaded Hunspell dictionary"
|
|
95
|
+
puts "Words: #{hunspell_dict.size}"
|
|
96
|
+
puts "Has 'hello': #{hunspell_dict.lookup?("hello")}"
|
|
97
|
+
puts "Has 'hello' (case-insensitive): #{hunspell_dict.lookup?("HELLO")}"
|
|
98
|
+
puts "Has 'runs': #{hunspell_dict.lookup?("runs")}"
|
|
99
|
+
puts "Has 'running': #{hunspell_dict.lookup?("running")}"
|
|
100
|
+
|
|
101
|
+
# Show word variants using affix rules
|
|
102
|
+
puts "\nWord variants for 'run':"
|
|
103
|
+
variants = hunspell_dict.word_variants("run")
|
|
104
|
+
puts " #{variants.inspect}"
|
|
105
|
+
else
|
|
106
|
+
puts "Hunspell test dictionary not found at:"
|
|
107
|
+
puts " #{hunspell_dic}"
|
|
108
|
+
puts " #{hunspell_aff}"
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
puts
|
|
112
|
+
puts "=" * 40
|
|
113
|
+
puts
|
|
114
|
+
|
|
115
|
+
# Example 5: CSpell dictionary
|
|
116
|
+
puts "5. CSpell Dictionary (Trie-based)"
|
|
117
|
+
puts "-" * 40
|
|
118
|
+
|
|
119
|
+
cspell_dict = Kotoshu::Dictionary::CSpell.from_words(
|
|
120
|
+
%w[hello world kotoshu ruby gem],
|
|
121
|
+
language_code: "en"
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
puts "Created CSpell dictionary with trie"
|
|
125
|
+
puts "Words: #{cspell_dict.words.inspect}"
|
|
126
|
+
puts "Size: #{cspell_dict.size}"
|
|
127
|
+
puts "Has 'hello': #{cspell_dict.lookup?("hello")}"
|
|
128
|
+
puts "Has prefix 'hel': #{cspell_dict.has_prefix?("hel")}"
|
|
129
|
+
puts "Words with prefix 'hel': #{cspell_dict.words_with_prefix("hel").inspect}"
|
|
130
|
+
|
|
131
|
+
# Convert to trie
|
|
132
|
+
trie = cspell_dict.trie
|
|
133
|
+
puts "\nTrie structure:"
|
|
134
|
+
puts " Has 'hello': #{trie.has_word?("hello")}"
|
|
135
|
+
puts " Has prefix 'wo': #{trie.has_prefix?("wo")}"
|
|
136
|
+
puts " Words with prefix 'wo': #{trie.words_with_prefix("wo").inspect}"
|
|
137
|
+
puts " Suggestions for 'he': #{trie.suggestions("he").inspect}"
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Example 4: Trie Data Structure
|
|
5
|
+
#
|
|
6
|
+
# This example demonstrates how to use the Trie data structure
|
|
7
|
+
# for efficient word lookup and prefix-based operations.
|
|
8
|
+
|
|
9
|
+
require_relative "../lib/kotoshu"
|
|
10
|
+
|
|
11
|
+
puts "=== Example 4: Trie Data Structure ==="
|
|
12
|
+
puts
|
|
13
|
+
|
|
14
|
+
# Build a trie from an array of words
|
|
15
|
+
words = %w[
|
|
16
|
+
hello help held heap
|
|
17
|
+
world work word
|
|
18
|
+
test text toast
|
|
19
|
+
run running runner
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
trie = Kotoshu.trie(words)
|
|
23
|
+
|
|
24
|
+
puts "Built trie with #{words.size} words"
|
|
25
|
+
puts "All words: #{trie.all_words.inspect}"
|
|
26
|
+
puts
|
|
27
|
+
|
|
28
|
+
# Lookup operations
|
|
29
|
+
puts "Lookup Operations:"
|
|
30
|
+
puts "-" * 20
|
|
31
|
+
puts "Has 'hello': #{trie.has_word?("hello")}"
|
|
32
|
+
puts "Has 'hell': #{trie.has_word?("hell")}"
|
|
33
|
+
puts "Has 'HELLO' (case-sensitive): #{trie.has_word?("HELLO")}"
|
|
34
|
+
puts
|
|
35
|
+
|
|
36
|
+
# Prefix operations
|
|
37
|
+
puts "Prefix Operations:"
|
|
38
|
+
puts "-" * 20
|
|
39
|
+
puts "Has prefix 'hel': #{trie.has_prefix?("hel")}"
|
|
40
|
+
puts "Has prefix 'wor': #{trie.has_prefix?("wor")}"
|
|
41
|
+
puts "Has prefix 'xyz': #{trie.has_prefix?("xyz")}"
|
|
42
|
+
puts
|
|
43
|
+
|
|
44
|
+
# Words with prefix
|
|
45
|
+
puts "Words with prefix 'hel': #{trie.words_with_prefix("hel").inspect}"
|
|
46
|
+
puts "Words with prefix 'te': #{trie.words_with_prefix("te").inspect}"
|
|
47
|
+
puts
|
|
48
|
+
|
|
49
|
+
# Suggestions based on prefix
|
|
50
|
+
puts "Suggestions for 'hel':"
|
|
51
|
+
puts " #{trie.suggestions("hel", max_results: 10).inspect}"
|
|
52
|
+
puts
|
|
53
|
+
|
|
54
|
+
puts "Suggestions for 'te':"
|
|
55
|
+
puts " #{trie.suggestions("te", max_results: 10).inspect}"
|
|
56
|
+
puts
|
|
57
|
+
|
|
58
|
+
# Traverse the trie
|
|
59
|
+
puts "Traversing trie:"
|
|
60
|
+
puts "-" * 20
|
|
61
|
+
trie.each_word do |word, payload|
|
|
62
|
+
puts " #{word} (payload: #{payload.inspect})"
|
|
63
|
+
end
|
|
64
|
+
puts
|
|
65
|
+
|
|
66
|
+
# Trie builder methods
|
|
67
|
+
puts "Building tries from different sources:"
|
|
68
|
+
puts "-" * 20
|
|
69
|
+
|
|
70
|
+
# From string
|
|
71
|
+
string_trie = Kotoshu.trie("hello world test")
|
|
72
|
+
puts "From string: #{string_trie.all_words.inspect}"
|
|
73
|
+
|
|
74
|
+
# From file (if exists)
|
|
75
|
+
test_file = "dictionaries/plain_text/en_US/words.txt"
|
|
76
|
+
if File.exist?(test_file)
|
|
77
|
+
file_trie = Kotoshu.trie(test_file)
|
|
78
|
+
puts "From file: loaded #{file_trie.size} words"
|
|
79
|
+
puts "First 5 words: #{file_trie.all_words.first(5).inspect}"
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
# Trie set operations
|
|
83
|
+
puts
|
|
84
|
+
puts "Trie Set Operations:"
|
|
85
|
+
puts "-" * 20
|
|
86
|
+
|
|
87
|
+
trie1 = Kotoshu.trie(%w[hello world test])
|
|
88
|
+
trie2 = Kotoshu.trie(%w[hello world ruby])
|
|
89
|
+
|
|
90
|
+
puts "Trie 1: #{trie1.all_words.inspect}"
|
|
91
|
+
puts "Trie 2: #{trie2.all_words.inspect}"
|
|
92
|
+
puts
|
|
93
|
+
|
|
94
|
+
# Union (|)
|
|
95
|
+
union = trie1 | trie2
|
|
96
|
+
puts "Union: #{union.all_words.inspect}"
|
|
97
|
+
|
|
98
|
+
# Intersection (&)
|
|
99
|
+
intersection = trie1 & trie2
|
|
100
|
+
puts "Intersection: #{intersection.all_words.inspect}"
|
|
101
|
+
|
|
102
|
+
# Merge (mutating)
|
|
103
|
+
merged = trie1.dup
|
|
104
|
+
merged.merge!(trie2)
|
|
105
|
+
puts "Merged: #{merged.all_words.inspect}"
|
|
106
|
+
|
|
107
|
+
# Symmetric difference
|
|
108
|
+
# Note: Trie doesn't have a difference (-) operator, so we compute the words found in only one trie from the word lists
|
|
109
|
+
all_words = trie1.all_words | trie2.all_words
|
|
110
|
+
common = trie1.all_words & trie2.all_words
|
|
111
|
+
difference = all_words - common
|
|
112
|
+
puts "Words in only one trie: #{difference.inspect}"
|
|
113
|
+
|
|
114
|
+
puts
|
|
115
|
+
puts "Trie Statistics:"
|
|
116
|
+
puts "-" * 20
|
|
117
|
+
puts "Total words: #{trie.size}"
|
|
118
|
+
puts "Unique prefixes: #{trie.size}"
|
|
119
|
+
# puts "Max depth: #{trie.max_depth}" # Method not implemented yet
|
|
120
|
+
|
|
121
|
+
# Advanced: Payload storage
|
|
122
|
+
puts
|
|
123
|
+
puts "Advanced: Payload Storage:"
|
|
124
|
+
puts "-" * 20
|
|
125
|
+
|
|
126
|
+
payload_trie = Kotoshu::Core::Trie::Builder.new
|
|
127
|
+
payload_trie.add_word("hello", { definition: "a greeting", count: 5 })
|
|
128
|
+
payload_trie.add_word("help", { definition: "assistance", count: 3 })
|
|
129
|
+
payload_trie.add_word("world", { definition: "earth", count: 1 })
|
|
130
|
+
|
|
131
|
+
payload_trie_obj = payload_trie.build
|
|
132
|
+
|
|
133
|
+
puts "Word 'hello' payload: #{payload_trie_obj.find_node("hello")&.payload.inspect}"
|
|
134
|
+
puts "Word 'help' payload: #{payload_trie_obj.find_node("help")&.payload.inspect}"
|
|
135
|
+
|
|
136
|
+
# Convert IndexedDictionary to trie
|
|
137
|
+
puts
|
|
138
|
+
puts "IndexedDictionary to Trie:"
|
|
139
|
+
puts "-" * 20
|
|
140
|
+
|
|
141
|
+
dict = Kotoshu.dictionary(%w[hello world test])
|
|
142
|
+
trie_from_dict = dict.to_trie
|
|
143
|
+
|
|
144
|
+
puts "Dictionary words: #{dict.words.inspect}"
|
|
145
|
+
puts "Trie words: #{trie_from_dict.all_words.inspect}"
|
|
146
|
+
puts "Trie has 'hello': #{trie_from_dict.has_word?("hello")}"
|