kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require "rubocop/rake_task"
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ task default: %i[spec rubocop]
data/SECURITY.md ADDED
@@ -0,0 +1,93 @@
1
+ # Security Policy
2
+
3
+ ## Reporting a Vulnerability
4
+
5
+ Email security@kotoshu.org with details and a reproduction. You will
6
+ receive an acknowledgement within 72 hours. Please do not file public
7
+ issues for security reports.
8
+
9
+ ## Supported Versions
10
+
11
+ The latest 0.x minor is supported. Older minors are not maintained.
12
+
13
+ ## Threat Model
14
+
15
+ Kotoshu downloads dictionaries, frequency lists, and (optionally)
16
+ embedding models from public GitHub repositories on first use:
17
+
18
+ | Resource | Source repo | Cached at |
19
+ |---|---|---|
20
+ | Spelling dictionaries | `kotoshu/dictionaries` | `~/.kotoshu/languages/{code}/` |
21
+ | Kelly frequency lists | `kotoshu/frequency-list-kelly` | `~/.kotoshu/frequency-lists/{code}/` |
22
+ | FastText / ONNX models | `kotoshu/models-fasttext-onnx` | `~/.kotoshu/models/{code}/` |
23
+
24
+ Downloads flow over HTTPS from `raw.githubusercontent.com`. The threat
25
+ model assumes GitHub serves the bytes the repository owner committed,
26
+ and treats anyone with push access to those repos as trusted.
27
+
28
+ ## Integrity Verification
29
+
30
+ Each content repo may ship a `manifest.json` at its root listing every
31
+ file with its SHA-256 hash, size, and language/type tags. When a
32
+ manifest is present, every download is verified against it:
33
+
34
+ 1. The manifest is fetched once per cache session.
35
+ 2. Each downloaded file's SHA-256 is computed locally and compared to
36
+ the manifest entry.
37
+ 3. On mismatch, `Kotoshu::IntegrityError` is raised with the expected
38
+ and actual hashes. The download is rejected and the cache is left
39
+ untouched.
40
+ 4. Every verification outcome (verified / unverified / mismatch /
41
+ missing) is appended to `~/.kotoshu/audit.log` as one JSON object
42
+ per line.
43
+
44
+ ### Graceful Degradation
45
+
46
+ When a manifest is **absent** (HTTP 404), verification silently
47
+ downgrades to `"unverified"` status and the download proceeds. This
48
+ preserves forward compatibility with repos that have not yet shipped a
49
+ manifest. The audit log records the difference.
50
+
51
+ ### Strict Mode
52
+
53
+ `Kotoshu.spellchecker_for(lang, strict: true)` (and the CLI's
54
+ `--strict` flag) re-raise on any optional-resource failure — including
55
+ integrity mismatches on frequency data — instead of silently
56
+ degrading. Spelling-dictionary integrity is always enforced.
57
+
58
+ ## Cache Layout
59
+
60
+ The cache is written under `$KOTOSHU_HOME` (default `~/.kotoshu/`).
61
+ Files are created with the user's default umask. Cache contents are
62
+ not encrypted at rest.
63
+
64
+ ## Audit Log
65
+
66
+ `~/.kotoshu/audit.log` is append-only, JSON-per-line, and never
67
+ auto-rotated. Operators in multi-user environments should rotate it
68
+ via logrotate or equivalent. To inspect:
69
+
70
+ ```bash
71
+ cat ~/.kotoshu/audit.log | jq .
72
+ ```
73
+
74
+
75
+ To clear the audit log:
76
+
77
+ ```ruby
78
+ Kotoshu::Integrity::AuditLog.new(path: "#{ENV['HOME']}/.kotoshu/audit.log").clear!
79
+ ```
80
+
81
+
82
+ ## Network Egress
83
+
84
+ `offline: true` (or `KOTOSHU_OFFLINE=1` or `--offline`) disables all
85
+ network egress and only reads from the on-disk cache. If a required
86
+ resource is not cached, the call raises `Kotoshu::ResourceNotCachedError`
87
+ (CLI exits 3). Use `kotoshu fetch LANGUAGE` to pre-warm the cache in
88
+ environments without outbound network access.
89
+
90
+ ## Scope
91
+
92
+ This policy covers the kotoshu gem itself. Vulnerabilities in
93
+ dependencies (Thor, suika, onnxruntime) should be reported upstream.
data/examples/01_basic_word_checking.rb ADDED
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Example 1: Basic Word Checking
5
+ #
6
+ # This example demonstrates the simplest way to use Kotoshu
7
+ # to check if words are spelled correctly.
8
+
9
+ require_relative "../lib/kotoshu"
10
+
11
+ puts "=== Example 1: Basic Word Checking ==="
12
+ puts
13
+
14
+ # Check if words are correct
15
+ puts "Is 'hello' correct? #{Kotoshu.correct?("hello")}"
16
+ puts "Is 'world' correct? #{Kotoshu.correct?("world")}"
17
+ puts "Is 'helo' correct? #{Kotoshu.correct?("helo")}"
18
+ puts "Is 'Kotoshu' correct? #{Kotoshu.correct?("Kotoshu")}"
19
+ puts
20
+
21
+ # Get suggestions for misspelled words
22
+ puts "Suggestions for 'helo':"
23
+ suggestions = Kotoshu.suggest("helo")
24
+ puts suggestions.to_words.join(", ")
25
+ puts
26
+
27
+ puts "Suggestions for 'wrold':"
28
+ suggestions = Kotoshu.suggest("wrold")
29
+ puts suggestions.to_words.join(", ")
30
+ puts
31
+
32
+ # Check multiple words
33
+ words = %w[hello world test helo wrold]
34
+ puts "Checking multiple words:"
35
+ words.each do |word|
36
+ status = Kotoshu.correct?(word) ? "✓" : "✗"
37
+ puts " #{status} #{word}"
38
+ end
data/examples/02_text_document_checking.rb ADDED
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Example 2: Text and Document Checking
5
+ #
6
+ # This example demonstrates how to check paragraphs and documents
7
+ # for spelling errors and get detailed results.
8
+
9
+ require_relative "../lib/kotoshu"
10
+
11
+ puts "=== Example 2: Text and Document Checking ==="
12
+ puts
13
+
14
+ # Check a paragraph of text
15
+ text = <<~TEXT
16
+ Hello wrold!
17
+
18
+ This is a test document with some misspelled words.
19
+ We want to see if the spellchcker can find them al.
20
+
21
+ Teh quick brown fox jumps over the lazy dog.
22
+ Lorem ipsum dolor sit amet, consectetur adipiscing elit.
23
+ TEXT
24
+
25
+ puts "Checking text:"
26
+ puts "-" * 40
27
+ result = Kotoshu.check(text)
28
+
29
+ puts result
30
+ unless result.success?
31
+ puts
32
+ puts "Errors found:"
33
+ result.each_error do |error|
34
+ suggestions_str = if error.has_suggestions?
35
+ " (did you mean #{error.top_suggestions(3).join(", ")}?)"
36
+ else
37
+ ""
38
+ end
39
+ puts " • #{error.word}#{suggestions_str}"
40
+ end
41
+ end
42
+
43
+ puts
44
+ puts "=" * 40
45
+ puts
46
+
47
+ # Check a file
48
+ file_path = "spec/fixtures/documents/with_errors.txt"
49
+ if File.exist?(file_path)
50
+ puts "Checking file: #{file_path}"
51
+ puts "-" * 40
52
+
53
+ file_result = Kotoshu.check_file(file_path)
54
+
55
+ if file_result.success?
56
+ puts "✓ No errors found (#{file_result.word_count} words checked)"
57
+ else
58
+ puts "✗ #{file_result.error_count} error(s) found:"
59
+ puts
60
+ file_result.each_unique_error do |word, errors|
61
+ puts " • #{word} (appears #{errors.size}x)"
62
+ first_error = errors.first
63
+ puts " Suggestions: #{first_error.top_suggestions(3).join(", ")}" if first_error.has_suggestions?
64
+ end
65
+ end
66
+ end
67
+
68
+ puts
69
+ puts "=" * 40
70
+ puts
71
+
72
+ # Document result statistics
73
+ puts "Document Statistics:"
74
+ puts " Word count: #{result.word_count}"
75
+ puts " Error count: #{result.error_count}"
76
+ puts " Unique errors: #{result.unique_error_count}"
77
+ puts " Error summary: #{result.error_summary.inspect}"
data/examples/03_dictionary_backends.rb ADDED
@@ -0,0 +1,137 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Example 3: Using Different Dictionary Backends
5
+ #
6
+ # This example demonstrates how to use different dictionary backends
7
+ # including UnixWords, PlainText, Custom, and Hunspell.
8
+
9
+ require_relative "../lib/kotoshu"
10
+
11
+ puts "=== Example 3: Dictionary Backends ==="
12
+ puts
13
+
14
+ # Example 1: UnixWords dictionary
15
+ puts "1. UnixWords Dictionary (System Dictionary)"
16
+ puts "-" * 40
17
+
18
+ unix_dict = Kotoshu::Dictionary::UnixWords.detect(language_code: "en-US")
19
+ if unix_dict
20
+ puts "Loaded: #{unix_dict.path}"
21
+ puts "Words: #{unix_dict.size}"
22
+ puts "Has 'hello': #{unix_dict.lookup?("hello")}"
23
+ puts "Has 'Kotoshu': #{unix_dict.lookup?("Kotoshu")}"
24
+ suggestions = unix_dict.suggest("helo", max_suggestions: 5)
25
+ puts "Suggestions for 'helo': #{suggestions.join(", ")}"
26
+ else
27
+ puts "No system dictionary found"
28
+ end
29
+
30
+ puts
31
+ puts "=" * 40
32
+ puts
33
+
34
+ # Example 2: PlainText dictionary
35
+ puts "2. PlainText Dictionary"
36
+ puts "-" * 40
37
+
38
+ plain_dict = Kotoshu::Dictionary::PlainText.from_words(
39
+ %w[hello world kotoshu ruby spellchecker],
40
+ language_code: "en"
41
+ )
42
+
43
+ puts "Created dictionary with #{plain_dict.size} words"
44
+ puts "Has 'hello': #{plain_dict.lookup?("hello")}"
45
+ puts "Has 'ruby': #{plain_dict.lookup?("ruby")}"
46
+ puts "Has 'python': #{plain_dict.lookup?("python")}"
47
+
48
+ # Add a word dynamically
49
+ plain_dict.add_word("python")
50
+ puts "After adding 'python': #{plain_dict.lookup?("python")}"
51
+ plain_dict.add_word("Kotoshu")
52
+ puts "After adding 'Kotoshu': #{plain_dict.lookup?("Kotoshu")}"
53
+
54
+ puts
55
+ puts "=" * 40
56
+ puts
57
+
58
+ # Example 3: Custom dictionary
59
+ puts "3. Custom Dictionary (In-Memory)"
60
+ puts "-" * 40
61
+
62
+ custom_dict = Kotoshu::Dictionary::Custom.new(
63
+ words: %w[Kotoshu spellchecker ruby],
64
+ language_code: "en"
65
+ )
66
+
67
+ puts "Created custom dictionary"
68
+ puts "Words: #{custom_dict.words.inspect}"
69
+ puts "Size: #{custom_dict.size}"
70
+ puts "Has 'Kotoshu': #{custom_dict.lookup?("Kotoshu")}"
71
+
72
+ # Merge with another array
73
+ custom_dict.merge(%w[gem library code])
74
+ puts "After merging: #{custom_dict.words.inspect}"
75
+
76
+ puts
77
+ puts "=" * 40
78
+ puts
79
+
80
+ # Example 4: Hunspell dictionary (if available)
81
+ puts "4. Hunspell Dictionary"
82
+ puts "-" * 40
83
+
84
+ hunspell_dic = "dictionaries/hunspell/test/en_US_test.dic"
85
+ hunspell_aff = "dictionaries/hunspell/test/en_US_test.aff"
86
+
87
+ if File.exist?(hunspell_dic) && File.exist?(hunspell_aff)
88
+ hunspell_dict = Kotoshu::Dictionary::Hunspell.new(
89
+ dic_path: hunspell_dic,
90
+ aff_path: hunspell_aff,
91
+ language_code: "en-US"
92
+ )
93
+
94
+ puts "Loaded Hunspell dictionary"
95
+ puts "Words: #{hunspell_dict.size}"
96
+ puts "Has 'hello': #{hunspell_dict.lookup?("hello")}"
97
+ puts "Has 'hello' (case-insensitive): #{hunspell_dict.lookup?("HELLO")}"
98
+ puts "Has 'runs': #{hunspell_dict.lookup?("runs")}"
99
+ puts "Has 'running': #{hunspell_dict.lookup?("running")}"
100
+
101
+ # Show word variants using affix rules
102
+ puts "\nWord variants for 'run':"
103
+ variants = hunspell_dict.word_variants("run")
104
+ puts " #{variants.inspect}"
105
+ else
106
+ puts "Hunspell test dictionary not found at:"
107
+ puts " #{hunspell_dic}"
108
+ puts " #{hunspell_aff}"
109
+ end
110
+
111
+ puts
112
+ puts "=" * 40
113
+ puts
114
+
115
+ # Example 5: CSpell dictionary
116
+ puts "5. CSpell Dictionary (Trie-based)"
117
+ puts "-" * 40
118
+
119
+ cspell_dict = Kotoshu::Dictionary::CSpell.from_words(
120
+ %w[hello world kotoshu ruby gem],
121
+ language_code: "en"
122
+ )
123
+
124
+ puts "Created CSpell dictionary with trie"
125
+ puts "Words: #{cspell_dict.words.inspect}"
126
+ puts "Size: #{cspell_dict.size}"
127
+ puts "Has 'hello': #{cspell_dict.lookup?("hello")}"
128
+ puts "Has prefix 'hel': #{cspell_dict.has_prefix?("hel")}"
129
+ puts "Words with prefix 'hel': #{cspell_dict.words_with_prefix("hel").inspect}"
130
+
131
+ # Convert to trie
132
+ trie = cspell_dict.trie
133
+ puts "\nTrie structure:"
134
+ puts " Has 'hello': #{trie.has_word?("hello")}"
135
+ puts " Has prefix 'wo': #{trie.has_prefix?("wo")}"
136
+ puts " Words with prefix 'wo': #{trie.words_with_prefix("wo").inspect}"
137
+ puts " Suggestions for 'he': #{trie.suggestions("he").inspect}"
data/examples/04_trie_data_structure.rb ADDED
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Example 4: Trie Data Structure
5
+ #
6
+ # This example demonstrates how to use the Trie data structure
7
+ # for efficient word lookup and prefix-based operations.
8
+
9
+ require_relative "../lib/kotoshu"
10
+
11
+ puts "=== Example 4: Trie Data Structure ==="
12
+ puts
13
+
14
+ # Build a trie from an array of words
15
+ words = %w[
16
+ hello help held heap
17
+ world work word
18
+ test text toast
19
+ run running runner
20
+ ]
21
+
22
+ trie = Kotoshu.trie(words)
23
+
24
+ puts "Built trie with #{words.size} words"
25
+ puts "All words: #{trie.all_words.inspect}"
26
+ puts
27
+
28
+ # Lookup operations
29
+ puts "Lookup Operations:"
30
+ puts "-" * 20
31
+ puts "Has 'hello': #{trie.has_word?("hello")}"
32
+ puts "Has 'hell': #{trie.has_word?("hell")}"
33
+ puts "Has 'HELLO' (case-sensitive): #{trie.has_word?("HELLO")}"
34
+ puts
35
+
36
+ # Prefix operations
37
+ puts "Prefix Operations:"
38
+ puts "-" * 20
39
+ puts "Has prefix 'hel': #{trie.has_prefix?("hel")}"
40
+ puts "Has prefix 'wor': #{trie.has_prefix?("wor")}"
41
+ puts "Has prefix 'xyz': #{trie.has_prefix?("xyz")}"
42
+ puts
43
+
44
+ # Words with prefix
45
+ puts "Words with prefix 'hel': #{trie.words_with_prefix("hel").inspect}"
46
+ puts "Words with prefix 'te': #{trie.words_with_prefix("te").inspect}"
47
+ puts
48
+
49
+ # Suggestions based on prefix
50
+ puts "Suggestions for 'hel':"
51
+ puts " #{trie.suggestions("hel", max_results: 10).inspect}"
52
+ puts
53
+
54
+ puts "Suggestions for 'te':"
55
+ puts " #{trie.suggestions("te", max_results: 10).inspect}"
56
+ puts
57
+
58
+ # Traverse the trie
59
+ puts "Traversing trie:"
60
+ puts "-" * 20
61
+ trie.each_word do |word, payload|
62
+ puts " #{word} (payload: #{payload.inspect})"
63
+ end
64
+ puts
65
+
66
+ # Trie builder methods
67
+ puts "Building tries from different sources:"
68
+ puts "-" * 20
69
+
70
+ # From string
71
+ string_trie = Kotoshu.trie("hello world test")
72
+ puts "From string: #{string_trie.all_words.inspect}"
73
+
74
+ # From file (if exists)
75
+ test_file = "dictionaries/plain_text/en_US/words.txt"
76
+ if File.exist?(test_file)
77
+ file_trie = Kotoshu.trie(test_file)
78
+ puts "From file: loaded #{file_trie.size} words"
79
+ puts "First 5 words: #{file_trie.all_words.first(5).inspect}"
80
+ end
81
+
82
+ # Trie set operations
83
+ puts
84
+ puts "Trie Set Operations:"
85
+ puts "-" * 20
86
+
87
+ trie1 = Kotoshu.trie(%w[hello world test])
88
+ trie2 = Kotoshu.trie(%w[hello world ruby])
89
+
90
+ puts "Trie 1: #{trie1.all_words.inspect}"
91
+ puts "Trie 2: #{trie2.all_words.inspect}"
92
+ puts
93
+
94
+ # Union (|)
95
+ union = trie1 | trie2
96
+ puts "Union: #{union.all_words.inspect}"
97
+
98
+ # Intersection (&)
99
+ intersection = trie1 & trie2
100
+ puts "Intersection: #{intersection.all_words.inspect}"
101
+
102
+ # Merge (mutating)
103
+ merged = trie1.dup
104
+ merged.merge!(trie2)
105
+ puts "Merged: #{merged.all_words.inspect}"
106
+
107
+ # Symmetric difference
108
+ # Note: Trie has no difference (-) operator, so compute the words present in exactly one trie from the word arrays
109
+ all_words = trie1.all_words | trie2.all_words
110
+ common = trie1.all_words & trie2.all_words
111
+ difference = all_words - common
112
+ puts "Words in only one trie: #{difference.inspect}"
113
+
114
+ puts
115
+ puts "Trie Statistics:"
116
+ puts "-" * 20
117
+ puts "Total words: #{trie.size}"
118
+ puts "Unique prefixes: #{trie.size}"
119
+ # puts "Max depth: #{trie.max_depth}" # Method not implemented yet
120
+
121
+ # Advanced: Payload storage
122
+ puts
123
+ puts "Advanced: Payload Storage:"
124
+ puts "-" * 20
125
+
126
+ payload_trie = Kotoshu::Core::Trie::Builder.new
127
+ payload_trie.add_word("hello", { definition: "a greeting", count: 5 })
128
+ payload_trie.add_word("help", { definition: "assistance", count: 3 })
129
+ payload_trie.add_word("world", { definition: "earth", count: 1 })
130
+
131
+ payload_trie_obj = payload_trie.build
132
+
133
+ puts "Word 'hello' payload: #{payload_trie_obj.find_node("hello")&.payload.inspect}"
134
+ puts "Word 'help' payload: #{payload_trie_obj.find_node("help")&.payload.inspect}"
135
+
136
+ # Convert IndexedDictionary to trie
137
+ puts
138
+ puts "IndexedDictionary to Trie:"
139
+ puts "-" * 20
140
+
141
+ dict = Kotoshu.dictionary(%w[hello world test])
142
+ trie_from_dict = dict.to_trie
143
+
144
+ puts "Dictionary words: #{dict.words.inspect}"
145
+ puts "Trie words: #{trie_from_dict.all_words.inspect}"
146
+ puts "Trie has 'hello': #{trie_from_dict.has_word?("hello")}"