kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "time"
5
+ require "fileutils"
6
+
7
module Kotoshu
  module Integrity
    # Append-only JSON audit log of every resource download.
    #
    # Each entry is one JSON object per line (JSON Lines), appended to the
    # file at {#path}. The log is meant to answer "what did Kotoshu fetch?"
    # for users and for reproducibility audits in CI.
    #
    # Statuses:
    #   "verified"   — content matched the manifest entry's SHA-256
    #   "unverified" — no manifest entry available; bytes trusted as-is
    #   "mismatch"   — SHA-256 mismatch (also raises IntegrityError)
    #   "missing"    — attempted download failed (network, 404, etc.)
    #
    # The file is opened, appended to, and closed for every entry — there
    # is no long-lived file handle. Each write takes an exclusive lock and
    # is fsync'd before the handle is closed.
    #
    # The class mixes in Enumerable on top of {#each}, so callers can use
    # `log.select { ... }`, `log.count`, `log.map { ... }` directly.
    class AuditLog
      include Enumerable

      # Default log location, as resolved by `Kotoshu::Paths.audit_log_path`.
      #
      # @return [String] path of the default audit log
      def self.default_path
        Kotoshu::Paths.audit_log_path
      end

      # @return [String] path of the log file this instance appends to
      attr_reader :path

      # @param path [String] log file location; the default is only
      #   resolved when no explicit path is given
      def initialize(path: self.class.default_path)
        @path = path
      end

      # Record one download attempt. Returns the written entry hash.
      #
      # @param url [String] Source URL
      # @param status [String] One of: verified, unverified, mismatch, missing
      # @param size [Integer, nil] Bytes downloaded (nil on missing)
      # @param sha256 [String, nil] Computed SHA-256 of bytes (nil on missing)
      # @param manifest_sha256 [String, nil] Expected SHA-256 from manifest
      # @param resource_id [String, nil] Caller-supplied resource identifier
      # @return [Hash{Symbol => Object}] the entry exactly as serialized
      def record(url:, status:, size: nil, sha256: nil,
                 manifest_sha256: nil, resource_id: nil)
        entry = {
          timestamp: Time.now.utc.iso8601,
          url: url,
          resource_id: resource_id,
          size: size,
          sha256: sha256,
          manifest_sha256: manifest_sha256,
          status: status
        }
        # The parent directory may not exist yet on first use.
        FileUtils.mkdir_p(File.dirname(@path))
        File.open(@path, "a", encoding: "UTF-8") do |f|
          # Exclusive lock so concurrent writers cannot interleave lines;
          # released when the block closes the handle.
          f.flock(File::LOCK_EX)
          f.write("#{entry.to_json}\n")
          f.fsync
        end
        entry
      end

      # Iterate every recorded entry, oldest first.
      #
      # Blank lines are skipped. Entries are yielded as parsed Hashes with
      # String keys (as produced by JSON.parse). Nothing is yielded when
      # the log file does not exist.
      #
      # @yieldparam entry [Hash{String => Object}]
      # @return [Enumerator] when called without a block
      # @raise [JSON::ParserError] if a non-blank line is not valid JSON
      def each
        return enum_for(:each) unless block_given?
        return unless File.exist?(@path)

        File.foreach(@path, encoding: "UTF-8") do |line|
          line = line.strip
          next if line.empty?

          yield JSON.parse(line)
        end
      end

      # @return [Array<Hash{String => Object}>] all recorded entries
      def entries
        each.to_a
      end

      # Delete the log file. A missing file is not an error.
      def clear!
        FileUtils.rm_f(@path)
      end
    end
  end
end
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "digest"
5
+ require_relative "../core/exceptions"
6
+
7
module Kotoshu
  module Integrity
    # Parsed view of a content repo's `manifest.json`.
    #
    # Format (per TODO.impl/09-integrity-security.md task 1):
    #
    #   {
    #     "version": 1,
    #     "generated_at": "2026-06-25T10:00:00Z",
    #     "resources": {
    #       "en/spelling/index.dic": {
    #         "size": 49568,
    #         "sha256": "ab12...",
    #         "language": "en",
    #         "type": "spelling",
    #         "license": "LGPL/MPL/GPL",
    #         "source": "SCROLL"
    #       }
    #     }
    #   }
    #
    # Construction:
    #
    #   manifest = Manifest.parse(json_string)
    #   manifest.fetch("en/spelling/index.dic")                  # => Entry or nil
    #   manifest.verify_content!("en/spelling/index.dic", bytes) # raises on mismatch
    #
    # `Manifest.load(url, http:)` returns nil when the HTTP client reports
    # the manifest as absent, so callers can degrade gracefully.
    class Manifest
      # One resource record from the manifest.
      Entry = Struct.new(:path, :sha256, :size, :language, :type, :license, :source,
                         keyword_init: true) do
        # @param content [String] raw bytes of the downloaded resource
        # @return [Boolean] true when the SHA-256 of `content` equals the
        #   recorded digest (hex case is ignored)
        def verify?(content)
          Digest::SHA256.hexdigest(content) == sha256.to_s.downcase
        end
      end

      # Parse a manifest JSON string. Returns an empty Manifest if the JSON
      # is a valid object but has no resources (caller treats as "no
      # constraints").
      #
      # @param json [String] manifest document
      # @return [Manifest]
      # @raise [Kotoshu::IntegrityError] when the text is not valid JSON, or
      #   is valid JSON whose top level, `resources`, or any resource record
      #   is not a JSON object
      def self.parse(json)
        data = JSON.parse(json)
        # JSON.parse happily returns Arrays, nil or scalars; indexing those
        # with a String key would otherwise surface as TypeError/NoMethodError.
        malformed!("<JSON object>", "<#{data.class}>") unless data.is_a?(Hash)

        resources = data["resources"] || {}
        malformed!("<resources object>", "<#{resources.class}>") unless resources.is_a?(Hash)

        entries = resources.each_with_object({}) do |(path, fields), parsed|
          malformed!("<object for #{path}>", "<#{fields.class}>") unless fields.is_a?(Hash)

          parsed[path] = Entry.new(
            path: path,
            sha256: fields["sha256"],
            size: fields["size"],
            language: fields["language"],
            type: fields["type"],
            license: fields["license"],
            source: fields["source"]
          )
        end
        new(entries, version: data["version"], generated_at: data["generated_at"])
      rescue JSON::ParserError => e
        malformed!("<valid JSON>", "<parse error: #{e.message}>")
      end

      # Fetch and parse a manifest from a URL.
      #
      # @param url [String] manifest location
      # @param http [#get] client whose `get(url)` returns the body String,
      #   or nil when the manifest is absent
      # @return [Manifest, nil] nil when `http.get` returns nil
      # @raise [Kotoshu::IntegrityError] when the body cannot be parsed;
      #   errors raised by `http.get` propagate unchanged
      def self.load(url, http: Kotoshu::Integrity::NetHTTP)
        body = http.get(url)
        return nil if body.nil?

        parse(body)
      end

      # Raise the IntegrityError used for every structurally bad manifest.
      def self.malformed!(expected, actual)
        raise Kotoshu::IntegrityError.new("manifest", expected: expected, actual: actual)
      end
      private_class_method :malformed!

      # @return [Hash{String => Entry}] entries keyed by relative path
      # @return [Object] `version` and `generated_at` exactly as found in the JSON
      attr_reader :entries, :version, :generated_at

      # @param entries [Hash{String => Entry}]
      # @param version [Object, nil]
      # @param generated_at [Object, nil]
      def initialize(entries, version: nil, generated_at: nil)
        @entries = entries
        @version = version
        @generated_at = generated_at
      end

      # Look up the entry for a relative path.
      #
      # @return [Entry, nil] nil when the manifest has no such path
      #   (unlike Hash#fetch, this never raises)
      def fetch(path)
        @entries[path]
      end

      # @return [Boolean] true when the manifest lists no resources
      def empty?
        @entries.empty?
      end

      # Verify that content for `path` matches the manifest entry.
      #
      # @param path [String] relative resource path
      # @param content [String] raw downloaded bytes
      # @param url [String, nil] source URL, forwarded to the error
      # @return [true, nil] true on match; nil when the manifest has no
      #   entry for `path` (caller decides whether absence is a failure)
      # @raise [Kotoshu::IntegrityError] on digest mismatch
      def verify_content!(path, content, url: nil)
        entry = @entries[path]
        return nil unless entry

        actual = Digest::SHA256.hexdigest(content)
        # hexdigest is lower-case; tolerate manifests that record upper-case hex.
        return true if actual == entry.sha256.to_s.downcase

        raise Kotoshu::IntegrityError.new(
          path,
          expected: entry.sha256,
          actual: actual,
          url: url
        )
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+ require "uri"
5
+
6
module Kotoshu
  module Integrity
    # Thin wrapper around Net::HTTP so manifest fetches are testable
    # without the network. Returns the response body as a String on 2xx,
    # nil on 404/410 (so callers can treat "manifest not published yet"
    # as graceful degradation), and raises on other errors.
    module NetHTTP
      # Raised for responses that are neither 2xx, 404/410, nor a
      # followable redirect.
      class HttpError < StandardError; end

      # Raised when a redirect chain is longer than the allowed limit.
      class TooManyRedirects < StandardError; end

      class << self
        # Perform a GET request, following redirects.
        #
        # @param url [String] absolute http(s) URL
        # @param redirect_limit [Integer] how many redirects may still be followed
        # @return [String, nil] body on 2xx; nil on 404/410
        # @raise [ArgumentError] when the URL scheme is not http or https
        # @raise [TooManyRedirects] when a redirect arrives and the limit is used up
        # @raise [HttpError] on any other status, or a redirect without a
        #   Location header
        def get(url, redirect_limit: 3)
          uri = URI(url)
          raise ArgumentError, "Only http/https supported: #{url}" unless
            uri.scheme == "http" || uri.scheme == "https"

          response = perform_get(uri)

          case response
          when Net::HTTPSuccess
            response.body
          when Net::HTTPNotFound, Net::HTTPGone
            nil
          when Net::HTTPRedirection
            # `<= 0` rather than `zero?` so a negative limit cannot recurse forever.
            raise TooManyRedirects, "GET #{url} exceeded the redirect limit" if redirect_limit <= 0

            get(redirect_target(uri, response), redirect_limit: redirect_limit - 1)
          else
            raise HttpError, "GET #{url} failed: #{response.code} #{response.message}"
          end
        end

        private

        # Issue a single GET with a 10 s connect and 30 s read timeout.
        def perform_get(uri)
          http = Net::HTTP.new(uri.host, uri.port)
          http.use_ssl = (uri.scheme == "https")
          http.open_timeout = 10
          http.read_timeout = 30

          http.request(Net::HTTP::Get.new(uri.request_uri))
        end

        # Resolve the Location header against the URL that was requested, so
        # relative redirects ("/new/path", "other.json") are followed too.
        # NOTE(review): an https -> http redirect is still followed here;
        # confirm whether scheme downgrades should be rejected for manifests.
        def redirect_target(uri, response)
          location = response["location"].to_s.strip
          if location.empty?
            raise HttpError, "GET #{uri} failed: #{response.code} #{response.message} (no Location header)"
          end

          URI.join(uri, location).to_s
        end
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  # Integrity verification for downloaded resources.
  #
  # The namespace lazily loads three constants:
  #
  # - {Manifest} — parsed view of a content repo's `manifest.json`. Looking
  #   up a relative path yields the expected SHA-256 and size.
  # - {AuditLog} — append-only JSON log recording every download's URL,
  #   size, computed SHA-256, manifest hash (when available), and status.
  # - {NetHTTP}  — the HTTP client used to fetch manifests.
  #
  # Intended flow: after each download a cache calls
  # `Manifest#verify_content!` on the manifest instance. When no manifest
  # is available (the upstream repo hasn't shipped one yet), the caller
  # logs the download with status `"unverified"` and proceeds — graceful
  # degradation. When a manifest IS present and the SHA-256 doesn't match,
  # {Kotoshu::IntegrityError} is raised so the caller can discard the
  # corrupt bytes before they reach the cache.
  module Integrity
    # Constant name => feature path, registered for autoloading on first use.
    {
      Manifest: "kotoshu/integrity/manifest",
      AuditLog: "kotoshu/integrity/audit_log",
      NetHTTP: "kotoshu/integrity/net_http"
    }.each do |constant_name, feature|
      autoload constant_name, feature
    end
  end
end
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Keyboard
    # Base class for keyboard layouts
    #
    # Each layout defines key positions and provides distance calculations
    # for typo detection and suggestion ranking in spell checking.
    #
    # @example Using a keyboard layout
    #   layout = Keyboard::Layouts::QWERTY.new
    #   layout.distance('q', 'w')  # => 1 (adjacent keys)
    #   layout.distance('q', 'p')  # => 9 (opposite ends of the top row)
    #   layout.adjacent_keys('q')  # => ['`', 'w', 'a']
    #
    # @example Checking language support
    #   qwerty = Keyboard::Layouts::QWERTY.new
    #   qwerty.supports_language?('en')     # => true
    #   qwerty.supports_language?('en_US')  # => true
    #   qwerty.supports_language?('de')     # => false
    #
    class Layout
      # @return [String] the name of this keyboard layout
      attr_reader :name

      # @return [Array<String>] list of language codes this layout supports
      attr_reader :language_codes

      # @return [Hash] mapping of key characters to [row, col] positions
      attr_reader :key_positions

      # Initialize a keyboard layout
      #
      # @param name [String] the name of the layout
      # @param language_codes [Array<String>] list of language codes this layout supports
      # @param key_positions [Hash] mapping of key characters to [row, col] positions;
      #   the hash is frozen in place
      def initialize(name:, language_codes:, key_positions:)
        @name = name
        @language_codes = Array(language_codes).freeze
        @key_positions = key_positions.freeze
      end

      # Get position [row, col] for a key
      #
      # The lookup is case-insensitive: the key is lower-cased first.
      #
      # @param key [String, #to_s] the key character to look up
      # @return [Array<Integer>, nil] the [row, col] position, or nil if key not found
      def position(key)
        # to_s so nil (and symbols) resolve to "not found"/a key instead of raising.
        @key_positions[key.to_s.downcase]
      end

      # Calculate Manhattan distance between two keys
      #
      # Manhattan distance is the sum of absolute differences of row and column:
      #   distance = abs(row1 - row2) + abs(col1 - col2)
      #
      # @param key1 [String] first key character
      # @param key2 [String] second key character
      # @return [Integer, Float] Manhattan distance (0 if same key),
      #   or Float::INFINITY if either key is not on this layout
      def distance(key1, key2)
        pos1 = position(key1)
        pos2 = position(key2)

        return Float::INFINITY unless pos1 && pos2

        (pos1[0] - pos2[0]).abs + (pos1[1] - pos2[1]).abs
      end

      # Check if layout supports a language
      #
      # A code is supported when it equals a declared code, or when a
      # declared code is a prefix of it at a subtag boundary — so declaring
      # 'en' also covers 'en-US', 'en-GB', etc. Matching ignores case and
      # accepts '_' as a subtag separator ('en_US', 'EN-gb').
      #
      # @param language_code [String, #to_s] the language code to check (e.g., 'en', 'en-US', 'de')
      # @return [Boolean] true if this layout supports the language
      def supports_language?(language_code)
        # Try exact match first
        return true if @language_codes.include?(language_code)

        requested = normalize_language_code(language_code)
        return false if requested.empty?

        @language_codes.any? do |code|
          supported = normalize_language_code(code)
          next false if supported.empty?

          requested == supported || requested.start_with?("#{supported}-")
        end
      end

      # Get adjacent keys for a given key (within 1 unit distance)
      #
      # Adjacent keys are those that are directly next to the given key
      # horizontally or vertically (not diagonal). The key itself is never
      # included (its distance is 0). Results follow the insertion order of
      # the layout's key table.
      #
      # @param key [String] the key character to find adjacent keys for
      # @return [Array<String>] list of adjacent key characters; empty when
      #   the key is not on this layout
      def adjacent_keys(key)
        origin = position(key)
        return [] unless origin

        @key_positions.each_with_object([]) do |(candidate, (row, col)), neighbours|
          neighbours << candidate if (row - origin[0]).abs + (col - origin[1]).abs == 1
        end
      end

      # String representation of the layout
      #
      # @return [String] layout name
      def to_s
        "Keyboard::#{@name}"
      end

      # Inspect method for debugging
      #
      # @return [String] detailed inspection string
      def inspect
        "#<#{self.class} name=#{@name} languages=#{@language_codes.join(',')}>"
      end

      private

      # Lower-case a language code and unify '_' with '-' for comparison.
      def normalize_language_code(code)
        code.to_s.tr('_', '-').downcase
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../layout'
4
+
5
module Kotoshu
  module Keyboard
    module Layouts
      # AZERTY keyboard layout
      #
      # Standard AZERTY layout used for:
      # - French (fr, fr-FR)
      # - Belgian French (fr-BE)
      #
      # Key differences from QWERTY:
      # - a and q are swapped (a/q → q/a)
      # - z and w are swapped (z/w → w/z)
      # - Number row is shifted (requires Shift for numbers)
      # - Has accent keys: é, à, ç, è (number row) and ù (home row)
      #
      # Key positions use [row, col] coordinates where:
      # - row 0: number/symbol row (²&é"'(-...)
      # - row 1: top row (azerty...)
      # - row 2: home row (qsdfg...)
      # - row 3: bottom row (wxcvb...) - note: w is here
      class AZERTY < Layout
        # Key positions for AZERTY layout
        # Each key maps to [row, column] coordinates
        KEY_POSITIONS = {
          # Top row (number/symbol row - numbers require Shift)
          '`' => [0, 0], '1' => [0, 1], '2' => [0, 2], '3' => [0, 3], '4' => [0, 4],
          '5' => [0, 5], '6' => [0, 6], '7' => [0, 7], '8' => [0, 8], '9' => [0, 9],
          '0' => [0, 10], ')' => [0, 11], '=' => [0, 12],
          # Accented letters typed unshifted on the number row; each one
          # shares the physical key (and so the coordinates) of a digit:
          # é/2, è/7, ç/9, à/0.
          'é' => [0, 2], 'è' => [0, 7], 'ç' => [0, 9], 'à' => [0, 10],
          # Top row (AZERTY - note a and q swapped, z and w swapped)
          'a' => [1, 0], 'z' => [1, 1], 'e' => [1, 2], 'r' => [1, 3], 't' => [1, 4],
          'y' => [1, 5], 'u' => [1, 6], 'i' => [1, 7], 'o' => [1, 8], 'p' => [1, 9],
          '^' => [1, 10], '$' => [1, 11],
          # Home row (QSDFG - note q is here)
          'q' => [2, 0], 's' => [2, 1], 'd' => [2, 2], 'f' => [2, 3], 'g' => [2, 4],
          'h' => [2, 5], 'j' => [2, 6], 'k' => [2, 7], 'l' => [2, 8], 'm' => [2, 9],
          'ù' => [2, 10],
          # Bottom row (WXCVB - note w is here)
          'w' => [3, 0], 'x' => [3, 1], 'c' => [3, 2], 'v' => [3, 3], 'b' => [3, 4],
          'n' => [3, 5], ',' => [3, 6], ';' => [3, 7], ':' => [3, 8], '!' => [3, 9]
        }.freeze

        # Initialize AZERTY layout
        #
        # Only language codes are declared. 'be' is deliberately absent: as
        # a language code it means Belarusian (declared by the JCUKEN
        # layout), not Belgium. Belgian French is 'fr-BE'.
        def initialize
          super(
            name: 'AZERTY',
            language_codes: %w[fr fr-FR fr-BE],
            key_positions: KEY_POSITIONS
          )
        end
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../layout'
4
+
5
module Kotoshu
  module Keyboard
    module Layouts
      # Dvorak keyboard layout
      #
      # Dvorak Simplified Keyboard, declared here for English (en, en-US).
      #
      # Compared with QWERTY:
      # - the vowels (a o e u i) sit on the left half of the home row
      # - frequent consonants (d h t n s) sit on the right half of the home row
      # - the arrangement aims to reduce finger travel
      #
      # Coordinates are [row, col], with col counted from the left edge:
      # - row 0: number row
      # - row 1: top row (' , . p y f g ...)
      # - row 2: home row (a o e u i d h t n s ...)
      # - row 3: bottom row (; q j k x b m w v z)
      class Dvorak < Layout
        # Key => [row, column] table, generated from one string per
        # physical row: a character's index in its string is its column.
        KEY_POSITIONS = [
          '`1234567890[]',  # number row
          "',.pyfgcrl/=",   # top row: punctuation and frequent consonants
          'aoeuidhtns-',    # home row: vowels left, consonants right
          ';qjkxbmwvz'      # bottom row: low-frequency letters
        ].each_with_index.each_with_object({}) do |(row_keys, row), table|
          row_keys.each_char.with_index do |key, col|
            table[key] = [row, col]
          end
        end.freeze

        # Build the Dvorak layout with its fixed name, languages and key table.
        def initialize
          super(
            name: 'Dvorak',
            language_codes: %w[en en-US],
            key_positions: KEY_POSITIONS
          )
        end
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../layout'
4
+
5
module Kotoshu
  module Keyboard
    module Layouts
      # JCUKEN keyboard layout
      #
      # Cyrillic layout, declared here for:
      # - Russian (ru, ru-RU)
      # - Ukrainian (uk)
      # - Belarusian (be)
      # - Bulgarian (bg)
      #
      # Compared with QWERTY:
      # - a completely different (Cyrillic) alphabet
      # - letters occupy the same physical positions as the QWERTY letters
      # - extra letter keys such as ё, ъ, ь
      #
      # Coordinates are [row, col], with col counted from the left edge:
      # - row 0: number row (ё 1 2 3 ...)
      # - row 1: top row (й ц у к е н ...)
      # - row 2: home row (ф ы в а п ...)
      # - row 3: bottom row (я ч с м и ...)
      class JCUKEN < Layout
        # Key => [row, column] table, generated from one string per
        # physical row: a character's index in its string is its column.
        KEY_POSITIONS = [
          'ё1234567890-=',  # number row
          'йцукенгшщзхъ',   # top row (QWERTY's q..] positions)
          'фывапролджэ',    # home row (QWERTY's a..' positions)
          'ячсмитьбю.'      # bottom row (QWERTY's z../ positions)
        ].each_with_index.each_with_object({}) do |(row_keys, row), table|
          row_keys.each_char.with_index do |key, col|
            table[key] = [row, col]
          end
        end.freeze

        # Build the JCUKEN layout with its fixed name, languages and key table.
        def initialize
          super(
            name: 'JCUKEN',
            language_codes: %w[ru uk be bg ru-RU],
            key_positions: KEY_POSITIONS
          )
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../layout'
4
+
5
module Kotoshu
  module Keyboard
    module Layouts
      # QWERTY keyboard layout
      #
      # Declared here for:
      # - English (en, en-US, en-GB, en-AU, en-CA, en-NZ, en-ZA)
      # - Spanish (es, es-ES, es-MX)
      # - Portuguese (pt, pt-BR, pt-PT)
      # - the code "us"
      #
      # Coordinates are [row, col], with col counted from the left edge:
      # - row 0: number row (` 1 2 3 ...)
      # - row 1: top row (q w e r t y ...)
      # - row 2: home row (a s d f g ...)
      # - row 3: bottom row (z x c v b ...)
      class QWERTY < Layout
        # Key => [row, column] table, generated from one string per
        # physical row: a character's index in its string is its column.
        KEY_POSITIONS = [
          '`1234567890-=',   # number row
          'qwertyuiop[]\\',  # top row (ends with a single backslash key)
          "asdfghjkl;'",     # home row
          'zxcvbnm,./'       # bottom row
        ].each_with_index.each_with_object({}) do |(row_keys, row), table|
          row_keys.each_char.with_index do |key, col|
            table[key] = [row, col]
          end
        end.freeze

        # Build the QWERTY layout with its fixed name, languages and key table.
        def initialize
          super(
            name: 'QWERTY',
            language_codes: %w[en es pt us en-US en-GB en-AU en-CA en-NZ en-ZA
                               es-ES es-MX pt-BR pt-PT],
            key_positions: KEY_POSITIONS
          )
        end
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../layout'
4
+
5
module Kotoshu
  module Keyboard
    module Layouts
      # QWERTZ keyboard layout
      #
      # Declared here for:
      # - German (de, de-DE, de-AT, de-CH)
      # - the codes "at" and "ch"
      #
      # Compared with QWERTY:
      # - z and y trade places (z is on the top row, y on the bottom row)
      # - umlaut keys: ä, ö, ü
      # - an ß (Eszett) key on the number row
      #
      # Coordinates are [row, col], with col counted from the left edge:
      # - row 0: number row (^ 1 2 3 ...)
      # - row 1: top row (q w e r t z ...)
      # - row 2: home row (a s d f g ...)
      # - row 3: bottom row (y x c v b ...)
      class QWERTZ < Layout
        # Key => [row, column] table, generated from one string per
        # physical row: a character's index in its string is its column.
        KEY_POSITIONS = [
          '^1234567890ß´',  # number row
          'qwertzuiopü+',   # top row: z where QWERTY has y
          'asdfghjklöä',    # home row
          'yxcvbnm,.-'      # bottom row: y where QWERTY has z
        ].each_with_index.each_with_object({}) do |(row_keys, row), table|
          row_keys.each_char.with_index do |key, col|
            table[key] = [row, col]
          end
        end.freeze

        # Build the QWERTZ layout with its fixed name, languages and key table.
        def initialize
          super(
            name: 'QWERTZ',
            language_codes: %w[de at ch de-DE de-AT de-CH],
            key_positions: KEY_POSITIONS
          )
        end
      end
    end
  end
end