kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'word_embedding'
4
+
5
+ module Kotoshu
6
+ module Models
7
+ # Value object for embedding search results (nearest neighbors).
8
+ #
9
+ # Represents a single suggestion from semantic similarity search,
10
+ # with similarity score and optional embedding reference.
11
+ #
12
+ # @example Creating a neighbor
13
+ # neighbor = NearestNeighbor.new("hello", 0.85, embedding: emb)
14
+ # neighbor.to_s # => "hello [85%]"
15
class NearestNeighbor
  attr_reader :word, :similarity, :distance, :embedding

  # Create a new nearest neighbor result.
  #
  # The instance is frozen after construction; +distance+ is derived as
  # <tt>1.0 - similarity</tt>.
  #
  # @param word [String] The suggested word
  # @param similarity [Numeric] Cosine similarity (0.0 to 1.0)
  # @param embedding [WordEmbedding, nil] Optional embedding reference
  # @raise [ArgumentError] if similarity is not a number within 0.0..1.0
  def initialize(word, similarity, embedding: nil)
    # Guard the type first so nil / non-numeric input raises ArgumentError
    # (like every other invalid value) instead of NoMethodError.
    unless similarity.is_a?(Numeric) && similarity.between?(0.0, 1.0)
      raise ArgumentError, "Similarity must be 0-1"
    end

    @word = word
    @similarity = similarity
    @distance = 1.0 - similarity
    @embedding = embedding
    freeze
  end

  # Comparison for sorting (higher similarity = better).
  #
  # Sorting an array of neighbors therefore yields the most similar first.
  # Objects that are not a NearestNeighbor compare as 0.
  #
  # @param other [NearestNeighbor] Another neighbor
  # @return [Integer] Comparison result (-1, 0, 1)
  def <=>(other)
    return 0 unless other.is_a?(NearestNeighbor)

    # Operands are swapped on purpose: descending order by similarity.
    other.similarity <=> @similarity
  end

  # Check if this equals another neighbor.
  #
  # Equality considers the word only; similarity and embedding are ignored.
  #
  # @param other [Object] Another object
  # @return [Boolean] True if words match
  def ==(other)
    return false unless other.is_a?(NearestNeighbor)

    @word == other.word
  end
  alias_method :eql?, :==

  # Hash code for hash table usage (consistent with #==: word only).
  #
  # @return [Integer] Hash code
  def hash
    @word.hash
  end

  # String representation with percentage.
  #
  # The percentage is rounded to the nearest integer. Truncating would
  # misreport values such as 0.29, because 0.29 * 100 is
  # 28.999999999999996 in floating point.
  #
  # @return [String] Human-readable representation, e.g. "hello [85%]"
  def to_s
    "#{@word} [#{(@similarity * 100).round}%]"
  end
  alias_method :inspect, :to_s

  # Check if this is a high-confidence suggestion.
  #
  # @return [Boolean] True if similarity > 0.8
  def high_confidence?
    @similarity > 0.8
  end

  # Get confidence level category.
  #
  # @return [Symbol] :high (> 0.8), :medium (> 0.5), or :low
  def confidence_level
    return :high if @similarity > 0.8
    return :medium if @similarity > 0.5

    :low
  end
end
86
+ end
87
+ end
@@ -0,0 +1,333 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Models
5
+ # ONNX embedding model implementation.
6
+ #
7
+ # Loads FastText models converted to ONNX format for faster inference.
8
+ # Uses ONNX Runtime for efficient embedding lookup.
9
+ #
10
+ # @example Loading from file
11
+ # model = OnnxModel.from_file('fasttext.en.onnx')
12
+ # embedding = model.embedding_for('hello')
13
+ #
14
+ # @example Loading from GitHub (via ModelCache)
15
+ # model = OnnxModel.from_github('en')
16
+ # neighbors = model.nearest_neighbors('hello', k: 10)
17
class OnnxModel < EmbeddingModel
  # Soft-load onnxruntime. The gem is intentionally NOT a hard runtime
  # dependency — it fails to build on some platforms and would block
  # install for users who only want traditional spell-checking. Semantic
  # features light up automatically when the gem is present.
  #
  # KOTOSHU_NO_ONNX=1 forces semantic analysis off even when the gem is
  # installed (useful for benchmarks / CI determinism).
  ONNX_LOADED = begin
    if ENV["KOTOSHU_NO_ONNX"] == "1"
      false
    else
      require "onnxruntime"
      true
    end
  rescue LoadError
    false
  end

  # Error raised when semantic features are requested but onnxruntime
  # is unavailable. Caller-friendly message points at the fix.
  class OnnxUnavailable < Kotoshu::Error
    # @param detail [String, nil] Optional detail appended in parentheses
    def initialize(detail = nil)
      message = "onnxruntime gem not loaded"
      message += " (#{detail})" if detail
      message += ". Install with: gem install onnxruntime"
      message += ". Or set KOTOSHU_NO_ONNX=1 to silence this in code paths that opt out."
      super(message)
    end
  end

  # Default dimension for FastText models
  DEFAULT_DIMENSION = 300

  # NOTE: #vocabulary is defined explicitly below (it returns the word
  # list), so it is deliberately not part of this attr_reader; listing it
  # here only produced a "method redefined" warning.
  attr_reader :onnx_path, :embedding_matrix

  # Create a new ONNX model.
  #
  # The ONNX session itself is created lazily on first use.
  #
  # @param language_code [String] ISO 639-1 language code
  # @param dimension [Integer] Vector dimension
  # @param onnx_path [String] Path to .onnx file
  # @param vocabulary [Hash<String, Integer>] Word-to-index mapping
  #   (frozen in place by this constructor)
  # @param embedding_matrix [Numo::SFloat] Pre-loaded embeddings (optional)
  def initialize(language_code:, dimension: DEFAULT_DIMENSION, onnx_path:, vocabulary:, embedding_matrix: nil)
    super(language_code: language_code, dimension: dimension)
    @onnx_path = onnx_path
    @vocabulary = vocabulary.freeze
    @vocabulary_size = @vocabulary.size

    # Pre-load embedding matrix if provided (for faster nearest neighbor search)
    @embedding_matrix = embedding_matrix

    # Lazy load session
    @session = nil
    @loaded = false
  end

  # Load ONNX model from a file.
  #
  # Expects a sibling "<name>.vocab.json" (required) and optionally a
  # "<name>.metadata.json" whose "dimension" key overrides the default.
  #
  # @param onnx_path [String] Path to .onnx file
  # @param language_code [String] Language code (auto-detected from filename)
  # @return [OnnxModel] Loaded model
  # @raise [ArgumentError] if the model or vocabulary file doesn't exist
  def self.from_file(onnx_path, language_code: nil)
    raise ArgumentError, "File not found: #{onnx_path}" unless File.exist?(onnx_path)

    # Detect language from filename if not provided
    language_code ||= detect_language_from_path(onnx_path)

    vocab_path = companion_path(onnx_path, '.vocab.json')
    raise ArgumentError, "Vocabulary file not found: #{vocab_path}" unless File.exist?(vocab_path)

    require 'json'
    vocabulary = JSON.parse(File.read(vocab_path))

    metadata_path = companion_path(onnx_path, '.metadata.json')
    dimension = DEFAULT_DIMENSION

    if File.exist?(metadata_path)
      metadata = JSON.parse(File.read(metadata_path))
      # A metadata file without a "dimension" key must not turn the
      # dimension into nil.
      dimension = metadata['dimension'] || DEFAULT_DIMENSION
    end

    new(
      language_code: language_code,
      dimension: dimension,
      onnx_path: onnx_path,
      vocabulary: vocabulary
    )
  end

  # Load ONNX model from GitHub (via ModelCache).
  #
  # Asks the cache for the local .onnx path and delegates to .from_file.
  #
  # @param language_code [String] ISO 639-1 language code (de, en, es, fr, pt, ru)
  # @param cache [ModelCache, nil] Optional cache instance
  # @return [OnnxModel] Loaded model
  # @raise [ArgumentError] if language not supported
  def self.from_github(language_code, cache: nil)
    require_relative '../cache/model_cache'

    cache ||= Cache::ModelCache.new

    # Get the .onnx file path from cache
    onnx_file = cache.get_onnx_model(language_code)

    from_file(onnx_file, language_code: language_code)
  end

  # Detect language code from file path.
  #
  # Only the file name is inspected (e.g. "fasttext.en.onnx" => "en"), so
  # dotted directory names cannot be mistaken for a language code.
  #
  # @param path [String] File path
  # @return [String] Detected language code ('en' when none is found)
  def self.detect_language_from_path(path)
    match = File.basename(path.to_s).match(/\.([a-z]{2})\./i)
    match ? match[1].downcase : 'en' # Default to English
  end

  # Build the path of a companion file that sits next to the model.
  #
  # Only a trailing ".onnx" extension is replaced; a ".onnx" occurring
  # earlier in the path (e.g. a "~/.onnx_models/" directory) is left alone.
  #
  # @param onnx_path [String] Path to .onnx file
  # @param suffix [String] Replacement suffix, e.g. ".vocab.json"
  # @return [String] Companion file path
  def self.companion_path(onnx_path, suffix)
    "#{onnx_path.to_s.sub(/\.onnx\z/i, '')}#{suffix}"
  end
  private_class_method :companion_path

  # Get embedding vector for a word.
  #
  # @param word [String] The word to lookup
  # @return [WordEmbedding, nil] Embedding vector or nil if not found
  def embedding_for(word)
    return nil if word.nil? || word.empty?

    index = @vocabulary[word]
    return nil unless index

    # Get embedding from ONNX model
    vector = get_embedding_vector(index)

    WordEmbedding.new(word, vector, @language_code, dimension: @dimension)
  end

  # Get the vocabulary (all words in the model).
  #
  # @return [Array<String>] Vocabulary words
  def vocabulary
    @vocabulary.keys
  end

  # Check if model is loaded.
  #
  # @return [Boolean] True if ONNX session is loaded
  def loaded?
    @loaded
  end

  # Find k nearest neighbors for a word.
  #
  # Uses the pre-loaded embedding matrix when available and otherwise
  # defers to the superclass implementation.
  #
  # @param word [String] The query word
  # @param k [Integer] Number of neighbors to return
  # @return [Array<NearestNeighbor>] Nearest neighbors sorted by similarity
  def nearest_neighbors(word, k: 10)
    ensure_session_loaded

    # Get query embedding
    query = embedding_for(word)
    return [] unless query

    # If embedding matrix is pre-loaded, use it for faster search
    if @embedding_matrix
      nearest_neighbors_from_matrix(query, k)
    else
      super
    end
  end

  # Batch lookup of embeddings for multiple words.
  #
  # Words missing from the vocabulary are omitted from the result.
  #
  # @param words [Array<String>] Words to lookup
  # @return [Hash<String, WordEmbedding>] Word to embedding mapping
  def batch_embeddings(words)
    ensure_session_loaded

    # Keep only known words so that the vectors returned by
    # batch_get_embeddings (which drops nil indices) stay aligned with them.
    known = words.each_with_object({}) do |word, found|
      index = @vocabulary[word]
      found[word] = index if index
    end
    return {} if known.empty?

    vectors = batch_get_embeddings(known.values)

    known.keys.zip(vectors).each_with_object({}) do |(word, vec), result|
      next unless vec

      result[word] = WordEmbedding.new(word, vec, @language_code, dimension: @dimension)
    end
  end

  # Preload the embedding matrix into memory for faster nearest neighbor search.
  #
  # Useful when doing many nearest neighbor queries. Requires the
  # numo-narray gem; failures are reported with a warning rather than raised.
  #
  # @return [Boolean] True if loaded successfully
  def preload_embedding_matrix
    ensure_session_loaded

    # Get all embeddings at once.
    # NOTE(review): assumes vocabulary indices are exactly 0...size — confirm.
    all_indices = (0...@vocabulary_size).to_a
    vectors = batch_get_embeddings(all_indices)

    # Convert to matrix (using Numo::SFloat for efficiency)
    require 'numo/narray'
    @embedding_matrix = Numo::SFloat.cast(vectors).reshape(@vocabulary_size, @dimension)

    true
  rescue LoadError, StandardError => e
    # LoadError is not a StandardError, so it is listed explicitly to cover
    # a missing numo-narray gem.
    warn "Failed to preload embedding matrix: #{e.message}"
    false
  end

  private

  # Get embedding vector from ONNX model.
  #
  # NOTE(review): the onnxruntime gem documents array (or Numo) inputs and
  # array outputs for session runs; the packed-string input and #unpack on
  # the output used here should be verified against the exported model.
  #
  # @param index [Integer] Word index
  # @return [Array<Float>] Embedding vector
  def get_embedding_vector(index)
    ensure_session_loaded

    result = @session.run(
      ['embeddings'],
      { word_indices: [index].pack('q<') } # Pack int64 as little-endian
    )

    # Unpack float32 array
    result.first.unpack('e*')
  end

  # Get embeddings for multiple indices.
  #
  # nil indices are dropped before the lookup, so the result has one
  # vector per non-nil index, in order.
  #
  # @param indices [Array<Integer>] Word indices
  # @return [Array<Array<Float>>] Embedding vectors
  def batch_get_embeddings(indices)
    ensure_session_loaded

    valid_indices = indices.compact
    return [] if valid_indices.empty?

    # Pack indices as int64 array
    input_data = valid_indices.pack('q<*')

    result = @session.run(
      ['embeddings'],
      { word_indices: input_data }
    )

    # Unpack the flat float32 buffer and cut it into @dimension-sized rows
    result.first.unpack('e*').each_slice(@dimension).to_a
  end

  # Find nearest neighbors using pre-loaded embedding matrix.
  #
  # NOTE(review): the query word itself is part of the vocabulary and is
  # not filtered out here — confirm whether callers expect that.
  #
  # @param query [WordEmbedding] Query embedding
  # @param k [Integer] Number of neighbors
  # @return [Array<NearestNeighbor>] Nearest neighbors, most similar first
  def nearest_neighbors_from_matrix(query, k)
    return [] unless @embedding_matrix

    query_vec = Numo::SFloat.cast(query.vector)

    # Cosine similarity of the query against every vocabulary row
    scored = @vocabulary.map do |word, idx|
      [word, cosine_similarity(query_vec, @embedding_matrix[idx, true])]
    end

    scored.max_by(k) { |_, sim| sim }.map do |word, sim|
      # NearestNeighbor takes (word, similarity) positionally and only
      # accepts 0.0..1.0; cosine similarity can be negative or overshoot
      # 1.0 by a rounding error, so clamp it into range.
      NearestNeighbor.new(word, sim.clamp(0.0, 1.0), embedding: embedding_for(word))
    end
  end

  # Calculate cosine similarity between two vectors.
  #
  # @param vec1 [Numo::SFloat] First vector
  # @param vec2 [Numo::SFloat] Second vector
  # @return [Float] Cosine similarity (0.0 when either vector has zero norm)
  def cosine_similarity(vec1, vec2)
    dot = (vec1 * vec2).sum
    norm1 = Math.sqrt((vec1 ** 2).sum)
    norm2 = Math.sqrt((vec2 ** 2).sum)

    return 0.0 if norm1.zero? || norm2.zero?

    dot / (norm1 * norm2)
  end

  # Ensure ONNX session is loaded.
  #
  # NOTE(review): the onnxruntime gem's documented entry points are
  # OnnxRuntime::Model and OnnxRuntime::InferenceSession — confirm that
  # OnnxRuntime::Session exists in the targeted gem version.
  #
  # @raise [OnnxUnavailable] if the onnxruntime gem could not be loaded
  def ensure_session_loaded
    return if @loaded

    raise OnnxUnavailable unless ONNX_LOADED

    @session = OnnxRuntime::Session.new(@onnx_path)
    @loaded = true
  end
end
332
+ end
333
+ end
@@ -0,0 +1,165 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'context'
4
+ require_relative 'suggestion'
5
+
6
+ module Kotoshu
7
+ module Models
8
+ # Unified semantic error (NO artificial spelling/grammar split!).
9
+ #
10
+ # Represents ANY kind of language error detected through semantic analysis.
11
+ # Uses semantic categories instead of traditional "spelling" vs "grammar" labels.
12
+ #
13
+ # Error types (semantic categories):
14
+ # - :word_choice - Wrong word for context (e.g., "desert" vs "dessert")
15
+ # - :verb_agreement - Subject-verb mismatch (e.g., "they is" → "they are")
16
+ # - :tense - Temporal inconsistency (e.g., "Yesterday I will go")
17
+ # - :orthographic - Actual typo/misspelling (e.g., "wrold" → "world")
18
+ # - :preposition - Wrong preposition (e.g., "bored of" → "bored with")
19
+ # - :article - Wrong article (e.g., "a apple" → "an apple")
20
+ # - :morphology - Wrong word form (e.g., "goed" → "went")
21
+ # - :capitalization - Capitalization error (e.g., "i am" → "I am")
22
+ # - :punctuation - Punctuation error (e.g., "its" vs "it's")
23
+ # - :style - Style/usage suggestion
24
+ #
25
+ # @example Creating a semantic error
26
+ # error = SemanticError.new(
27
+ # id: "error_1",
28
+ # location: Location.new(line: 5, column: 12),
29
+ # original: "desert",
30
+ # suggestions: [Suggestion.new("dessert", confidence: 0.92)],
31
+ # error_type: :word_choice,
32
+ # confidence: 0.92,
33
+ # context: context
34
+ # )
35
class SemanticError
  # Error type definitions with display names
  ERROR_TYPES = {
    word_choice: 'Word Choice',
    verb_agreement: 'Verb Agreement',
    tense: 'Tense',
    orthographic: 'Spelling',
    preposition: 'Preposition',
    article: 'Article',
    morphology: 'Word Form',
    capitalization: 'Capitalization',
    punctuation: 'Punctuation',
    style: 'Style'
  }.freeze

  attr_reader :id, :location, :original, :suggestions, :error_type, :confidence, :context

  # Create a new semantic error.
  #
  # Suggestions are stored sorted by confidence (highest first); ties keep
  # the order in which they were passed in. The instance is frozen.
  #
  # @param id [String, Symbol] Unique identifier for this error (stored as String)
  # @param location [Documents::Location] Location of error in document
  # @param original [String] The original (incorrect) word/text
  # @param suggestions [Array<Suggestion>] Suggested corrections
  # @param error_type [Symbol] Error type (must be in ERROR_TYPES)
  # @param confidence [Numeric] Confidence score (0.0 to 1.0)
  # @param context [Context] Context around the error
  # @raise [ArgumentError] if error_type is invalid, confidence is not a
  #   number within 0.0..1.0, or suggestions is nil/empty
  def initialize(id:, location:, original:, suggestions:, error_type:, confidence:, context:)
    raise ArgumentError, "Invalid error type: #{error_type}" unless ERROR_TYPES.key?(error_type)
    unless confidence.is_a?(Numeric) && confidence.between?(0.0, 1.0)
      raise ArgumentError, "Confidence must be 0-1"
    end
    raise ArgumentError, "Suggestions cannot be empty" if suggestions.nil? || suggestions.empty?

    @id = id.to_s
    @location = location
    @original = original
    # The index is part of the sort key so equal-confidence suggestions
    # keep their incoming order (sort_by alone gives no such guarantee).
    @suggestions = suggestions.each_with_index
                              .sort_by { |suggestion, index| [-suggestion.confidence, index] }
                              .map(&:first)
                              .freeze
    @error_type = error_type
    @confidence = confidence
    @context = context

    freeze
  end

  # Get user-friendly display type name.
  #
  # @return [String] Display type name
  def display_type
    ERROR_TYPES[@error_type] || @error_type.to_s.capitalize
  end

  # Check if this is a high-confidence error.
  #
  # @return [Boolean] True if confidence > 0.8
  def high_confidence?
    @confidence > 0.8
  end

  # Get confidence level category.
  #
  # @return [Symbol] :high (> 0.8), :medium (> 0.5), or :low
  def confidence_level
    return :high if @confidence > 0.8
    return :medium if @confidence > 0.5

    :low
  end

  # Get the recommended (top) suggestion.
  #
  # @return [Suggestion] The highest-confidence suggestion
  def recommended_suggestion
    @suggestions.first
  end

  # Check if this error equals another.
  #
  # @param other [Object] Another object
  # @return [Boolean] True if IDs match
  def ==(other)
    return false unless other.is_a?(SemanticError)

    @id == other.id
  end
  alias_method :eql?, :==

  # Hash code for hash table usage (consistent with #==: id only).
  #
  # @return [Integer] Hash code
  def hash
    @id.hash
  end

  # Comparison for sorting (by location, then confidence).
  #
  # Errors are sorted by:
  # 1. Document location (as ordered by the location's own <=>)
  # 2. Confidence (highest first)
  #
  # Objects that are not a SemanticError compare as 0.
  #
  # @param other [SemanticError] Another error
  # @return [Integer] Comparison result (-1, 0, 1)
  def <=>(other)
    return 0 unless other.is_a?(SemanticError)

    # A nil result (incomparable locations) falls through to the
    # confidence comparison instead of raising NoMethodError on nil.
    by_location = @location <=> other.location
    return by_location unless by_location.nil? || by_location.zero?

    # Then by confidence (highest first)
    other.confidence <=> @confidence
  end

  # String representation.
  #
  # @return [String] Human-readable representation
  def to_s
    "#{@location}: '#{@original}' → #{recommended_suggestion.word} [#{confidence_percent}%]"
  end
  alias_method :inspect, :to_s

  # Create an abbreviated display for lists.
  #
  # The line is cut to at most +max_length+ characters; a cut line ends
  # with an ellipsis character.
  #
  # @param max_length [Integer] Maximum line length
  # @return [String] Abbreviated representation
  def abbreviated(max_length: 80)
    line = "#{@location}: '#{@original}' → '#{recommended_suggestion.word}' [#{confidence_percent}%]"
    return line if line.length <= max_length

    keep = max_length - 1 # leave room for the ellipsis
    keep.positive? ? "#{line[0, keep]}…" : line[0, [max_length, 0].max]
  end

  private

  # Confidence as a whole-number percentage.
  #
  # Rounded rather than truncated: 0.29 * 100 is 28.999999999999996 in
  # floating point and would otherwise display as 28.
  #
  # @return [Integer] Percentage in 0..100
  def confidence_percent
    (@confidence * 100).round
  end
end
164
+ end
165
+ end
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Models
5
+ # Value object for correction suggestions.
6
+ #
7
+ # Represents a suggested correction for a detected error,
8
+ # with confidence score and metadata.
9
+ #
10
+ # @example Creating a suggestion
11
+ # suggestion = Suggestion.new("dessert", confidence: 0.92, source: :semantic)
12
+ # suggestion.to_s # => "dessert [92%]"
13
class Suggestion
  attr_reader :word, :confidence, :source, :metadata

  # Create a new suggestion.
  #
  # The instance is frozen. Metadata is stored as a frozen shallow copy, so
  # the hash passed in by the caller is left untouched.
  #
  # @param word [String] The suggested word
  # @param confidence [Numeric] Confidence score (0.0 to 1.0)
  # @param source [Symbol, nil] Source of the suggestion (e.g., :semantic, :edit_distance);
  #   stored as :unknown when nil
  # @param metadata [Hash, nil] Additional metadata (optional)
  # @option metadata [WordEmbedding, nil] :embedding The word embedding
  # @option metadata [Float] :edit_distance Edit distance score
  # @option metadata [Float] :frequency_bonus Frequency score bonus
  # @option metadata [String] :explanation Explanation for the suggestion
  # @raise [ArgumentError] if confidence is not a number within 0.0..1.0
  def initialize(word, confidence:, source: nil, metadata: {})
    unless confidence.is_a?(Numeric) && confidence.between?(0.0, 1.0)
      raise ArgumentError, "Confidence must be 0-1"
    end

    @word = word
    @confidence = confidence
    @source = source || :unknown
    # Copy before freezing: freezing the argument itself would make the
    # caller's hash immutable as a side effect. nil is treated as "no metadata".
    @metadata = (metadata || {}).dup.freeze
    freeze
  end

  # Comparison for sorting (higher confidence = better).
  #
  # Objects that are not a Suggestion compare as 0.
  #
  # @param other [Suggestion] Another suggestion
  # @return [Integer] Comparison result (-1, 0, 1)
  def <=>(other)
    return 0 unless other.is_a?(Suggestion)

    # Operands are swapped on purpose: descending order by confidence.
    other.confidence <=> @confidence
  end

  # Check if this equals another suggestion.
  #
  # Equality considers the word only.
  #
  # @param other [Object] Another object
  # @return [Boolean] True if words match
  def ==(other)
    return false unless other.is_a?(Suggestion)

    @word == other.word
  end
  alias_method :eql?, :==

  # Hash code for hash table usage (consistent with #==: word only).
  #
  # @return [Integer] Hash code
  def hash
    @word.hash
  end

  # String representation with percentage.
  #
  # The percentage is rounded to the nearest integer (truncation would
  # show 0.29 as 28%). The source is appended unless it is :unknown.
  #
  # @return [String] Human-readable representation
  def to_s
    label = "#{@word} [#{(@confidence * 100).round}%]"
    @source == :unknown ? label : "#{label} (#{@source})"
  end
  alias_method :inspect, :to_s

  # Get the embedding if available.
  #
  # @return [WordEmbedding, nil] The embedding or nil
  def embedding
    @metadata[:embedding]
  end

  # Get the edit distance if available.
  #
  # @return [Float, nil] Edit distance or nil
  def edit_distance
    @metadata[:edit_distance]
  end

  # Check if this is a high-confidence suggestion.
  #
  # @return [Boolean] True if confidence > 0.8
  def high_confidence?
    @confidence > 0.8
  end

  # Get explanation text if available.
  #
  # @return [String, nil] Explanation or nil
  def explanation
    @metadata[:explanation]
  end
end
105
+ end
106
+ end