kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,275 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ FastText to ONNX Converter
4
+
5
+ Converts FastText .vec files to ONNX format for efficient deployment.
6
+ The ONNX model contains an embedding lookup layer with the word vectors.
7
+
8
+ Usage:
9
+ python fasttext_to_onnx.py <vec_file> <output_onnx> [--vocab-size N] [--save-vocab PATH]
10
+
11
+ Example:
12
+ python fasttext_to_onnx.py cc.en.300.vec fasttext.en.onnx --vocab-size 50000
13
+ """
14
+
15
+ import sys
16
+ import argparse
17
+ import json
18
+ from pathlib import Path
19
+
20
+ import numpy as np
21
+ import onnx
22
+ from onnx import helper, numpy_helper
23
+ from onnx import TensorProto
24
+
25
+
26
def parse_fasttext_vec(vec_file_path, vocab_size=None):
    """
    Parse FastText .vec file and extract vocabulary and embeddings.

    The file must start with a header line ``<word_count> <dimension>``
    followed by one ``<word> <v1> ... <v_dimension>`` line per word.

    Args:
        vec_file_path: Path to FastText .vec file
        vocab_size: Maximum vocabulary size (None for all words)

    Returns:
        tuple: (vocab_dict, embeddings_array, metadata)
            - vocab_dict: Dictionary mapping word to row index
            - embeddings_array: float32 NumPy array of shape
              [loaded_words, embedding_dim]
            - metadata: Dictionary with file metadata; ``vocab_size`` is the
              number of rows actually loaded, which can be smaller than the
              requested size when the file is shorter than its header claims

    Raises:
        ValueError: If the header or a vector line is malformed, or if no
            vector could be loaded.
    """
    print(f"Parsing FastText file: {vec_file_path}")

    word_to_idx = {}
    embeddings = []
    metadata = {
        "source_format": "fasttext_vec",
        "embedding_dim": None,
        "vocab_size": 0,
        "total_words_in_file": 0
    }

    with open(vec_file_path, 'r', encoding='utf-8') as f:
        # First line: vocab_size and dimension
        header = f.readline().split()
        if len(header) < 2:
            raise ValueError(
                f"{vec_file_path}: expected a '<word_count> <dimension>' header line"
            )
        total_vocab = int(header[0])
        embedding_dim = int(header[1])
        if embedding_dim <= 0:
            raise ValueError(
                f"{vec_file_path}: embedding dimension must be positive, got {embedding_dim}"
            )

        metadata["embedding_dim"] = embedding_dim
        metadata["total_words_in_file"] = total_vocab

        print(f" Total words in file: {total_vocab}")
        print(f" Embedding dimension: {embedding_dim}")

        # Limit vocab size if specified
        if vocab_size is None or vocab_size > total_vocab:
            vocab_size = total_vocab

        print(f" Loading {vocab_size} words...")

        # Read embeddings
        for line_number, line in enumerate(f, start=2):
            if len(embeddings) >= vocab_size:
                break
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            fields = line.split()
            if len(fields) != embedding_dim + 1:
                # The token itself may contain whitespace (FastText vocabularies
                # include tokens with e.g. U+00A0), so peel the vector off the
                # right-hand side and keep whatever remains as the word.
                fields = line.rstrip('\r\n ').rsplit(' ', embedding_dim)
                if len(fields) != embedding_dim + 1:
                    raise ValueError(
                        f"{vec_file_path}:{line_number}: expected a word followed by "
                        f"{embedding_dim} values"
                    )

            word = fields[0]
            vector = np.array([float(x) for x in fields[1:]], dtype=np.float32)

            # Index by row position so the mapping always matches the matrix.
            word_to_idx[word] = len(embeddings)
            embeddings.append(vector)

            if len(embeddings) % 10000 == 0:
                print(f" Loaded {len(embeddings)} words...")

    if not embeddings:
        raise ValueError(f"{vec_file_path}: no word vectors could be loaded")

    # Report what was actually loaded, not what was requested.
    metadata["vocab_size"] = len(embeddings)

    # Stack embeddings into a matrix
    embeddings_matrix = np.vstack(embeddings).astype(np.float32)

    print(f" Embeddings matrix shape: {embeddings_matrix.shape}")
    print(f" Matrix size in MB: {embeddings_matrix.nbytes / (1024 * 1024):.2f}")

    return word_to_idx, embeddings_matrix, metadata
95
+
96
+
97
def create_onnx_model(embeddings_matrix, word_to_idx, model_name="fasttext"):
    """
    Create ONNX model with embedding lookup layer.

    The graph is ``Constant -> Gather(axis=0) -> Squeeze(axes=[0])``: it takes
    a single word index and returns that row of the embedding matrix.

    Args:
        embeddings_matrix: NumPy array of word embeddings [vocab_size, embedding_dim]
        word_to_idx: Dictionary mapping words to indices (not used when
            building the graph; kept for interface compatibility)
        model_name: Name for the ONNX model

    Returns:
        onnx.ModelProto: ONNX model
    """
    vocab_size, embedding_dim = embeddings_matrix.shape

    print("Creating ONNX model...")
    print(" Input: word_index (int64)")
    print(f" Output: embedding (float32, shape=[{embedding_dim}])")

    # Input: a one-element tensor holding the word index
    input_tensor = helper.make_tensor_value_info(
        'word_index',
        TensorProto.INT64,
        [1]
    )

    # Output: the embedding vector for that word
    output_tensor = helper.make_tensor_value_info(
        'embedding',
        TensorProto.FLOAT,
        [embedding_dim]
    )

    # Embed the whole matrix in the graph as the value of a Constant node
    embeddings_tensor = numpy_helper.from_array(embeddings_matrix, name='word_embeddings')
    embedding_constant = helper.make_node(
        'Constant',
        inputs=[],
        outputs=['embeddings_matrix'],
        value=embeddings_tensor
    )

    # Gather the requested row; with a [1]-shaped index the result is [1, embedding_dim]
    gather_node = helper.make_node(
        'Gather',
        inputs=['embeddings_matrix', 'word_index'],
        outputs=['embedding_flat'],
        axis=0
    )

    # Drop the leading length-1 dimension. `axes` is an attribute in opset 11
    # (it only became an input in opset 13).
    squeeze_node = helper.make_node(
        'Squeeze',
        inputs=['embedding_flat'],
        outputs=['embedding'],
        axes=[0]
    )

    graph = helper.make_graph(
        [embedding_constant, gather_node, squeeze_node],
        f'{model_name}_embedding',
        [input_tensor],
        [output_tensor]
    )

    # IR version and opset version are independent numbering schemes. Opset 11
    # was introduced together with IR version 6 (ONNX 1.6), so IR 6 is the
    # lowest version that can carry this model and is accepted by every runtime
    # that supports opset 11. Stamping a newer IR version than necessary makes
    # older runtimes refuse to load an otherwise compatible model.
    model = helper.make_model(
        graph,
        producer_name='kotoshu-fasttext-converter',
        producer_version='1.0.0',
        opset_imports=[helper.make_operatorsetid('', 11)],
        ir_version=6
    )

    # Add metadata
    from onnx import StringStringEntryProto
    model.metadata_props.append(StringStringEntryProto(key='vocabulary_size', value=str(vocab_size)))
    model.metadata_props.append(StringStringEntryProto(key='embedding_dimension', value=str(embedding_dim)))
    model.metadata_props.append(StringStringEntryProto(key='model_type', value='fasttext_embedding'))

    print(" ONNX model created successfully")

    return model
182
+
183
+
184
def save_vocabulary(word_to_idx, vocab_file_path):
    """
    Write the vocabulary mapping to a UTF-8 JSON file.

    The file holds an object with two keys: ``vocab_size`` (number of
    entries) and ``word_to_idx`` (the mapping itself). Non-ASCII words are
    written verbatim rather than as ``\\uXXXX`` escapes.

    Args:
        word_to_idx: Dictionary mapping words to indices
        vocab_file_path: Path to save vocabulary file
    """
    print(f"Saving vocabulary to: {vocab_file_path}")

    word_count = len(word_to_idx)
    serialized = json.dumps(
        {"vocab_size": word_count, "word_to_idx": word_to_idx},
        ensure_ascii=False,
        indent=2,
    )

    with open(vocab_file_path, 'w', encoding='utf-8') as out:
        out.write(serialized)

    print(f" Vocabulary saved: {word_count} words")
203
+
204
+
205
def _build_arg_parser():
    """Build the command-line parser for the converter."""
    parser = argparse.ArgumentParser(
        description='Convert FastText .vec file to ONNX format'
    )
    parser.add_argument('vec_file', type=str, help='Path to FastText .vec file')
    parser.add_argument('output_onnx', type=str, help='Path to output ONNX file')
    parser.add_argument(
        '--vocab-size',
        type=int,
        default=None,
        help='Maximum vocabulary size (default: all words)'
    )
    parser.add_argument(
        '--save-vocab',
        type=str,
        default=None,
        help='Path to save vocabulary JSON file (optional)'
    )
    return parser


def main():
    """
    Command-line entry point.

    Parses the arguments, converts the .vec file into an ONNX model, writes
    the model (creating the output directory if needed), prints a size
    comparison, and optionally writes the vocabulary JSON. Exits with
    status 1 when the input file does not exist.
    """
    args = _build_arg_parser().parse_args()

    # Refuse to continue without an input file
    vec_file = Path(args.vec_file)
    if not vec_file.exists():
        print(f"Error: Input file not found: {vec_file}")
        sys.exit(1)

    # .vec -> (vocabulary, matrix, metadata) -> ONNX model
    word_to_idx, embeddings_matrix, metadata = parse_fasttext_vec(
        vec_file, vocab_size=args.vocab_size
    )
    model = create_onnx_model(embeddings_matrix, word_to_idx)

    # Write the model, creating parent directories on demand
    output_path = Path(args.output_onnx)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Saving ONNX model to: {output_path}")
    onnx.save(model, str(output_path))
    print(f" Model saved successfully")

    # Report on-disk sizes of input and output
    bytes_per_mb = 1024 * 1024
    vec_size_mb = vec_file.stat().st_size / bytes_per_mb
    onnx_size_mb = output_path.stat().st_size / bytes_per_mb

    print(f"\nFile size comparison:")
    print(f" Input .vec file: {vec_size_mb:.2f} MB")
    print(f" Output .onnx file: {onnx_size_mb:.2f} MB")
    print(f" Compression ratio: {(vec_size_mb / onnx_size_mb):.2f}x")

    # The vocabulary file is only written on request
    if args.save_vocab:
        save_vocabulary(word_to_idx, args.save_vocab)

    print(f"\nConversion complete!")
    print(f" Model metadata: {metadata}")


if __name__ == '__main__':
    main()
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  # Central table describing where every remote resource is hosted.
  #
  # All URLs fetched by the cache layer are assembled here rather than being
  # built inline by individual caches. Each repository carries its own pin
  # (branch or tag): `kotoshu/dictionaries` is published on `v1` while the
  # remaining repositories track `main`. Mixing these up previously produced
  # silent 404s on first use.
  #
  # @example
  #   registry = Kotoshu::SourceRegistry.new
  #   registry.url_for(:spelling, lang: "en", ext: "aff")
  #   # => "https://raw.githubusercontent.com/kotoshu/dictionaries/v1/en/spelling/index.aff"
  class SourceRegistry
    # Description of one remote resource: owning repo, the pin used when no
    # override is configured, and a format template for the path.
    Source = Struct.new(:repo, :default_pin, :template, keyword_init: true)

    DEFAULT_BASE_URL = "https://raw.githubusercontent.com/kotoshu"

    # @return [Hash<Symbol, Source>]
    SOURCES = {
      spelling: Source.new(
        repo: "dictionaries",
        default_pin: "v1",
        template: "dictionaries/%<pin>s/%<lang>s/spelling/index.%<ext>s"
      ),
      grammar: Source.new(
        repo: "dictionaries",
        default_pin: "v1",
        template: "dictionaries/%<pin>s/%<lang>s/grammar/rules.yaml"
      ),
      dict_manifest: Source.new(
        repo: "dictionaries",
        default_pin: "v1",
        template: "dictionaries/%<pin>s/manifest.json"
      ),
      frequency: Source.new(
        repo: "frequency-list-kelly",
        default_pin: "main",
        template: "frequency-list-kelly/%<pin>s/data/%<lang>s.json"
      ),
      freq_manifest: Source.new(
        repo: "frequency-list-kelly",
        default_pin: "main",
        template: "frequency-list-kelly/%<pin>s/manifest.json"
      ),
      model: Source.new(
        repo: "models-fasttext-onnx",
        default_pin: "main",
        template: "models-fasttext-onnx/%<pin>s/models/%<lang>s/fasttext.%<lang>s.onnx"
      ),
      model_vocab: Source.new(
        repo: "models-fasttext-onnx",
        default_pin: "main",
        template: "models-fasttext-onnx/%<pin>s/models/%<lang>s/fasttext.%<lang>s.vocab.json"
      ),
      model_manifest: Source.new(
        repo: "models-fasttext-onnx",
        default_pin: "main",
        template: "models-fasttext-onnx/%<pin>s/manifest.json"
      )
    }.freeze

    # @return [String] Configured GitHub raw root (no trailing slash).
    attr_reader :base_url

    # @param base_url [String] GitHub raw root; one trailing slash is removed.
    # @param pins [Hash<String, String>] Optional per-repo pin overrides
    #   keyed by repo name (e.g. `{ "dictionaries" => "v2" }`). Keys are
    #   converted to strings, so symbol keys work as well.
    def initialize(base_url: DEFAULT_BASE_URL, pins: {})
      @base_url = base_url.to_s.delete_suffix("/")
      @pins = pins.transform_keys { |repo| repo.to_s }.freeze
    end

    # Build the absolute URL for a resource.
    #
    # @param source_key [Symbol] One of `SOURCES.keys`.
    # @param lang [String, nil] Language code, interpolated into template.
    # @param ext [String, nil] File extension, interpolated into template.
    # @return [String] Fully-qualified URL.
    # @raise [ArgumentError] When `source_key` is not a known source.
    def url_for(source_key, lang: nil, ext: nil)
      source = SOURCES.fetch(source_key) { raise ArgumentError, "unknown source: #{source_key.inspect}" }
      relative_path = format(source.template, pin: pin_for(source), lang: lang, ext: ext)
      [@base_url, relative_path].join("/")
    end

    # @param source_key [Symbol]
    # @return [String] Resolved pin (override or default).
    # @raise [KeyError] When `source_key` is not a known source.
    def pin_for_source(source_key)
      pin_for(SOURCES.fetch(source_key))
    end

    # @param source_key [Symbol]
    # @return [String] Repo name (e.g. "dictionaries").
    # @raise [KeyError] When `source_key` is not a known source.
    def repo_for(source_key)
      source = SOURCES.fetch(source_key)
      source.repo
    end

    private

    # Pin configured for the source's repo, or the source's default pin.
    def pin_for(source)
      @pins.fetch(source.repo) { source.default_pin }
    end
  end
end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  class Spellchecker
    # Parallel file checker for concurrent spellchecking.
    #
    # Distributes files over a pool of threads, each calling
    # `spellchecker.check_file`. All per-run state (job queue, collected
    # results, lock) is local to a single `check_files_parallel` call, so an
    # instance can be reused and successive calls never see each other's
    # results.
    #
    # @example Check files in parallel
    #   checker = ParallelChecker.new(spellchecker: spellchecker, worker_count: 4)
    #   results = checker.check_files_parallel(["file1.txt", "file2.txt"])
    class ParallelChecker
      # Default number of worker threads
      DEFAULT_WORKER_COUNT = 4

      # @return [Spellchecker] The underlying spellchecker
      attr_reader :spellchecker

      # @return [Integer] Number of worker threads
      attr_reader :worker_count

      # Create a new parallel checker.
      #
      # @param spellchecker [Spellchecker] The spellchecker to use
      # @param worker_count [Integer] Number of worker threads (default: 4)
      def initialize(spellchecker:, worker_count: DEFAULT_WORKER_COUNT)
        @spellchecker = spellchecker
        @worker_count = worker_count
      end

      # Check multiple files in parallel.
      #
      # Results are returned in the same order as `file_paths`. A file whose
      # check raises a StandardError is reported via `warn` and has no entry
      # in the returned array.
      #
      # @param file_paths [Array<String>] Paths to files to check
      # @return [Array<Core::Models::Result::DocumentResult>] Results for each
      #   successfully checked file, in input order
      def check_files_parallel(file_paths)
        paths = Array(file_paths)
        return [] if paths.empty?

        # Tag every job with its input position so the original order can be
        # restored no matter which worker finishes first.
        jobs = Queue.new
        paths.each_with_index { |path, index| jobs << [index, path] }
        # A closed queue makes `pop` return nil once drained, which ends the
        # worker loops without needing sentinel values.
        jobs.close

        completed = []
        lock = Mutex.new

        # Never start more threads than there are files, and always at least
        # one so a non-positive worker_count cannot silently skip every file.
        pool_size = @worker_count.clamp(1, paths.size)
        workers = Array.new(pool_size) { create_worker(jobs, completed, lock) }
        workers.each(&:join)

        completed.sort_by(&:first).map(&:last)
      end

      # Check a single file (convenience method).
      #
      # @param file_path [String] Path to file
      # @return [Core::Models::Result::DocumentResult] Check result
      def check_file(file_path)
        @spellchecker.check_file(file_path)
      end

      private

      # Create a worker thread that drains the job queue.
      #
      # @param jobs [Queue] Closed queue of `[index, path]` pairs
      # @param completed [Array] Shared sink for `[index, result]` pairs
      # @param lock [Mutex] Guards writes to `completed`
      # @return [Thread] Worker thread
      def create_worker(jobs, completed, lock)
        Thread.new do
          while (job = jobs.pop)
            index, path = job
            begin
              result = @spellchecker.check_file(path)
              lock.synchronize { completed << [index, result] }
            rescue StandardError => e
              # Log error but continue processing other files
              warn "Error checking file #{path}: #{e.message}"
            end
          end
        end
      end
    end
  end
end
+ end