kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
FastText to ONNX Converter
|
|
4
|
+
|
|
5
|
+
Converts FastText .vec files to ONNX format for efficient deployment.
|
|
6
|
+
The ONNX model contains an embedding lookup layer with the word vectors.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
    python fasttext_to_onnx.py <vec_file> <output_onnx> [--vocab-size N] [--save-vocab PATH]
|
|
10
|
+
|
|
11
|
+
Example:
|
|
12
|
+
    python fasttext_to_onnx.py cc.en.300.vec fasttext.en.onnx --vocab-size 50000
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import sys
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
import onnx
|
|
22
|
+
from onnx import helper, numpy_helper
|
|
23
|
+
from onnx import TensorProto
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parse_fasttext_vec(vec_file_path, vocab_size=None):
    """
    Parse a FastText .vec file and extract vocabulary and embeddings.

    The file is expected to start with a header line "<word_count> <dim>",
    followed by one line per word: the word, then <dim> space-separated floats.

    Args:
        vec_file_path: Path to FastText .vec file
        vocab_size: Maximum number of rows to load (None for all words)

    Returns:
        tuple: (vocab_dict, embeddings_array, metadata)
            - vocab_dict: Dictionary mapping word to row index
            - embeddings_array: float32 NumPy array of shape [rows, embedding_dim]
            - metadata: Dictionary with file metadata; "vocab_size" is the
              number of rows actually loaded

    Raises:
        ValueError: If the header is malformed, vocab_size is not positive,
            a row does not contain exactly embedding_dim values, or the file
            contains no embeddings.
    """
    print(f"Parsing FastText file: {vec_file_path}")

    if vocab_size is not None and vocab_size <= 0:
        raise ValueError(f"vocab_size must be a positive integer, got {vocab_size}")

    word_to_idx = {}
    embeddings = []
    metadata = {
        "source_format": "fasttext_vec",
        "embedding_dim": None,
        "vocab_size": 0,
        "total_words_in_file": 0
    }

    # newline='\n' keeps stray '\r' characters inside tokens from being
    # treated as line breaks.
    with open(vec_file_path, 'r', encoding='utf-8', newline='\n') as f:
        # First line: vocab_size and dimension
        header = f.readline().split()
        if len(header) < 2:
            raise ValueError(
                f"Invalid .vec header in {vec_file_path}: expected '<word_count> <dim>'"
            )
        total_vocab = int(header[0])
        embedding_dim = int(header[1])

        metadata["embedding_dim"] = embedding_dim
        metadata["total_words_in_file"] = total_vocab

        print(f"  Total words in file: {total_vocab}")
        print(f"  Embedding dimension: {embedding_dim}")

        # Limit vocab size if specified
        if vocab_size is None or vocab_size > total_vocab:
            vocab_size = total_vocab

        print(f"  Loading {vocab_size} words...")

        # Read embeddings
        for line_number, line in enumerate(f, start=2):
            if len(embeddings) >= vocab_size:
                break

            stripped = line.rstrip()
            if not stripped:
                # Tolerate blank lines instead of failing on them.
                continue

            # Split on the ASCII space only, and from the right: the last
            # embedding_dim fields are the vector, everything before is the
            # word. A bare split() would also break on Unicode whitespace
            # (e.g. NBSP) that can appear inside tokens.
            fields = stripped.rsplit(' ', embedding_dim)
            if len(fields) != embedding_dim + 1:
                raise ValueError(
                    f"Line {line_number} of {vec_file_path} does not contain "
                    f"a word followed by {embedding_dim} values"
                )

            word = fields[0]
            vector = np.array([float(x) for x in fields[1:]], dtype=np.float32)

            row = len(embeddings)
            word_to_idx[word] = row
            embeddings.append(vector)

            if (row + 1) % 10000 == 0:
                print(f"    Loaded {row + 1} words...")

    if not embeddings:
        raise ValueError(f"No embeddings could be read from {vec_file_path}")

    # Report what was actually loaded; the file may hold fewer rows than its
    # header (or the caller) asked for.
    metadata["vocab_size"] = len(embeddings)

    # Stack embeddings into a matrix
    embeddings_matrix = np.vstack(embeddings).astype(np.float32)

    print(f"  Embeddings matrix shape: {embeddings_matrix.shape}")
    print(f"  Matrix size in MB: {embeddings_matrix.nbytes / (1024 * 1024):.2f}")

    return word_to_idx, embeddings_matrix, metadata
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def create_onnx_model(embeddings_matrix, word_to_idx, model_name="fasttext"):
    """
    Create ONNX model with embedding lookup layer.

    The graph is Constant(embeddings) -> Gather(axis=0) -> Squeeze(axes=[0]):
    it takes a one-element int64 tensor holding a row index and returns that
    row of the embedding matrix.

    Args:
        embeddings_matrix: NumPy array of word embeddings [vocab_size, embedding_dim]
        word_to_idx: Dictionary mapping words to indices (not stored in the
            model; kept in the signature for callers)
        model_name: Name prefix for the ONNX graph

    Returns:
        onnx.ModelProto: ONNX model

    Raises:
        ValueError: If embeddings_matrix is not two-dimensional.
    """
    # The declared output type is FLOAT, so the stored constant must be float32.
    embeddings_matrix = np.ascontiguousarray(embeddings_matrix, dtype=np.float32)
    if embeddings_matrix.ndim != 2:
        raise ValueError(
            f"embeddings_matrix must be 2-D [vocab_size, embedding_dim], "
            f"got shape {embeddings_matrix.shape}"
        )
    vocab_size, embedding_dim = embeddings_matrix.shape

    print("Creating ONNX model...")
    print("  Input: word_index (int64)")
    print(f"  Output: embedding (float32, shape=[{embedding_dim}])")

    # Create input: a 1-element tensor holding a single word index
    input_tensor = helper.make_tensor_value_info(
        'word_index',
        TensorProto.INT64,
        [1]
    )

    # Create output (embedding vector)
    output_tensor = helper.make_tensor_value_info(
        'embedding',
        TensorProto.FLOAT,
        [embedding_dim]
    )

    # Tensor holding the embedding matrix
    embedding_tensor = numpy_helper.from_array(embeddings_matrix, name='word_embeddings')

    # Create Constant node for the embedding matrix
    embedding_constant = helper.make_node(
        'Constant',
        inputs=[],
        outputs=['embeddings_matrix'],
        value=embedding_tensor
    )

    # Create Gather node to lookup embedding by index -> shape [1, embedding_dim]
    gather_node = helper.make_node(
        'Gather',
        inputs=['embeddings_matrix', 'word_index'],
        outputs=['embedding_flat'],
        axis=0
    )

    # Create Squeeze node to remove the leading length-1 dimension.
    # `axes` is an attribute in opset 11 (it became an input in opset 13).
    squeeze_node = helper.make_node(
        'Squeeze',
        inputs=['embedding_flat'],
        outputs=['embedding'],
        axes=[0]
    )

    # Create graph
    graph = helper.make_graph(
        [embedding_constant, gather_node, squeeze_node],
        f'{model_name}_embedding',
        [input_tensor],
        [output_tensor]
    )

    # IR version and opset version are independent numbers. Use the lowest IR
    # version that can carry the chosen opset so that older runtimes, which
    # reject IR versions newer than they know, can still load the model.
    opset_imports = [helper.make_operatorsetid('', 11)]
    model = helper.make_model(
        graph,
        producer_name='kotoshu-fasttext-converter',
        producer_version='1.0.0',
        opset_imports=opset_imports,
        ir_version=helper.find_min_ir_version_for(opset_imports)
    )

    # Add metadata
    from onnx import StringStringEntryProto
    model_properties = {
        'vocabulary_size': str(vocab_size),
        'embedding_dimension': str(embedding_dim),
        'model_type': 'fasttext_embedding',
    }
    for key, value in model_properties.items():
        model.metadata_props.append(StringStringEntryProto(key=key, value=value))

    print("  ONNX model created successfully")

    return model
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def save_vocabulary(word_to_idx, vocab_file_path):
    """
    Save vocabulary dictionary to JSON file.

    The file contains an object with two keys: "vocab_size" (number of
    entries) and "word_to_idx" (the mapping itself). Missing parent
    directories of the target path are created.

    Args:
        word_to_idx: Dictionary mapping words to indices
        vocab_file_path: Path to save vocabulary file
    """
    print(f"Saving vocabulary to: {vocab_file_path}")

    vocab_data = {
        "vocab_size": len(word_to_idx),
        "word_to_idx": word_to_idx
    }

    # Create the target directory so a fresh output location does not fail
    # at the very end of a long conversion.
    vocab_path = Path(vocab_file_path)
    vocab_path.parent.mkdir(parents=True, exist_ok=True)

    with open(vocab_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII words readable in the output.
        json.dump(vocab_data, f, ensure_ascii=False, indent=2)

    print(f"  Vocabulary saved: {len(word_to_idx)} words")
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def main(argv=None):
    """
    Command-line entry point: convert a FastText .vec file to ONNX.

    Args:
        argv: Optional list of command-line arguments. When None, argparse
            reads them from sys.argv.

    Exits with status 1 if the input file does not exist, and with argparse's
    usage error if --vocab-size is not a positive integer.
    """
    parser = argparse.ArgumentParser(
        description='Convert FastText .vec file to ONNX format'
    )
    parser.add_argument(
        'vec_file',
        type=str,
        help='Path to FastText .vec file'
    )
    parser.add_argument(
        'output_onnx',
        type=str,
        help='Path to output ONNX file'
    )
    # Accept the underscore spelling as well: it is the form that has been
    # shown in this script's usage text.
    parser.add_argument(
        '--vocab-size', '--vocab_size',
        dest='vocab_size',
        type=int,
        default=None,
        help='Maximum vocabulary size (default: all words)'
    )
    parser.add_argument(
        '--save-vocab',
        type=str,
        default=None,
        help='Path to save vocabulary JSON file (optional)'
    )

    args = parser.parse_args(argv)

    if args.vocab_size is not None and args.vocab_size <= 0:
        parser.error('--vocab-size must be a positive integer')

    # Validate input file
    vec_file = Path(args.vec_file)
    if not vec_file.exists():
        print(f"Error: Input file not found: {vec_file}", file=sys.stderr)
        sys.exit(1)

    # Parse FastText file
    word_to_idx, embeddings_matrix, metadata = parse_fasttext_vec(
        vec_file,
        vocab_size=args.vocab_size
    )

    # Create ONNX model
    model = create_onnx_model(embeddings_matrix, word_to_idx)

    # Save ONNX model
    output_path = Path(args.output_onnx)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Saving ONNX model to: {output_path}")
    onnx.save(model, str(output_path))
    print("  Model saved successfully")

    # Calculate file sizes
    vec_size_mb = vec_file.stat().st_size / (1024 * 1024)
    onnx_size_mb = output_path.stat().st_size / (1024 * 1024)

    print("\nFile size comparison:")
    print(f"  Input .vec file: {vec_size_mb:.2f} MB")
    print(f"  Output .onnx file: {onnx_size_mb:.2f} MB")
    if onnx_size_mb > 0:
        # Guarded so an unexpectedly empty output cannot raise ZeroDivisionError.
        print(f"  Compression ratio: {(vec_size_mb / onnx_size_mb):.2f}x")

    # Save vocabulary if requested
    if args.save_vocab:
        save_vocabulary(word_to_idx, args.save_vocab)

    print("\nConversion complete!")
    print(f"  Model metadata: {metadata}")
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
if __name__ == '__main__':
|
|
275
|
+
main()
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  # Single source of truth for where each remote resource lives.
  #
  # Every URL the cache layer fetches is built here. Caches do not
  # construct URL strings inline. Pins are resolved per repository because
  # `kotoshu/dictionaries` ships on `v1` while the other repos are on
  # `main`; assuming a single pin previously caused silent 404s on first use.
  #
  # @example
  #   registry = Kotoshu::SourceRegistry.new
  #   registry.url_for(:spelling, lang: "en", ext: "aff")
  #   # => "https://raw.githubusercontent.com/kotoshu/dictionaries/v1/en/spelling/index.aff"
  class SourceRegistry
    # Describes one remote resource: the repository it lives in, the pin
    # (branch or tag) used when no override is configured, and a format
    # template for the path below the base URL.
    Source = Struct.new(:repo, :default_pin, :template, keyword_init: true)

    DEFAULT_BASE_URL = "https://raw.githubusercontent.com/kotoshu"

    # Entries are frozen so the shared table cannot be modified at runtime.
    #
    # @return [Hash<Symbol, Source>]
    SOURCES = {
      spelling: Source.new(repo: "dictionaries", default_pin: "v1", template: "dictionaries/%<pin>s/%<lang>s/spelling/index.%<ext>s").freeze,
      grammar: Source.new(repo: "dictionaries", default_pin: "v1", template: "dictionaries/%<pin>s/%<lang>s/grammar/rules.yaml").freeze,
      dict_manifest: Source.new(repo: "dictionaries", default_pin: "v1", template: "dictionaries/%<pin>s/manifest.json").freeze,
      frequency: Source.new(repo: "frequency-list-kelly", default_pin: "main", template: "frequency-list-kelly/%<pin>s/data/%<lang>s.json").freeze,
      freq_manifest: Source.new(repo: "frequency-list-kelly", default_pin: "main", template: "frequency-list-kelly/%<pin>s/manifest.json").freeze,
      model: Source.new(repo: "models-fasttext-onnx", default_pin: "main", template: "models-fasttext-onnx/%<pin>s/models/%<lang>s/fasttext.%<lang>s.onnx").freeze,
      model_vocab: Source.new(repo: "models-fasttext-onnx", default_pin: "main", template: "models-fasttext-onnx/%<pin>s/models/%<lang>s/fasttext.%<lang>s.vocab.json").freeze,
      model_manifest: Source.new(repo: "models-fasttext-onnx", default_pin: "main", template: "models-fasttext-onnx/%<pin>s/manifest.json").freeze
    }.freeze

    # @return [String] Configured GitHub raw root (no trailing slash).
    attr_reader :base_url

    # @param base_url [String] GitHub raw root; one trailing slash is removed.
    # @param pins [Hash<String, String>, nil] Optional per-repo pin overrides
    #   keyed by repo name (e.g. `{ "dictionaries" => "v2" }`). Keys are
    #   converted to strings, so symbol keys work as well.
    def initialize(base_url: DEFAULT_BASE_URL, pins: {})
      @base_url = base_url.to_s.chomp("/")
      @pins = (pins || {}).transform_keys(&:to_s).freeze
    end

    # Build the URL of a remote resource.
    #
    # @param source_key [Symbol] One of `SOURCES.keys`.
    # @param lang [String, nil] Language code, interpolated into template.
    # @param ext [String, nil] File extension, interpolated into template.
    # @return [String] Fully-qualified URL.
    # @raise [ArgumentError] If the source is unknown, or if the source's
    #   template uses `lang`/`ext` and that value is nil or empty.
    def url_for(source_key, lang: nil, ext: nil)
      source = SOURCES.fetch(source_key) do
        raise ArgumentError, "unknown source: #{source_key.inspect}"
      end

      values = { pin: pin_for(source), lang: lang, ext: ext }
      ensure_placeholders_filled!(source_key, source.template, values)

      "#{@base_url}/#{format(source.template, values)}"
    end

    # @param source_key [Symbol]
    # @return [String] Resolved pin (override or default).
    # @raise [KeyError] If the source is unknown.
    def pin_for_source(source_key)
      pin_for(SOURCES.fetch(source_key))
    end

    # @param source_key [Symbol]
    # @return [String] Repo name (e.g. "dictionaries").
    # @raise [KeyError] If the source is unknown.
    def repo_for(source_key)
      SOURCES.fetch(source_key).repo
    end

    private

    # Pin override configured for the source's repo, else its default pin.
    def pin_for(source)
      @pins.fetch(source.repo, source.default_pin)
    end

    # Reject nil/empty values for placeholders the template actually uses.
    # Formatting nil with %s yields an empty string, which would otherwise
    # produce a malformed URL that only fails later as a 404.
    def ensure_placeholders_filled!(source_key, template, values)
      missing = values.select do |name, value|
        template.include?("%<#{name}>") && value.to_s.empty?
      end.keys
      return if missing.empty?

      raise ArgumentError,
            "source #{source_key.inspect} requires #{missing.map { |name| "#{name}:" }.join(', ')}"
    end
  end
end
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  class Spellchecker
    # Parallel file checker for concurrent spellchecking.
    #
    # Spreads the given files over a pool of worker threads that all call
    # the same underlying spellchecker. Every call to
    # {#check_files_parallel} uses its own queue and result list, so one
    # checker instance can be reused.
    #
    # @example Check files in parallel
    #   checker = ParallelChecker.new(spellchecker: spellchecker, worker_count: 4)
    #   results = checker.check_files_parallel(["file1.txt", "file2.txt"])
    class ParallelChecker
      # Default number of worker threads
      DEFAULT_WORKER_COUNT = 4

      # @return [Spellchecker] The underlying spellchecker
      attr_reader :spellchecker

      # @return [Integer] Number of worker threads
      attr_reader :worker_count

      # Create a new parallel checker.
      #
      # @param spellchecker [Spellchecker] The spellchecker to use
      # @param worker_count [Integer] Number of worker threads (default: 4)
      def initialize(spellchecker:, worker_count: DEFAULT_WORKER_COUNT)
        @spellchecker = spellchecker
        @worker_count = worker_count
        @mutex = Mutex.new
      end

      # Check multiple files in parallel.
      #
      # Files whose check raises a StandardError are reported with `warn`
      # and left out of the returned list. The remaining results are
      # returned in the order of `file_paths`.
      #
      # @param file_paths [Array<String>] Paths to files to check
      # @return [Array<Core::Models::Result::DocumentResult>] Results for each
      #   successfully checked file
      def check_files_parallel(file_paths)
        paths = Array(file_paths)
        return [] if paths.empty?

        # Per-call state: results from earlier calls never leak into this
        # one, and overlapping calls do not share a queue.
        jobs = Queue.new
        paths.each_with_index { |path, index| jobs << [index, path] }
        # A closed queue makes #pop return nil once it is drained, which
        # tells each worker to stop.
        jobs.close

        completed = []
        # Always run at least one worker, and never more than there are files.
        pool_size = @worker_count.to_i.clamp(1, paths.size)
        workers = Array.new(pool_size) { create_worker(jobs, completed) }

        # Wait for all workers to complete
        workers.each(&:join)

        completed.sort_by(&:first).map(&:last)
      end

      # Check a single file (convenience method).
      #
      # @param file_path [String] Path to file
      # @return [Core::Models::Result::DocumentResult] Check result
      def check_file(file_path)
        @spellchecker.check_file(file_path)
      end

      private

      # Create a worker thread that drains the job queue.
      #
      # @param jobs [Queue] Closed queue of `[index, path]` pairs
      # @param completed [Array] Shared list receiving `[index, result]` pairs
      # @return [Thread] Worker thread
      def create_worker(jobs, completed)
        Thread.new do
          while (job = jobs.pop)
            index, path = job
            begin
              result = @spellchecker.check_file(path)
              @mutex.synchronize { completed << [index, result] }
            rescue StandardError => e
              # Log error but continue processing other files
              warn "Error checking file #{path}: #{e.message}"
            end
          end
        end
      end
    end
  end
end
|