phrasekit 0.2.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7a0ee253439d679820d688710e836095bac788877e2d24fa3bd960451aa9bcfc
4
+ data.tar.gz: d60f38a7d5f7dc76361d3a0df96be07addd40c09e94b8161243218758f79af2a
5
+ SHA512:
6
+ metadata.gz: f7c63d27a552be412184afab038c3a3a0a53061c64db8e1c47f771563d7954872301875e6c92852f549115be93610fdac916f93552e3b73b2a3595105a86a636
7
+ data.tar.gz: de47a5bc506a9f13a07e21520b9765d6690cd66db5245aacaa152cb4b78d659a22b8a8e0c88f0fedab8b8fb73e3309454dfb56502ce74a7700284f4dbe3dc5fc
data/README.md ADDED
@@ -0,0 +1,131 @@
1
+ # PhraseKit
2
+
3
+ Ultra-fast deterministic phrase matching for Ruby, built on Rust and an Aho-Corasick automaton.
4
+
5
+ PhraseKit provides high-performance phrase recognition over token sequences, designed for search query understanding, NLP pipelines, and information extraction at scale.
6
+
7
+ ## Features
8
+
9
+ - **Deterministic matching** using Double-Array Aho-Corasick (daachorse)
10
+ - **Sub-millisecond performance** for queries with millions of phrases
11
+ - **Hot-reloadable** artifacts with zero downtime
12
+ - **Thread-safe** operations via Magnus/Rust
13
+ - **Multiple matching policies**: leftmost-longest, leftmost-first, salience-max
14
+ - **Production-ready** with health checks, stats, and observability
15
+
16
+ ## Installation
17
+
18
+ Add to your Gemfile:
19
+
20
+ ```ruby
21
+ gem 'phrasekit'
22
+ ```
23
+
24
+ Or install directly:
25
+
26
+ ```bash
27
+ gem install phrasekit
28
+ ```
29
+
30
+ ## Usage
31
+
32
+ ### Basic Setup
33
+
34
+ ```ruby
35
+ require 'phrasekit'
36
+
37
+ # Load phrase artifacts
38
+ PhraseKit.load!(
39
+ automaton_path: "/path/to/phrases.daac",
40
+ payloads_path: "/path/to/payloads.bin",
41
+ manifest_path: "/path/to/phrases.json"
42
+ )
43
+
44
+ # Match tokens
45
+ token_ids = [1012, 441, 7788, 902, 1455] # Your tokenized input
46
+ matches = PhraseKit.match_tokens(
47
+ token_ids: token_ids,
48
+ policy: :leftmost_longest, # or :leftmost_first, :salience_max
49
+ max: 32 # Maximum matches to return
50
+ )
51
+
52
+ # Returns array of matches:
53
+ # [
54
+ # {start: 1, end: 3, phrase_id: 12345, salience: 2.13, count: 314, n: 2},
55
+ # {start: 3, end: 5, phrase_id: 67890, salience: 1.82, count: 271, n: 2}
56
+ # ]
57
+ ```
58
+
59
+ ### Integration with SpellKit
60
+
61
+ PhraseKit is designed to work with SpellKit for typo correction:
62
+
63
+ ```ruby
64
+ class SearchTermExtractor
65
+ def call(text)
66
+ # 1. Tokenize
67
+ tokens = MyTokenizer.tokenize(text)
68
+
69
+ # 2. Spell correction (via SpellKit gem)
70
+ corrected = SpellKit.correct_tokens(tokens, guard: :domain)
71
+
72
+ # 3. Convert to token IDs
73
+ token_ids = MyTokenizer.to_ids(corrected)
74
+
75
+ # 4. Extract phrases
76
+ PhraseKit.match_tokens(token_ids: token_ids, policy: :leftmost_longest)
77
+ end
78
+ end
79
+ ```
80
+
81
+ ### Monitoring
82
+
83
+ ```ruby
84
+ # Check health
85
+ PhraseKit.healthcheck # Raises on issues
86
+
87
+ # Get statistics
88
+ PhraseKit.stats
89
+ # => {
90
+ # version: "pk-2025-09-25-01",
91
+ # loaded_at: Time,
92
+ # num_patterns: 1_287_345,
93
+ # heap_mb: 142.3,
94
+ # hits_total: 892341,
95
+ # p50_us: 47,
96
+ # p95_us: 189
97
+ # }
98
+ ```
99
+
100
+ ## Architecture
101
+
102
+ PhraseKit uses:
103
+ - **Rust** for core matching logic
104
+ - **Magnus** for Ruby-Rust bindings
105
+ - **Daachorse** for the Aho-Corasick automaton
106
+ - **Static linking** for reliability
107
+
108
+ ## Performance
109
+
110
+ Target performance with 1-3M phrases:
111
+ - p50 < 100µs
112
+ - p95 < 500µs
113
+ - Memory < 300MB
114
+
115
+ ## Development
116
+
117
+ ```bash
118
+ # Setup
119
+ bundle install
120
+ bundle exec rake compile
121
+
122
+ # Run tests
123
+ bundle exec rspec
124
+
125
+ # Build gem
126
+ gem build phrasekit.gemspec
127
+ ```
128
+
129
+ ## License
130
+
131
+ MIT License. See LICENSE.txt for details.
@@ -0,0 +1,45 @@
1
+ [package]
2
+ name = "phrasekit"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [lib]
7
+ crate-type = ["cdylib"]
8
+ name = "phrasekit"
9
+
10
+ [[bin]]
11
+ name = "fixture_builder"
12
+ path = "src/bin/fixture_builder.rs"
13
+
14
+ [[bin]]
15
+ name = "phrasekit_build"
16
+ path = "src/bin/phrasekit_build.rs"
17
+
18
+ [[bin]]
19
+ name = "phrasekit_mine"
20
+ path = "src/bin/phrasekit_mine.rs"
21
+
22
+ [[bin]]
23
+ name = "phrasekit_score"
24
+ path = "src/bin/phrasekit_score.rs"
25
+
26
+ [[bin]]
27
+ name = "phrasekit_tag"
28
+ path = "src/bin/phrasekit_tag.rs"
29
+
30
+ [dependencies]
31
+ magnus = { version = "0.7" }
32
+ daachorse = "1.0"
33
+ serde = { version = "1.0", features = ["derive"] }
34
+ serde_json = "1.0"
35
+ parking_lot = "0.12"
36
+ notify = "6.1"
37
+ thiserror = "1.0"
38
+ chrono = "0.4"
39
+
40
+ [dependencies.rb-sys]
41
+ version = "0.9"
42
+ features = ["stable-api-compiled-fallback"]
43
+
44
+ [dev-dependencies]
45
+ tempfile = "3.10"
@@ -0,0 +1,4 @@
1
# Extension build script: pulls in mkmf plus the rb_sys helpers and asks
# rb_sys to generate the Makefile for the Rust extension.
require "mkmf"
require "rb_sys/mkmf"

# NOTE(review): "phrasekit/phrasekit" is presumably the load path of the
# compiled extension (lib/phrasekit/phrasekit.<ext>) — confirm against the gemspec.
create_rust_makefile("phrasekit/phrasekit")
@@ -0,0 +1,131 @@
1
+ use daachorse::DoubleArrayAhoCorasick;
2
+ use serde::{Deserialize, Serialize};
3
+ use std::collections::HashMap;
4
+ use std::fs::File;
5
+ use std::path::PathBuf;
6
+
7
+ #[path = "../payload.rs"]
8
+ mod payload;
9
+
10
+ #[path = "../manifest.rs"]
11
+ mod manifest;
12
+
13
+ use manifest::Manifest;
14
+ use payload::Payload;
15
+
16
/// Token vocabulary serialized to `vocab.json` alongside the other fixtures.
#[derive(Debug, Serialize)]
struct Vocabulary {
    /// Regular tokens, keyed by token text, mapped to their numeric ID.
    tokens: HashMap<String, u32>,
    /// Reserved tokens (this binary only registers `<UNK>` with ID 0).
    special_tokens: HashMap<String, u32>,
    /// Total number of vocabulary entries reported in the file.
    vocab_size: usize,
    /// Value written after every token when patterns are encoded.
    separator_id: u32,
}
23
+
24
+ fn main() -> Result<(), Box<dyn std::error::Error>> {
25
+ let args: Vec<String> = std::env::args().collect();
26
+ let output_dir = if args.len() > 1 {
27
+ PathBuf::from(&args[1])
28
+ } else {
29
+ PathBuf::from("spec/fixtures")
30
+ };
31
+
32
+ std::fs::create_dir_all(&output_dir)?;
33
+
34
+ println!("Building test fixtures in: {}", output_dir.display());
35
+
36
+ // Define test patterns as byte sequences (token_id + separator)
37
+ let separator: u32 = 4294967294;
38
+
39
+ // Pattern 0: [100, 101] - "machine learning"
40
+ let pattern0 = encode_tokens(&[100, 101], separator);
41
+
42
+ // Pattern 1: [200, 101] - "deep learning"
43
+ let pattern1 = encode_tokens(&[200, 101], separator);
44
+
45
+ // Pattern 2: [100, 101, 102] - "machine learning algorithms"
46
+ let pattern2 = encode_tokens(&[100, 101, 102], separator);
47
+
48
+ let patterns = vec![pattern0, pattern1, pattern2];
49
+ let num_patterns = patterns.len();
50
+
51
+ // Build automaton
52
+ println!("Building automaton with {} patterns", num_patterns);
53
+ let automaton: DoubleArrayAhoCorasick<u32> = DoubleArrayAhoCorasick::new(patterns)
54
+ .map_err(|e| format!("Failed to build automaton: {:?}", e))?;
55
+
56
+ // Serialize automaton
57
+ let automaton_bytes = automaton.serialize();
58
+ let automaton_path = output_dir.join("phrases.daac");
59
+ std::fs::write(&automaton_path, &automaton_bytes)?;
60
+ println!("✓ Wrote automaton ({} bytes) to {}", automaton_bytes.len(), automaton_path.display());
61
+
62
+ // Create payloads
63
+ let payloads = vec![
64
+ Payload::new(100, 2.5, 150, 2), // "machine learning" - [100, 101]
65
+ Payload::new(200, 2.0, 100, 2), // "deep learning" - [200, 101]
66
+ Payload::new(300, 3.0, 200, 3), // "machine learning algorithms" - [100, 101, 102]
67
+ ];
68
+
69
+ // Write payloads
70
+ let payloads_path = output_dir.join("payloads.bin");
71
+ let mut payloads_file = File::create(&payloads_path)?;
72
+ for payload in &payloads {
73
+ payload.write_to(&mut payloads_file)?;
74
+ }
75
+ println!("✓ Wrote {} payloads to {}", payloads.len(), payloads_path.display());
76
+
77
+ // Create manifest
78
+ let manifest = Manifest {
79
+ version: "test-v1".to_string(),
80
+ tokenizer: "test-tokenizer".to_string(),
81
+ num_patterns: num_patterns,
82
+ min_count: Some(10),
83
+ salience_threshold: Some(1.0),
84
+ built_at: "2025-09-25T00:00:00Z".to_string(),
85
+ separator_id: separator,
86
+ };
87
+
88
+ let manifest_path = output_dir.join("manifest.json");
89
+ let manifest_json = serde_json::to_string_pretty(&manifest)?;
90
+ std::fs::write(&manifest_path, manifest_json)?;
91
+ println!("✓ Wrote manifest to {}", manifest_path.display());
92
+
93
+ // Create vocabulary
94
+ let mut tokens = HashMap::new();
95
+ tokens.insert("machine".to_string(), 100);
96
+ tokens.insert("learning".to_string(), 101);
97
+ tokens.insert("algorithms".to_string(), 102);
98
+ tokens.insert("deep".to_string(), 200);
99
+
100
+ let mut special_tokens = HashMap::new();
101
+ special_tokens.insert("<UNK>".to_string(), 0);
102
+
103
+ let vocabulary = Vocabulary {
104
+ tokens,
105
+ special_tokens,
106
+ vocab_size: 5,
107
+ separator_id: separator,
108
+ };
109
+
110
+ let vocab_path = output_dir.join("vocab.json");
111
+ let vocab_json = serde_json::to_string_pretty(&vocabulary)?;
112
+ std::fs::write(&vocab_path, vocab_json)?;
113
+ println!("✓ Wrote vocabulary to {}", vocab_path.display());
114
+
115
+ println!("\n✅ Test fixtures generated successfully!");
116
+ println!("\nTest patterns:");
117
+ println!(" Pattern 0: tokens [100, 101] → phrase_id 100 (salience 2.5) - 'machine learning'");
118
+ println!(" Pattern 1: tokens [200, 101] → phrase_id 200 (salience 2.0) - 'deep learning'");
119
+ println!(" Pattern 2: tokens [100, 101, 102] → phrase_id 300 (salience 3.0) - 'machine learning algorithms'");
120
+
121
+ Ok(())
122
+ }
123
+
124
/// Encodes a token sequence as bytes for use as an automaton pattern.
///
/// Each token ID is emitted as four little-endian bytes and is immediately
/// followed by the four little-endian bytes of `separator`, so the result is
/// `8 * tokens.len()` bytes long (empty for an empty slice).
fn encode_tokens(tokens: &[u32], separator: u32) -> Vec<u8> {
    tokens
        .iter()
        .flat_map(|token| [*token, separator])
        .flat_map(u32::to_le_bytes)
        .collect()
}
@@ -0,0 +1,326 @@
1
+ use daachorse::DoubleArrayAhoCorasick;
2
+ use serde::{Deserialize, Serialize};
3
+ use std::collections::{HashMap, HashSet};
4
+ use std::fs::File;
5
+ use std::io::{BufRead, BufReader, Write};
6
+ use std::path::{Path, PathBuf};
7
+
8
+ #[path = "../payload.rs"]
9
+ mod payload;
10
+
11
+ #[path = "../manifest.rs"]
12
+ mod manifest;
13
+
14
+ use manifest::Manifest;
15
+ use payload::Payload;
16
+
17
/// One phrase record as parsed from a single line of the input JSONL file.
#[derive(Debug, Deserialize)]
struct PhraseInput {
    /// Token texts making up the phrase, in order.
    tokens: Vec<String>,
    /// Caller-assigned phrase identifier; duplicates are rejected during loading.
    phrase_id: u32,
    /// Salience score, compared against the configured `salience_threshold`.
    salience: f32,
    /// Occurrence count, compared against the configured `min_count`.
    /// The JSON key `domain_count` is accepted as an alias.
    #[serde(alias = "domain_count")]
    count: u32,
}
25
+
26
/// A validated phrase after its token texts have been replaced by vocabulary IDs.
struct ProcessedPhrase {
    /// Vocabulary IDs of the phrase's tokens, in order.
    token_ids: Vec<u32>,
    /// Identifier copied from the input record.
    phrase_id: u32,
    /// Salience copied from the input record.
    salience: f32,
    /// Count copied from the input record.
    count: u32,
    /// Number of tokens in the phrase, stored as a `u8`.
    length: u8,
}
33
+
34
/// Build settings deserialized from the config JSON file.
#[derive(Debug, Deserialize)]
struct BuildConfig {
    /// Artifact version string, copied into the manifest.
    version: String,
    /// Tokenizer name, copied into the manifest.
    tokenizer: String,
    /// Value appended after every token when patterns are encoded.
    separator_id: u32,
    /// Optional lower bound: phrases whose count is below it are filtered out.
    #[serde(default)]
    min_count: Option<u32>,
    /// Optional lower bound: phrases whose salience is below it are filtered out.
    #[serde(default)]
    salience_threshold: Option<f32>,
}
44
+
45
/// Counters collected while loading and validating the input phrases.
#[derive(Debug)]
struct BuildStats {
    /// Number of input lines read.
    total_input: usize,
    /// Phrases dropped because their count was below `min_count`.
    filtered_low_count: usize,
    /// Phrases dropped because their salience was below `salience_threshold`.
    filtered_low_salience: usize,
    /// Phrases skipped because their `phrase_id` had already been seen.
    duplicate_phrase_ids: usize,
    /// Invalid-token findings (empty token list or empty token text).
    invalid_tokens: usize,
    /// Phrases accepted for the build.
    built: usize,
}
54
+
55
/// Token vocabulary serialized to `vocab.json` in the output directory.
#[derive(Debug, Serialize)]
struct Vocabulary {
    /// Regular tokens, keyed by lowercased token text, mapped to their numeric ID.
    tokens: HashMap<String, u32>,
    /// Reserved tokens (`<UNK>` is registered with ID 0).
    special_tokens: HashMap<String, u32>,
    /// Number of regular tokens plus number of special tokens.
    vocab_size: usize,
    /// Separator value taken from the build config.
    separator_id: u32,
}
62
+
63
+ fn main() -> Result<(), Box<dyn std::error::Error>> {
64
+ let args: Vec<String> = std::env::args().collect();
65
+
66
+ if args.len() < 4 {
67
+ eprintln!("Usage: phrasekit_build <input.jsonl> <config.json> <output_dir>");
68
+ eprintln!("\nExample:");
69
+ eprintln!(" phrasekit_build phrases.jsonl config.json ./artifacts/");
70
+ std::process::exit(1);
71
+ }
72
+
73
+ let input_path = &args[1];
74
+ let config_path = &args[2];
75
+ let output_dir = PathBuf::from(&args[3]);
76
+
77
+ println!("📦 PhraseKit Artifact Builder");
78
+ println!("════════════════════════════════════════");
79
+ println!("Input: {}", input_path);
80
+ println!("Config: {}", config_path);
81
+ println!("Output: {}", output_dir.display());
82
+ println!();
83
+
84
+ // Load config
85
+ let config = load_config(config_path)?;
86
+ println!("✓ Loaded config: {} (tokenizer: {})", config.version, config.tokenizer);
87
+
88
+ // Create output directory
89
+ std::fs::create_dir_all(&output_dir)?;
90
+
91
+ // Load and validate phrases
92
+ let (text_phrases, stats, unique_tokens) = load_and_validate_phrases(input_path, &config)?;
93
+
94
+ println!("\n📊 Build Statistics:");
95
+ println!(" Total input phrases: {}", stats.total_input);
96
+ if stats.filtered_low_count > 0 {
97
+ println!(" Filtered (low count): {}", stats.filtered_low_count);
98
+ }
99
+ if stats.filtered_low_salience > 0 {
100
+ println!(" Filtered (low salience): {}", stats.filtered_low_salience);
101
+ }
102
+ if stats.duplicate_phrase_ids > 0 {
103
+ println!(" Skipped (duplicate IDs): {}", stats.duplicate_phrase_ids);
104
+ }
105
+ if stats.invalid_tokens > 0 {
106
+ println!(" Skipped (invalid tokens): {}", stats.invalid_tokens);
107
+ }
108
+ println!(" Built patterns: {}", stats.built);
109
+
110
+ if text_phrases.is_empty() {
111
+ return Err("No valid phrases to build".into());
112
+ }
113
+
114
+ // Build vocabulary and assign token IDs
115
+ println!("\n📚 Building vocabulary...");
116
+ let vocabulary = build_vocabulary(unique_tokens, config.separator_id);
117
+ println!(" ✓ Built vocabulary ({} tokens)", vocabulary.vocab_size);
118
+
119
+ // Convert text tokens to IDs
120
+ let mut phrases: Vec<ProcessedPhrase> = Vec::new();
121
+ for phrase in text_phrases {
122
+ let token_ids: Vec<u32> = phrase.tokens.iter()
123
+ .map(|t| *vocabulary.tokens.get(&t.to_lowercase()).unwrap_or(&0))
124
+ .collect();
125
+
126
+ phrases.push(ProcessedPhrase {
127
+ token_ids,
128
+ phrase_id: phrase.phrase_id,
129
+ salience: phrase.salience,
130
+ count: phrase.count,
131
+ length: phrase.tokens.len() as u8,
132
+ });
133
+ }
134
+
135
+ // Build automaton
136
+ println!("\n🔨 Building automaton...");
137
+ let patterns: Vec<Vec<u8>> = phrases.iter()
138
+ .map(|p| encode_tokens(&p.token_ids, config.separator_id))
139
+ .collect();
140
+
141
+ let automaton: DoubleArrayAhoCorasick<u32> = DoubleArrayAhoCorasick::new(patterns)
142
+ .map_err(|e| format!("Failed to build automaton: {:?}", e))?;
143
+
144
+ let automaton_bytes = automaton.serialize();
145
+ let automaton_path = output_dir.join("phrases.daac");
146
+ std::fs::write(&automaton_path, &automaton_bytes)?;
147
+ println!(" ✓ Wrote automaton ({} bytes) to {}", automaton_bytes.len(), automaton_path.display());
148
+
149
+ // Write payloads
150
+ println!("\n💾 Writing payloads...");
151
+ let payloads: Vec<Payload> = phrases.iter()
152
+ .map(|p| Payload::new(p.phrase_id, p.salience, p.count, p.length))
153
+ .collect();
154
+
155
+ let payloads_path = output_dir.join("payloads.bin");
156
+ let mut payloads_file = File::create(&payloads_path)?;
157
+ for payload in &payloads {
158
+ payload.write_to(&mut payloads_file)?;
159
+ }
160
+ let payloads_size = payloads.len() * 17;
161
+ println!(" ✓ Wrote {} payloads ({} bytes) to {}", payloads.len(), payloads_size, payloads_path.display());
162
+
163
+ // Generate manifest with checksums
164
+ println!("\n📝 Generating manifest...");
165
+ let manifest = Manifest {
166
+ version: config.version.clone(),
167
+ tokenizer: config.tokenizer.clone(),
168
+ num_patterns: phrases.len(),
169
+ min_count: config.min_count,
170
+ salience_threshold: config.salience_threshold,
171
+ built_at: chrono::Utc::now().to_rfc3339(),
172
+ separator_id: config.separator_id,
173
+ };
174
+
175
+ let manifest_path = output_dir.join("manifest.json");
176
+ let manifest_json = serde_json::to_string_pretty(&manifest)?;
177
+ std::fs::write(&manifest_path, manifest_json)?;
178
+ println!(" ✓ Wrote manifest to {}", manifest_path.display());
179
+
180
+ // Write vocabulary
181
+ println!("\n💾 Writing vocabulary...");
182
+ let vocab_path = output_dir.join("vocab.json");
183
+ let vocab_json = serde_json::to_string_pretty(&vocabulary)?;
184
+ std::fs::write(&vocab_path, vocab_json)?;
185
+ println!(" ✓ Wrote vocabulary ({} tokens) to {}", vocabulary.vocab_size, vocab_path.display());
186
+
187
+ // Summary
188
+ println!("\n✅ Build complete!");
189
+ println!("\nArtifacts:");
190
+ println!(" {} ({} bytes)", automaton_path.display(), automaton_bytes.len());
191
+ println!(" {} ({} bytes)", payloads_path.display(), payloads_size);
192
+ println!(" {}", manifest_path.display());
193
+ println!(" {}", vocab_path.display());
194
+
195
+ println!("\n🚀 To use in PhraseKit:");
196
+ println!(" PhraseKit.load!(");
197
+ println!(" automaton_path: {:?},", automaton_path.to_str().unwrap());
198
+ println!(" payloads_path: {:?},", payloads_path.to_str().unwrap());
199
+ println!(" manifest_path: {:?},", manifest_path.to_str().unwrap());
200
+ println!(" vocab_path: {:?}", vocab_path.to_str().unwrap());
201
+ println!(" )");
202
+
203
+ Ok(())
204
+ }
205
+
206
+ fn load_config(path: &str) -> Result<BuildConfig, Box<dyn std::error::Error>> {
207
+ let file = File::open(path)?;
208
+ let config: BuildConfig = serde_json::from_reader(file)?;
209
+ Ok(config)
210
+ }
211
+
212
+ fn load_and_validate_phrases(
213
+ path: &str,
214
+ config: &BuildConfig,
215
+ ) -> Result<(Vec<PhraseInput>, BuildStats, HashSet<String>), Box<dyn std::error::Error>> {
216
+ let file = File::open(path)?;
217
+ let reader = BufReader::new(file);
218
+
219
+ let mut phrases = Vec::new();
220
+ let mut seen_ids = HashSet::new();
221
+ let mut unique_tokens: HashSet<String> = HashSet::new();
222
+ let mut stats = BuildStats {
223
+ total_input: 0,
224
+ filtered_low_count: 0,
225
+ filtered_low_salience: 0,
226
+ duplicate_phrase_ids: 0,
227
+ invalid_tokens: 0,
228
+ built: 0,
229
+ };
230
+
231
+ println!("\n📖 Loading phrases...");
232
+
233
+ for (line_num, line) in reader.lines().enumerate() {
234
+ let line = line?;
235
+ stats.total_input += 1;
236
+
237
+ let phrase: PhraseInput = match serde_json::from_str(&line) {
238
+ Ok(p) => p,
239
+ Err(e) => {
240
+ eprintln!("⚠️ Line {}: Failed to parse: {}", line_num + 1, e);
241
+ continue;
242
+ }
243
+ };
244
+
245
+ // Validate
246
+ if let Some(min_count) = config.min_count {
247
+ if phrase.count < min_count {
248
+ stats.filtered_low_count += 1;
249
+ continue;
250
+ }
251
+ }
252
+
253
+ if let Some(threshold) = config.salience_threshold {
254
+ if phrase.salience < threshold {
255
+ stats.filtered_low_salience += 1;
256
+ continue;
257
+ }
258
+ }
259
+
260
+ if phrase.tokens.is_empty() {
261
+ eprintln!("⚠️ Line {}: Empty token sequence", line_num + 1);
262
+ stats.invalid_tokens += 1;
263
+ continue;
264
+ }
265
+
266
+ for token in &phrase.tokens {
267
+ if token.is_empty() {
268
+ eprintln!("⚠️ Line {}: Empty token", line_num + 1);
269
+ stats.invalid_tokens += 1;
270
+ continue;
271
+ }
272
+ }
273
+
274
+ if !seen_ids.insert(phrase.phrase_id) {
275
+ eprintln!("⚠️ Line {}: Duplicate phrase_id {}", line_num + 1, phrase.phrase_id);
276
+ stats.duplicate_phrase_ids += 1;
277
+ continue;
278
+ }
279
+
280
+ for token in &phrase.tokens {
281
+ unique_tokens.insert(token.to_lowercase());
282
+ }
283
+
284
+ phrases.push(phrase);
285
+ stats.built += 1;
286
+
287
+ if stats.total_input % 10000 == 0 {
288
+ println!(" Processed {} lines...", stats.total_input);
289
+ }
290
+ }
291
+
292
+ println!(" ✓ Loaded {} phrases", stats.total_input);
293
+
294
+ Ok((phrases, stats, unique_tokens))
295
+ }
296
+
297
/// Serializes token IDs into the byte pattern fed to the automaton.
///
/// For every token, its four little-endian bytes are written, followed by the
/// four little-endian bytes of `separator`. An empty slice yields an empty vector.
fn encode_tokens(tokens: &[u32], separator: u32) -> Vec<u8> {
    let separator_bytes = separator.to_le_bytes();
    tokens.iter().fold(Vec::new(), |mut encoded, id| {
        encoded.extend_from_slice(&id.to_le_bytes());
        encoded.extend_from_slice(&separator_bytes);
        encoded
    })
}
305
+
306
+ fn build_vocabulary(unique_tokens: HashSet<String>, separator_id: u32) -> Vocabulary {
307
+ let mut tokens = HashMap::new();
308
+ let mut sorted_tokens: Vec<String> = unique_tokens.into_iter().collect();
309
+ sorted_tokens.sort();
310
+
311
+ for (idx, token) in sorted_tokens.iter().enumerate() {
312
+ tokens.insert(token.clone(), (idx + 1) as u32);
313
+ }
314
+
315
+ let mut special_tokens = HashMap::new();
316
+ special_tokens.insert("<UNK>".to_string(), 0);
317
+
318
+ let vocab_size = tokens.len() + special_tokens.len();
319
+
320
+ Vocabulary {
321
+ tokens,
322
+ special_tokens,
323
+ vocab_size,
324
+ separator_id,
325
+ }
326
+ }