phrasekit 0.2.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +131 -0
- data/ext/phrasekit/Cargo.toml +45 -0
- data/ext/phrasekit/extconf.rb +4 -0
- data/ext/phrasekit/src/bin/fixture_builder.rs +131 -0
- data/ext/phrasekit/src/bin/phrasekit_build.rs +326 -0
- data/ext/phrasekit/src/bin/phrasekit_mine.rs +199 -0
- data/ext/phrasekit/src/bin/phrasekit_score.rs +298 -0
- data/ext/phrasekit/src/bin/phrasekit_tag.rs +320 -0
- data/ext/phrasekit/src/lib.rs +104 -0
- data/ext/phrasekit/src/manifest.rs +88 -0
- data/ext/phrasekit/src/matcher.rs +227 -0
- data/ext/phrasekit/src/payload.rs +95 -0
- data/ext/phrasekit/src/policy.rs +190 -0
- data/lib/phrasekit/3.1/phrasekit.bundle +0 -0
- data/lib/phrasekit/3.2/phrasekit.bundle +0 -0
- data/lib/phrasekit/3.3/phrasekit.bundle +0 -0
- data/lib/phrasekit/3.4/phrasekit.bundle +0 -0
- data/lib/phrasekit/miner.rb +74 -0
- data/lib/phrasekit/scorer.rb +92 -0
- data/lib/phrasekit/tagger.rb +100 -0
- data/lib/phrasekit/version.rb +3 -0
- data/lib/phrasekit.rb +100 -0
- data/lib/spellkit_stub.rb +80 -0
- metadata +150 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 20b2c07b07fbbe98279a5aed095047c990c90bdf2ecb0a51ccf997b8951aca75
|
|
4
|
+
data.tar.gz: 4b857ea8548706f2df6ca2e146f9cf981d772452fc9f22bc09105c89aab96c21
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: af76f27a17d1356d91cb06ea45502700512eb5e55ea6c609fd10925a64afecb681a4bdec3a6b05a8fc5b35bbd4832825db9468b7ab863fc0269accceba069ca0
|
|
7
|
+
data.tar.gz: 14c7d437a0fd03d2b6981f68da3a76c3b5a6a72509ccb802582ba794bb438773d3a1aed6e2b7f6de176b1749e2e05054b1432be9f2a38f8a8bd032f75313cdf9
|
data/README.md
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# PhraseKit
|
|
2
|
+
|
|
3
|
+
Ultra-fast deterministic phrase matching for Ruby using Rust and Aho-Corasick automaton.
|
|
4
|
+
|
|
5
|
+
PhraseKit provides high-performance phrase recognition over token sequences, designed for search query understanding, NLP pipelines, and information extraction at scale.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Deterministic matching** using Double-Array Aho-Corasick (daachorse)
|
|
10
|
+
- **Sub-millisecond performance** for queries with millions of phrases
|
|
11
|
+
- **Hot-reloadable** artifacts with zero downtime
|
|
12
|
+
- **Thread-safe** operations via Magnus/Rust
|
|
13
|
+
- **Multiple matching policies**: leftmost-longest, leftmost-first, salience-max
|
|
14
|
+
- **Production-ready** with health checks, stats, and observability
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
Add to your Gemfile:
|
|
19
|
+
|
|
20
|
+
```ruby
|
|
21
|
+
gem 'phrasekit'
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Or install directly:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
gem install phrasekit
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Usage
|
|
31
|
+
|
|
32
|
+
### Basic Setup
|
|
33
|
+
|
|
34
|
+
```ruby
|
|
35
|
+
require 'phrasekit'
|
|
36
|
+
|
|
37
|
+
# Load phrase artifacts
|
|
38
|
+
PhraseKit.load!(
|
|
39
|
+
automaton_path: "/path/to/phrases.daac",
|
|
40
|
+
payloads_path: "/path/to/payloads.bin",
|
|
41
|
+
manifest_path: "/path/to/phrases.json"
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
# Match tokens
|
|
45
|
+
token_ids = [1012, 441, 7788, 902, 1455] # Your tokenized input
|
|
46
|
+
matches = PhraseKit.match_tokens(
|
|
47
|
+
token_ids: token_ids,
|
|
48
|
+
policy: :leftmost_longest, # or :leftmost_first, :salience_max
|
|
49
|
+
max: 32 # Maximum matches to return
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Returns array of matches:
|
|
53
|
+
# [
|
|
54
|
+
# {start: 1, end: 3, phrase_id: 12345, salience: 2.13, count: 314, n: 2},
|
|
55
|
+
# {start: 3, end: 5, phrase_id: 67890, salience: 1.82, count: 271, n: 2}
|
|
56
|
+
# ]
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Integration with SpellKit
|
|
60
|
+
|
|
61
|
+
PhraseKit is designed to work with SpellKit for typo correction:
|
|
62
|
+
|
|
63
|
+
```ruby
|
|
64
|
+
class SearchTermExtractor
|
|
65
|
+
def call(text)
|
|
66
|
+
# 1. Tokenize
|
|
67
|
+
tokens = MyTokenizer.tokenize(text)
|
|
68
|
+
|
|
69
|
+
# 2. Spell correction (via SpellKit gem)
|
|
70
|
+
corrected = SpellKit.correct_tokens(tokens, guard: :domain)
|
|
71
|
+
|
|
72
|
+
# 3. Convert to token IDs
|
|
73
|
+
token_ids = MyTokenizer.to_ids(corrected)
|
|
74
|
+
|
|
75
|
+
# 4. Extract phrases
|
|
76
|
+
PhraseKit.match_tokens(token_ids: token_ids, policy: :leftmost_longest)
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Monitoring
|
|
82
|
+
|
|
83
|
+
```ruby
|
|
84
|
+
# Check health
|
|
85
|
+
PhraseKit.healthcheck # Raises on issues
|
|
86
|
+
|
|
87
|
+
# Get statistics
|
|
88
|
+
PhraseKit.stats
|
|
89
|
+
# => {
|
|
90
|
+
# version: "pk-2025-09-25-01",
|
|
91
|
+
# loaded_at: Time,
|
|
92
|
+
# num_patterns: 1_287_345,
|
|
93
|
+
# heap_mb: 142.3,
|
|
94
|
+
# hits_total: 892341,
|
|
95
|
+
# p50_us: 47,
|
|
96
|
+
# p95_us: 189
|
|
97
|
+
# }
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Architecture
|
|
101
|
+
|
|
102
|
+
PhraseKit uses:
|
|
103
|
+
- **Rust** for core matching logic
|
|
104
|
+
- **Magnus** for Ruby-Rust bindings
|
|
105
|
+
- **Daachorse** for the Aho-Corasick automaton
|
|
106
|
+
- **Static linking** for reliability
|
|
107
|
+
|
|
108
|
+
## Performance
|
|
109
|
+
|
|
110
|
+
Target performance with 1-3M phrases:
|
|
111
|
+
- p50 < 100µs
|
|
112
|
+
- p95 < 500µs
|
|
113
|
+
- Memory < 300MB
|
|
114
|
+
|
|
115
|
+
## Development
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
# Setup
|
|
119
|
+
bundle install
|
|
120
|
+
bundle exec rake compile
|
|
121
|
+
|
|
122
|
+
# Run tests
|
|
123
|
+
bundle exec rspec
|
|
124
|
+
|
|
125
|
+
# Build gem
|
|
126
|
+
gem build phrasekit.gemspec
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## License
|
|
130
|
+
|
|
131
|
+
MIT License. See LICENSE.txt for details.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "phrasekit"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
|
|
6
|
+
[lib]
|
|
7
|
+
crate-type = ["cdylib"]
|
|
8
|
+
name = "phrasekit"
|
|
9
|
+
|
|
10
|
+
[[bin]]
|
|
11
|
+
name = "fixture_builder"
|
|
12
|
+
path = "src/bin/fixture_builder.rs"
|
|
13
|
+
|
|
14
|
+
[[bin]]
|
|
15
|
+
name = "phrasekit_build"
|
|
16
|
+
path = "src/bin/phrasekit_build.rs"
|
|
17
|
+
|
|
18
|
+
[[bin]]
|
|
19
|
+
name = "phrasekit_mine"
|
|
20
|
+
path = "src/bin/phrasekit_mine.rs"
|
|
21
|
+
|
|
22
|
+
[[bin]]
|
|
23
|
+
name = "phrasekit_score"
|
|
24
|
+
path = "src/bin/phrasekit_score.rs"
|
|
25
|
+
|
|
26
|
+
[[bin]]
|
|
27
|
+
name = "phrasekit_tag"
|
|
28
|
+
path = "src/bin/phrasekit_tag.rs"
|
|
29
|
+
|
|
30
|
+
[dependencies]
|
|
31
|
+
magnus = { version = "0.7" }
|
|
32
|
+
daachorse = "1.0"
|
|
33
|
+
serde = { version = "1.0", features = ["derive"] }
|
|
34
|
+
serde_json = "1.0"
|
|
35
|
+
parking_lot = "0.12"
|
|
36
|
+
notify = "6.1"
|
|
37
|
+
thiserror = "1.0"
|
|
38
|
+
chrono = "0.4"
|
|
39
|
+
|
|
40
|
+
[dependencies.rb-sys]
|
|
41
|
+
version = "0.9"
|
|
42
|
+
features = ["stable-api-compiled-fallback"]
|
|
43
|
+
|
|
44
|
+
[dev-dependencies]
|
|
45
|
+
tempfile = "3.10"
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
use daachorse::DoubleArrayAhoCorasick;
|
|
2
|
+
use serde::{Deserialize, Serialize};
|
|
3
|
+
use std::collections::HashMap;
|
|
4
|
+
use std::fs::File;
|
|
5
|
+
use std::path::PathBuf;
|
|
6
|
+
|
|
7
|
+
#[path = "../payload.rs"]
|
|
8
|
+
mod payload;
|
|
9
|
+
|
|
10
|
+
#[path = "../manifest.rs"]
|
|
11
|
+
mod manifest;
|
|
12
|
+
|
|
13
|
+
use manifest::Manifest;
|
|
14
|
+
use payload::Payload;
|
|
15
|
+
|
|
16
|
+
#[derive(Debug, Serialize)]
|
|
17
|
+
struct Vocabulary {
|
|
18
|
+
tokens: HashMap<String, u32>,
|
|
19
|
+
special_tokens: HashMap<String, u32>,
|
|
20
|
+
vocab_size: usize,
|
|
21
|
+
separator_id: u32,
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
25
|
+
let args: Vec<String> = std::env::args().collect();
|
|
26
|
+
let output_dir = if args.len() > 1 {
|
|
27
|
+
PathBuf::from(&args[1])
|
|
28
|
+
} else {
|
|
29
|
+
PathBuf::from("spec/fixtures")
|
|
30
|
+
};
|
|
31
|
+
|
|
32
|
+
std::fs::create_dir_all(&output_dir)?;
|
|
33
|
+
|
|
34
|
+
println!("Building test fixtures in: {}", output_dir.display());
|
|
35
|
+
|
|
36
|
+
// Define test patterns as byte sequences (token_id + separator)
|
|
37
|
+
let separator: u32 = 4294967294;
|
|
38
|
+
|
|
39
|
+
// Pattern 0: [100, 101] - "machine learning"
|
|
40
|
+
let pattern0 = encode_tokens(&[100, 101], separator);
|
|
41
|
+
|
|
42
|
+
// Pattern 1: [200, 101] - "deep learning"
|
|
43
|
+
let pattern1 = encode_tokens(&[200, 101], separator);
|
|
44
|
+
|
|
45
|
+
// Pattern 2: [100, 101, 102] - "machine learning algorithms"
|
|
46
|
+
let pattern2 = encode_tokens(&[100, 101, 102], separator);
|
|
47
|
+
|
|
48
|
+
let patterns = vec![pattern0, pattern1, pattern2];
|
|
49
|
+
let num_patterns = patterns.len();
|
|
50
|
+
|
|
51
|
+
// Build automaton
|
|
52
|
+
println!("Building automaton with {} patterns", num_patterns);
|
|
53
|
+
let automaton: DoubleArrayAhoCorasick<u32> = DoubleArrayAhoCorasick::new(patterns)
|
|
54
|
+
.map_err(|e| format!("Failed to build automaton: {:?}", e))?;
|
|
55
|
+
|
|
56
|
+
// Serialize automaton
|
|
57
|
+
let automaton_bytes = automaton.serialize();
|
|
58
|
+
let automaton_path = output_dir.join("phrases.daac");
|
|
59
|
+
std::fs::write(&automaton_path, &automaton_bytes)?;
|
|
60
|
+
println!("â Wrote automaton ({} bytes) to {}", automaton_bytes.len(), automaton_path.display());
|
|
61
|
+
|
|
62
|
+
// Create payloads
|
|
63
|
+
let payloads = vec![
|
|
64
|
+
Payload::new(100, 2.5, 150, 2), // "machine learning" - [100, 101]
|
|
65
|
+
Payload::new(200, 2.0, 100, 2), // "deep learning" - [200, 101]
|
|
66
|
+
Payload::new(300, 3.0, 200, 3), // "machine learning algorithms" - [100, 101, 102]
|
|
67
|
+
];
|
|
68
|
+
|
|
69
|
+
// Write payloads
|
|
70
|
+
let payloads_path = output_dir.join("payloads.bin");
|
|
71
|
+
let mut payloads_file = File::create(&payloads_path)?;
|
|
72
|
+
for payload in &payloads {
|
|
73
|
+
payload.write_to(&mut payloads_file)?;
|
|
74
|
+
}
|
|
75
|
+
println!("â Wrote {} payloads to {}", payloads.len(), payloads_path.display());
|
|
76
|
+
|
|
77
|
+
// Create manifest
|
|
78
|
+
let manifest = Manifest {
|
|
79
|
+
version: "test-v1".to_string(),
|
|
80
|
+
tokenizer: "test-tokenizer".to_string(),
|
|
81
|
+
num_patterns: num_patterns,
|
|
82
|
+
min_count: Some(10),
|
|
83
|
+
salience_threshold: Some(1.0),
|
|
84
|
+
built_at: "2025-09-25T00:00:00Z".to_string(),
|
|
85
|
+
separator_id: separator,
|
|
86
|
+
};
|
|
87
|
+
|
|
88
|
+
let manifest_path = output_dir.join("manifest.json");
|
|
89
|
+
let manifest_json = serde_json::to_string_pretty(&manifest)?;
|
|
90
|
+
std::fs::write(&manifest_path, manifest_json)?;
|
|
91
|
+
println!("â Wrote manifest to {}", manifest_path.display());
|
|
92
|
+
|
|
93
|
+
// Create vocabulary
|
|
94
|
+
let mut tokens = HashMap::new();
|
|
95
|
+
tokens.insert("machine".to_string(), 100);
|
|
96
|
+
tokens.insert("learning".to_string(), 101);
|
|
97
|
+
tokens.insert("algorithms".to_string(), 102);
|
|
98
|
+
tokens.insert("deep".to_string(), 200);
|
|
99
|
+
|
|
100
|
+
let mut special_tokens = HashMap::new();
|
|
101
|
+
special_tokens.insert("<UNK>".to_string(), 0);
|
|
102
|
+
|
|
103
|
+
let vocabulary = Vocabulary {
|
|
104
|
+
tokens,
|
|
105
|
+
special_tokens,
|
|
106
|
+
vocab_size: 5,
|
|
107
|
+
separator_id: separator,
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
let vocab_path = output_dir.join("vocab.json");
|
|
111
|
+
let vocab_json = serde_json::to_string_pretty(&vocabulary)?;
|
|
112
|
+
std::fs::write(&vocab_path, vocab_json)?;
|
|
113
|
+
println!("â Wrote vocabulary to {}", vocab_path.display());
|
|
114
|
+
|
|
115
|
+
println!("\nâ
Test fixtures generated successfully!");
|
|
116
|
+
println!("\nTest patterns:");
|
|
117
|
+
println!(" Pattern 0: tokens [100, 101] â phrase_id 100 (salience 2.5) - 'machine learning'");
|
|
118
|
+
println!(" Pattern 1: tokens [200, 101] â phrase_id 200 (salience 2.0) - 'deep learning'");
|
|
119
|
+
println!(" Pattern 2: tokens [100, 101, 102] â phrase_id 300 (salience 3.0) - 'machine learning algorithms'");
|
|
120
|
+
|
|
121
|
+
Ok(())
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
/// Encodes a token-ID sequence as the byte pattern handed to the automaton builder.
///
/// Each token is written as 4 little-endian bytes and is immediately followed by the
/// 4 little-endian bytes of `separator`, so the result is `tokens.len() * 8` bytes long.
/// An empty `tokens` slice yields an empty vector.
fn encode_tokens(tokens: &[u32], separator: u32) -> Vec<u8> {
    const BYTES_PER_TOKEN: usize = 2 * std::mem::size_of::<u32>();

    // The separator is the same for every token, so convert it once.
    let separator_bytes = separator.to_le_bytes();

    // The output length is known up front; reserve it to avoid regrowing the buffer.
    let mut bytes = Vec::with_capacity(tokens.len() * BYTES_PER_TOKEN);
    for token in tokens {
        bytes.extend_from_slice(&token.to_le_bytes());
        bytes.extend_from_slice(&separator_bytes);
    }
    bytes
}
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
use daachorse::DoubleArrayAhoCorasick;
|
|
2
|
+
use serde::{Deserialize, Serialize};
|
|
3
|
+
use std::collections::{HashMap, HashSet};
|
|
4
|
+
use std::fs::File;
|
|
5
|
+
use std::io::{BufRead, BufReader, Write};
|
|
6
|
+
use std::path::{Path, PathBuf};
|
|
7
|
+
|
|
8
|
+
#[path = "../payload.rs"]
|
|
9
|
+
mod payload;
|
|
10
|
+
|
|
11
|
+
#[path = "../manifest.rs"]
|
|
12
|
+
mod manifest;
|
|
13
|
+
|
|
14
|
+
use manifest::Manifest;
|
|
15
|
+
use payload::Payload;
|
|
16
|
+
|
|
17
|
+
#[derive(Debug, Deserialize)]
|
|
18
|
+
struct PhraseInput {
|
|
19
|
+
tokens: Vec<String>,
|
|
20
|
+
phrase_id: u32,
|
|
21
|
+
salience: f32,
|
|
22
|
+
#[serde(alias = "domain_count")]
|
|
23
|
+
count: u32,
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
/// A validated phrase after its token strings have been replaced by vocabulary IDs.
struct ProcessedPhrase {
    /// Vocabulary ID of each token, in phrase order.
    token_ids: Vec<u32>,
    /// Identifier carried over from the input record.
    phrase_id: u32,
    /// Salience score carried over from the input record.
    salience: f32,
    /// Occurrence count carried over from the input record.
    count: u32,
    /// Number of tokens in the phrase.
    length: u8,
}
|
|
33
|
+
|
|
34
|
+
#[derive(Debug, Deserialize)]
|
|
35
|
+
struct BuildConfig {
|
|
36
|
+
version: String,
|
|
37
|
+
tokenizer: String,
|
|
38
|
+
separator_id: u32,
|
|
39
|
+
#[serde(default)]
|
|
40
|
+
min_count: Option<u32>,
|
|
41
|
+
#[serde(default)]
|
|
42
|
+
salience_threshold: Option<f32>,
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/// Counters collected while the input phrases are read and validated.
#[derive(Debug)]
struct BuildStats {
    /// Input lines read, including those later rejected.
    total_input: usize,
    /// Phrases dropped for falling below `min_count`.
    filtered_low_count: usize,
    /// Phrases dropped for falling below `salience_threshold`.
    filtered_low_salience: usize,
    /// Phrases dropped because their `phrase_id` had already been seen.
    duplicate_phrase_ids: usize,
    /// Problems found in token sequences (empty sequence or empty token).
    invalid_tokens: usize,
    /// Phrases accepted for the build.
    built: usize,
}
|
|
54
|
+
|
|
55
|
+
#[derive(Debug, Serialize)]
|
|
56
|
+
struct Vocabulary {
|
|
57
|
+
tokens: HashMap<String, u32>,
|
|
58
|
+
special_tokens: HashMap<String, u32>,
|
|
59
|
+
vocab_size: usize,
|
|
60
|
+
separator_id: u32,
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
64
|
+
let args: Vec<String> = std::env::args().collect();
|
|
65
|
+
|
|
66
|
+
if args.len() < 4 {
|
|
67
|
+
eprintln!("Usage: phrasekit_build <input.jsonl> <config.json> <output_dir>");
|
|
68
|
+
eprintln!("\nExample:");
|
|
69
|
+
eprintln!(" phrasekit_build phrases.jsonl config.json ./artifacts/");
|
|
70
|
+
std::process::exit(1);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
let input_path = &args[1];
|
|
74
|
+
let config_path = &args[2];
|
|
75
|
+
let output_dir = PathBuf::from(&args[3]);
|
|
76
|
+
|
|
77
|
+
println!("đŚ PhraseKit Artifact Builder");
|
|
78
|
+
println!("ââââââââââââââââââââââââââââââââââââââââ");
|
|
79
|
+
println!("Input: {}", input_path);
|
|
80
|
+
println!("Config: {}", config_path);
|
|
81
|
+
println!("Output: {}", output_dir.display());
|
|
82
|
+
println!();
|
|
83
|
+
|
|
84
|
+
// Load config
|
|
85
|
+
let config = load_config(config_path)?;
|
|
86
|
+
println!("â Loaded config: {} (tokenizer: {})", config.version, config.tokenizer);
|
|
87
|
+
|
|
88
|
+
// Create output directory
|
|
89
|
+
std::fs::create_dir_all(&output_dir)?;
|
|
90
|
+
|
|
91
|
+
// Load and validate phrases
|
|
92
|
+
let (text_phrases, stats, unique_tokens) = load_and_validate_phrases(input_path, &config)?;
|
|
93
|
+
|
|
94
|
+
println!("\nđ Build Statistics:");
|
|
95
|
+
println!(" Total input phrases: {}", stats.total_input);
|
|
96
|
+
if stats.filtered_low_count > 0 {
|
|
97
|
+
println!(" Filtered (low count): {}", stats.filtered_low_count);
|
|
98
|
+
}
|
|
99
|
+
if stats.filtered_low_salience > 0 {
|
|
100
|
+
println!(" Filtered (low salience): {}", stats.filtered_low_salience);
|
|
101
|
+
}
|
|
102
|
+
if stats.duplicate_phrase_ids > 0 {
|
|
103
|
+
println!(" Skipped (duplicate IDs): {}", stats.duplicate_phrase_ids);
|
|
104
|
+
}
|
|
105
|
+
if stats.invalid_tokens > 0 {
|
|
106
|
+
println!(" Skipped (invalid tokens): {}", stats.invalid_tokens);
|
|
107
|
+
}
|
|
108
|
+
println!(" Built patterns: {}", stats.built);
|
|
109
|
+
|
|
110
|
+
if text_phrases.is_empty() {
|
|
111
|
+
return Err("No valid phrases to build".into());
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// Build vocabulary and assign token IDs
|
|
115
|
+
println!("\nđ Building vocabulary...");
|
|
116
|
+
let vocabulary = build_vocabulary(unique_tokens, config.separator_id);
|
|
117
|
+
println!(" â Built vocabulary ({} tokens)", vocabulary.vocab_size);
|
|
118
|
+
|
|
119
|
+
// Convert text tokens to IDs
|
|
120
|
+
let mut phrases: Vec<ProcessedPhrase> = Vec::new();
|
|
121
|
+
for phrase in text_phrases {
|
|
122
|
+
let token_ids: Vec<u32> = phrase.tokens.iter()
|
|
123
|
+
.map(|t| *vocabulary.tokens.get(&t.to_lowercase()).unwrap_or(&0))
|
|
124
|
+
.collect();
|
|
125
|
+
|
|
126
|
+
phrases.push(ProcessedPhrase {
|
|
127
|
+
token_ids,
|
|
128
|
+
phrase_id: phrase.phrase_id,
|
|
129
|
+
salience: phrase.salience,
|
|
130
|
+
count: phrase.count,
|
|
131
|
+
length: phrase.tokens.len() as u8,
|
|
132
|
+
});
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
// Build automaton
|
|
136
|
+
println!("\nđ¨ Building automaton...");
|
|
137
|
+
let patterns: Vec<Vec<u8>> = phrases.iter()
|
|
138
|
+
.map(|p| encode_tokens(&p.token_ids, config.separator_id))
|
|
139
|
+
.collect();
|
|
140
|
+
|
|
141
|
+
let automaton: DoubleArrayAhoCorasick<u32> = DoubleArrayAhoCorasick::new(patterns)
|
|
142
|
+
.map_err(|e| format!("Failed to build automaton: {:?}", e))?;
|
|
143
|
+
|
|
144
|
+
let automaton_bytes = automaton.serialize();
|
|
145
|
+
let automaton_path = output_dir.join("phrases.daac");
|
|
146
|
+
std::fs::write(&automaton_path, &automaton_bytes)?;
|
|
147
|
+
println!(" â Wrote automaton ({} bytes) to {}", automaton_bytes.len(), automaton_path.display());
|
|
148
|
+
|
|
149
|
+
// Write payloads
|
|
150
|
+
println!("\nđž Writing payloads...");
|
|
151
|
+
let payloads: Vec<Payload> = phrases.iter()
|
|
152
|
+
.map(|p| Payload::new(p.phrase_id, p.salience, p.count, p.length))
|
|
153
|
+
.collect();
|
|
154
|
+
|
|
155
|
+
let payloads_path = output_dir.join("payloads.bin");
|
|
156
|
+
let mut payloads_file = File::create(&payloads_path)?;
|
|
157
|
+
for payload in &payloads {
|
|
158
|
+
payload.write_to(&mut payloads_file)?;
|
|
159
|
+
}
|
|
160
|
+
let payloads_size = payloads.len() * 17;
|
|
161
|
+
println!(" â Wrote {} payloads ({} bytes) to {}", payloads.len(), payloads_size, payloads_path.display());
|
|
162
|
+
|
|
163
|
+
// Generate manifest with checksums
|
|
164
|
+
println!("\nđ Generating manifest...");
|
|
165
|
+
let manifest = Manifest {
|
|
166
|
+
version: config.version.clone(),
|
|
167
|
+
tokenizer: config.tokenizer.clone(),
|
|
168
|
+
num_patterns: phrases.len(),
|
|
169
|
+
min_count: config.min_count,
|
|
170
|
+
salience_threshold: config.salience_threshold,
|
|
171
|
+
built_at: chrono::Utc::now().to_rfc3339(),
|
|
172
|
+
separator_id: config.separator_id,
|
|
173
|
+
};
|
|
174
|
+
|
|
175
|
+
let manifest_path = output_dir.join("manifest.json");
|
|
176
|
+
let manifest_json = serde_json::to_string_pretty(&manifest)?;
|
|
177
|
+
std::fs::write(&manifest_path, manifest_json)?;
|
|
178
|
+
println!(" â Wrote manifest to {}", manifest_path.display());
|
|
179
|
+
|
|
180
|
+
// Write vocabulary
|
|
181
|
+
println!("\nđž Writing vocabulary...");
|
|
182
|
+
let vocab_path = output_dir.join("vocab.json");
|
|
183
|
+
let vocab_json = serde_json::to_string_pretty(&vocabulary)?;
|
|
184
|
+
std::fs::write(&vocab_path, vocab_json)?;
|
|
185
|
+
println!(" â Wrote vocabulary ({} tokens) to {}", vocabulary.vocab_size, vocab_path.display());
|
|
186
|
+
|
|
187
|
+
// Summary
|
|
188
|
+
println!("\nâ
Build complete!");
|
|
189
|
+
println!("\nArtifacts:");
|
|
190
|
+
println!(" {} ({} bytes)", automaton_path.display(), automaton_bytes.len());
|
|
191
|
+
println!(" {} ({} bytes)", payloads_path.display(), payloads_size);
|
|
192
|
+
println!(" {}", manifest_path.display());
|
|
193
|
+
println!(" {}", vocab_path.display());
|
|
194
|
+
|
|
195
|
+
println!("\nđ To use in PhraseKit:");
|
|
196
|
+
println!(" PhraseKit.load!(");
|
|
197
|
+
println!(" automaton_path: {:?},", automaton_path.to_str().unwrap());
|
|
198
|
+
println!(" payloads_path: {:?},", payloads_path.to_str().unwrap());
|
|
199
|
+
println!(" manifest_path: {:?},", manifest_path.to_str().unwrap());
|
|
200
|
+
println!(" vocab_path: {:?}", vocab_path.to_str().unwrap());
|
|
201
|
+
println!(" )");
|
|
202
|
+
|
|
203
|
+
Ok(())
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
fn load_config(path: &str) -> Result<BuildConfig, Box<dyn std::error::Error>> {
|
|
207
|
+
let file = File::open(path)?;
|
|
208
|
+
let config: BuildConfig = serde_json::from_reader(file)?;
|
|
209
|
+
Ok(config)
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
fn load_and_validate_phrases(
|
|
213
|
+
path: &str,
|
|
214
|
+
config: &BuildConfig,
|
|
215
|
+
) -> Result<(Vec<PhraseInput>, BuildStats, HashSet<String>), Box<dyn std::error::Error>> {
|
|
216
|
+
let file = File::open(path)?;
|
|
217
|
+
let reader = BufReader::new(file);
|
|
218
|
+
|
|
219
|
+
let mut phrases = Vec::new();
|
|
220
|
+
let mut seen_ids = HashSet::new();
|
|
221
|
+
let mut unique_tokens: HashSet<String> = HashSet::new();
|
|
222
|
+
let mut stats = BuildStats {
|
|
223
|
+
total_input: 0,
|
|
224
|
+
filtered_low_count: 0,
|
|
225
|
+
filtered_low_salience: 0,
|
|
226
|
+
duplicate_phrase_ids: 0,
|
|
227
|
+
invalid_tokens: 0,
|
|
228
|
+
built: 0,
|
|
229
|
+
};
|
|
230
|
+
|
|
231
|
+
println!("\nđ Loading phrases...");
|
|
232
|
+
|
|
233
|
+
for (line_num, line) in reader.lines().enumerate() {
|
|
234
|
+
let line = line?;
|
|
235
|
+
stats.total_input += 1;
|
|
236
|
+
|
|
237
|
+
let phrase: PhraseInput = match serde_json::from_str(&line) {
|
|
238
|
+
Ok(p) => p,
|
|
239
|
+
Err(e) => {
|
|
240
|
+
eprintln!("â ď¸ Line {}: Failed to parse: {}", line_num + 1, e);
|
|
241
|
+
continue;
|
|
242
|
+
}
|
|
243
|
+
};
|
|
244
|
+
|
|
245
|
+
// Validate
|
|
246
|
+
if let Some(min_count) = config.min_count {
|
|
247
|
+
if phrase.count < min_count {
|
|
248
|
+
stats.filtered_low_count += 1;
|
|
249
|
+
continue;
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
if let Some(threshold) = config.salience_threshold {
|
|
254
|
+
if phrase.salience < threshold {
|
|
255
|
+
stats.filtered_low_salience += 1;
|
|
256
|
+
continue;
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
if phrase.tokens.is_empty() {
|
|
261
|
+
eprintln!("â ď¸ Line {}: Empty token sequence", line_num + 1);
|
|
262
|
+
stats.invalid_tokens += 1;
|
|
263
|
+
continue;
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
for token in &phrase.tokens {
|
|
267
|
+
if token.is_empty() {
|
|
268
|
+
eprintln!("â ď¸ Line {}: Empty token", line_num + 1);
|
|
269
|
+
stats.invalid_tokens += 1;
|
|
270
|
+
continue;
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
if !seen_ids.insert(phrase.phrase_id) {
|
|
275
|
+
eprintln!("â ď¸ Line {}: Duplicate phrase_id {}", line_num + 1, phrase.phrase_id);
|
|
276
|
+
stats.duplicate_phrase_ids += 1;
|
|
277
|
+
continue;
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
for token in &phrase.tokens {
|
|
281
|
+
unique_tokens.insert(token.to_lowercase());
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
phrases.push(phrase);
|
|
285
|
+
stats.built += 1;
|
|
286
|
+
|
|
287
|
+
if stats.total_input % 10000 == 0 {
|
|
288
|
+
println!(" Processed {} lines...", stats.total_input);
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
println!(" â Loaded {} phrases", stats.total_input);
|
|
293
|
+
|
|
294
|
+
Ok((phrases, stats, unique_tokens))
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
/// Encodes a token-ID sequence as the byte pattern handed to the automaton builder.
///
/// Each token is written as 4 little-endian bytes and is immediately followed by the
/// 4 little-endian bytes of `separator`, so the result is `tokens.len() * 8` bytes long.
/// An empty `tokens` slice yields an empty vector.
fn encode_tokens(tokens: &[u32], separator: u32) -> Vec<u8> {
    const BYTES_PER_TOKEN: usize = 2 * std::mem::size_of::<u32>();

    // The separator is the same for every token, so convert it once.
    let separator_bytes = separator.to_le_bytes();

    // The output length is known up front; reserve it to avoid regrowing the buffer.
    let mut bytes = Vec::with_capacity(tokens.len() * BYTES_PER_TOKEN);
    for token in tokens {
        bytes.extend_from_slice(&token.to_le_bytes());
        bytes.extend_from_slice(&separator_bytes);
    }
    bytes
}
|
|
305
|
+
|
|
306
|
+
fn build_vocabulary(unique_tokens: HashSet<String>, separator_id: u32) -> Vocabulary {
|
|
307
|
+
let mut tokens = HashMap::new();
|
|
308
|
+
let mut sorted_tokens: Vec<String> = unique_tokens.into_iter().collect();
|
|
309
|
+
sorted_tokens.sort();
|
|
310
|
+
|
|
311
|
+
for (idx, token) in sorted_tokens.iter().enumerate() {
|
|
312
|
+
tokens.insert(token.clone(), (idx + 1) as u32);
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
let mut special_tokens = HashMap::new();
|
|
316
|
+
special_tokens.insert("<UNK>".to_string(), 0);
|
|
317
|
+
|
|
318
|
+
let vocab_size = tokens.len() + special_tokens.len();
|
|
319
|
+
|
|
320
|
+
Vocabulary {
|
|
321
|
+
tokens,
|
|
322
|
+
special_tokens,
|
|
323
|
+
vocab_size,
|
|
324
|
+
separator_id,
|
|
325
|
+
}
|
|
326
|
+
}
|