phrasekit 0.2.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,227 @@
1
+ use crate::manifest::Manifest;
2
+ use crate::payload::{load_payloads, Payload};
3
+ use crate::policy::{resolve_overlaps, Match, MatchPolicy};
4
+ use daachorse::DoubleArrayAhoCorasick;
5
+ use std::fs::File;
6
+ use std::io::BufReader;
7
+ use std::path::Path;
8
+ use std::time::SystemTime;
9
+ use thiserror::Error;
10
+
11
+ #[derive(Error, Debug)]
12
+ pub enum MatcherError {
13
+ #[error("IO error: {0}")]
14
+ Io(#[from] std::io::Error),
15
+
16
+ #[error("Manifest error: {0}")]
17
+ Manifest(#[from] crate::manifest::ManifestError),
18
+
19
+ #[error("Automaton error: {0}")]
20
+ Automaton(String),
21
+
22
+ #[error("Matcher not loaded")]
23
+ #[allow(dead_code)]
24
+ NotLoaded,
25
+ }
26
+
27
+ pub struct Matcher {
28
+ automaton: DoubleArrayAhoCorasick<u32>,
29
+ payloads: Vec<Payload>,
30
+ manifest: Manifest,
31
+ loaded_at: SystemTime,
32
+ }
33
+
34
+ impl Matcher {
35
+ pub fn load<P: AsRef<Path>>(
36
+ automaton_path: P,
37
+ payloads_path: P,
38
+ manifest_path: P,
39
+ ) -> Result<Self, MatcherError> {
40
+ let manifest = Manifest::load(manifest_path)?;
41
+
42
+ let automaton_bytes = std::fs::read(automaton_path)?;
43
+ let (automaton, _): (DoubleArrayAhoCorasick<u32>, _) = unsafe {
44
+ DoubleArrayAhoCorasick::deserialize_unchecked(&automaton_bytes)
45
+ };
46
+
47
+ let payloads_file = File::open(payloads_path)?;
48
+ let payloads_reader = BufReader::new(payloads_file);
49
+ let payloads = load_payloads(payloads_reader)?;
50
+
51
+ if payloads.len() != manifest.num_patterns {
52
+ return Err(MatcherError::Automaton(format!(
53
+ "Payload count mismatch: manifest says {}, got {}",
54
+ manifest.num_patterns,
55
+ payloads.len()
56
+ )));
57
+ }
58
+
59
+ Ok(Self {
60
+ automaton,
61
+ payloads,
62
+ manifest,
63
+ loaded_at: SystemTime::now(),
64
+ })
65
+ }
66
+
67
+ pub fn match_tokens(
68
+ &self,
69
+ token_ids: &[u32],
70
+ policy: MatchPolicy,
71
+ max: usize,
72
+ ) -> Vec<Match> {
73
+ if token_ids.is_empty() {
74
+ return Vec::new();
75
+ }
76
+
77
+ let separator = self.manifest.separator_id;
78
+ let mut bytes = Vec::with_capacity(token_ids.len() * 5);
79
+ for &token_id in token_ids {
80
+ bytes.extend_from_slice(&token_id.to_le_bytes());
81
+ bytes.extend_from_slice(&separator.to_le_bytes());
82
+ }
83
+
84
+ let matches: Vec<Match> = self
85
+ .automaton
86
+ .find_overlapping_iter(&bytes)
87
+ .filter_map(|m| {
88
+ let pattern_id = m.value() as usize;
89
+ let start_token = m.start() / 8;
90
+ let end_token = (m.end() + 7) / 8;
91
+
92
+ self.payloads
93
+ .get(pattern_id)
94
+ .map(|payload| Match::new(start_token, end_token, pattern_id, payload.clone()))
95
+ })
96
+ .collect();
97
+
98
+ let mut resolved = resolve_overlaps(matches, policy);
99
+
100
+ if resolved.len() > max {
101
+ resolved.truncate(max);
102
+ }
103
+
104
+ resolved
105
+ }
106
+
107
+ #[allow(dead_code)]
108
+ pub fn manifest(&self) -> &Manifest {
109
+ &self.manifest
110
+ }
111
+
112
+ pub fn num_patterns(&self) -> usize {
113
+ self.payloads.len()
114
+ }
115
+
116
+ #[allow(dead_code)]
117
+ pub fn loaded_at(&self) -> SystemTime {
118
+ self.loaded_at
119
+ }
120
+
121
+ pub fn memory_usage_mb(&self) -> f64 {
122
+ let automaton_size = std::mem::size_of_val(&self.automaton);
123
+ let payloads_size = self.payloads.len() * std::mem::size_of::<Payload>();
124
+ ((automaton_size + payloads_size) as f64) / 1_048_576.0
125
+ }
126
+ }
127
+
128
/// A snapshot of matcher statistics.
///
/// `Stats::from_matcher` fills the first four fields from the matcher and
/// initialises the hit counter and latency percentiles to zero.
pub struct Stats {
    /// Manifest version string of the loaded artifacts.
    pub version: String,
    /// Time at which the matcher was loaded.
    pub loaded_at: SystemTime,
    /// Number of patterns (payloads) loaded.
    pub num_patterns: usize,
    /// Memory estimate in MiB, as reported by `Matcher::memory_usage_mb`.
    pub heap_mb: f64,
    /// Total hit counter.
    pub hits_total: u64,
    /// Latency percentiles; presumably microseconds, judging by the `_us`
    /// suffix (TODO confirm against whoever populates them).
    pub p50_us: u64,
    pub p95_us: u64,
    pub p99_us: u64,
}
138
+
139
+ impl Stats {
140
+ pub fn from_matcher(matcher: &Matcher) -> Self {
141
+ Self {
142
+ version: matcher.manifest.version.clone(),
143
+ loaded_at: matcher.loaded_at,
144
+ num_patterns: matcher.num_patterns(),
145
+ heap_mb: matcher.memory_usage_mb(),
146
+ hits_total: 0,
147
+ p50_us: 0,
148
+ p95_us: 0,
149
+ p99_us: 0,
150
+ }
151
+ }
152
+ }
153
+
154
#[cfg(test)]
mod tests {
    use super::*;
    use crate::payload::Payload;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Separator id used by the test manifest; must equal `separator_id` in
    /// the manifest JSON written by `create_test_artifacts`.
    const SEPARATOR_ID: u32 = 4_294_967_294;

    /// Encodes a token pattern the same way `Matcher::match_tokens` encodes
    /// its haystack: each token id followed by the separator, little-endian.
    /// The automaton matches bytes, so patterns must be byte strings.
    fn encode_pattern(tokens: &[u32]) -> Vec<u8> {
        let mut bytes = Vec::with_capacity(tokens.len() * 8);
        for token in tokens {
            bytes.extend_from_slice(&token.to_le_bytes());
            bytes.extend_from_slice(&SEPARATOR_ID.to_le_bytes());
        }
        bytes
    }

    /// Writes an automaton for the patterns `[1, 2]` and `[2, 3]`, two
    /// payload records and a matching manifest to temporary files.
    fn create_test_artifacts() -> (NamedTempFile, NamedTempFile, NamedTempFile) {
        let patterns = vec![encode_pattern(&[1, 2]), encode_pattern(&[2, 3])];

        // The value type must be spelled out: it is what `Matcher::load`
        // deserializes, and nothing else in this function constrains it.
        let automaton: DoubleArrayAhoCorasick<u32> =
            DoubleArrayAhoCorasick::new(patterns).unwrap();
        let automaton_bytes = automaton.serialize();

        let mut automaton_file = NamedTempFile::new().unwrap();
        automaton_file.write_all(&automaton_bytes).unwrap();
        automaton_file.flush().unwrap();

        let mut payloads_file = NamedTempFile::new().unwrap();
        let payload1 = Payload::new(100, 1.5, 50, 2);
        let payload2 = Payload::new(200, 2.0, 100, 2);
        payload1.write_to(&mut payloads_file).unwrap();
        payload2.write_to(&mut payloads_file).unwrap();
        payloads_file.flush().unwrap();

        let mut manifest_file = NamedTempFile::new().unwrap();
        let manifest_json = r#"{
            "version": "test-v1",
            "tokenizer": "test-tokenizer",
            "num_patterns": 2,
            "built_at": "2025-01-01T00:00:00Z",
            "separator_id": 4294967294
        }"#;
        manifest_file.write_all(manifest_json.as_bytes()).unwrap();
        manifest_file.flush().unwrap();

        (automaton_file, payloads_file, manifest_file)
    }

    #[test]
    fn test_matcher_load() {
        let (automaton_file, payloads_file, manifest_file) = create_test_artifacts();

        let matcher = Matcher::load(
            automaton_file.path(),
            payloads_file.path(),
            manifest_file.path(),
        )
        .unwrap();

        assert_eq!(matcher.num_patterns(), 2);
        assert_eq!(matcher.manifest().version, "test-v1");
    }

    #[test]
    fn test_matcher_match_tokens() {
        let (automaton_file, payloads_file, manifest_file) = create_test_artifacts();

        let matcher = Matcher::load(
            automaton_file.path(),
            payloads_file.path(),
            manifest_file.path(),
        )
        .unwrap();

        // `[1, 2]` and `[2, 3]` overlap on token 2, so leftmost-longest keeps
        // only the leftmost of the two.
        let token_ids = vec![1, 2, 3, 4];
        let matches = matcher.match_tokens(&token_ids, MatchPolicy::LeftmostLongest, 10);

        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].start, 0);
        assert_eq!(matches[0].end, 2);

        // Disjoint occurrences are both reported, in order.
        let token_ids = vec![1, 2, 9, 2, 3];
        let matches = matcher.match_tokens(&token_ids, MatchPolicy::LeftmostLongest, 10);

        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].start, 0);
        assert_eq!(matches[0].end, 2);
        assert_eq!(matches[1].start, 3);
        assert_eq!(matches[1].end, 5);
    }
}
@@ -0,0 +1,95 @@
1
+ use serde::{Deserialize, Serialize};
2
+ use std::io::{Read, Write};
3
+
4
+ #[derive(Debug, Clone, Serialize, Deserialize)]
5
+ pub struct Payload {
6
+ pub phrase_id: u32,
7
+ pub salience: f32,
8
+ pub count: u32,
9
+ pub n: u8,
10
+ }
11
+
12
+ impl Payload {
13
+ #[allow(dead_code)]
14
+ pub fn new(phrase_id: u32, salience: f32, count: u32, n: u8) -> Self {
15
+ Self {
16
+ phrase_id,
17
+ salience,
18
+ count,
19
+ n,
20
+ }
21
+ }
22
+
23
+ pub fn salience_score(&self) -> f32 {
24
+ self.salience * ((self.count + 1) as f32).ln()
25
+ }
26
+
27
+ pub fn read_from<R: Read>(reader: &mut R) -> std::io::Result<Self> {
28
+ let mut buf = [0u8; 17];
29
+ reader.read_exact(&mut buf)?;
30
+
31
+ let phrase_id = u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]]);
32
+ let salience = f32::from_le_bytes([buf[4], buf[5], buf[6], buf[7]]);
33
+ let count = u32::from_le_bytes([buf[8], buf[9], buf[10], buf[11]]);
34
+ let n = buf[16];
35
+
36
+ Ok(Self {
37
+ phrase_id,
38
+ salience,
39
+ count,
40
+ n,
41
+ })
42
+ }
43
+
44
+ #[allow(dead_code)]
45
+ pub fn write_to<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
46
+ writer.write_all(&self.phrase_id.to_le_bytes())?;
47
+ writer.write_all(&self.salience.to_le_bytes())?;
48
+ writer.write_all(&self.count.to_le_bytes())?;
49
+ writer.write_all(&[0u8; 4])?;
50
+ writer.write_all(&[self.n])?;
51
+ Ok(())
52
+ }
53
+ }
54
+
55
+ pub fn load_payloads<R: Read>(mut reader: R) -> std::io::Result<Vec<Payload>> {
56
+ let mut payloads = Vec::new();
57
+
58
+ loop {
59
+ match Payload::read_from(&mut reader) {
60
+ Ok(payload) => payloads.push(payload),
61
+ Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break,
62
+ Err(e) => return Err(e),
63
+ }
64
+ }
65
+
66
+ Ok(payloads)
67
+ }
68
+
69
#[cfg(test)]
mod tests {
    use super::*;

    /// A payload written with `write_to` is read back unchanged by `read_from`.
    #[test]
    fn test_payload_roundtrip() {
        let original = Payload::new(12345, 2.13, 314, 2);

        let mut encoded = Vec::new();
        original.write_to(&mut encoded).unwrap();

        let mut source = std::io::Cursor::new(encoded);
        let decoded = Payload::read_from(&mut source).unwrap();

        assert_eq!(decoded.phrase_id, 12345);
        assert_eq!(decoded.count, 314);
        assert_eq!(decoded.n, 2);
        assert!((decoded.salience - 2.13).abs() < 0.001);
    }

    /// `salience_score` is `salience * ln(count + 1)`.
    #[test]
    fn test_salience_score() {
        let expected = 2.0 * 100.0_f32.ln();
        let actual = Payload::new(1, 2.0, 99, 2).salience_score();
        assert!((actual - expected).abs() < 0.001);
    }
}
@@ -0,0 +1,190 @@
1
+ use crate::payload::Payload;
2
+
3
/// Strategy used to choose among overlapping matches.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MatchPolicy {
    /// Among matches sharing the leftmost start, keep the longest.
    LeftmostLongest,
    /// Keep the first match encountered at the leftmost start.
    LeftmostFirst,
    /// Keep the match with the highest salience score in each overlap cluster.
    SalienceMax,
}

impl MatchPolicy {
    /// Parses a policy from its snake_case name.
    ///
    /// Recognised names are `"leftmost_longest"`, `"leftmost_first"` and
    /// `"salience_max"`; the comparison is exact and case-sensitive. Any other
    /// input yields `None`.
    pub fn from_str(s: &str) -> Option<Self> {
        const NAMES: [(&str, MatchPolicy); 3] = [
            ("leftmost_longest", MatchPolicy::LeftmostLongest),
            ("leftmost_first", MatchPolicy::LeftmostFirst),
            ("salience_max", MatchPolicy::SalienceMax),
        ];

        NAMES
            .iter()
            .find(|(name, _)| *name == s)
            .map(|&(_, policy)| policy)
    }
}
20
+
21
+ #[derive(Debug, Clone)]
22
+ pub struct Match {
23
+ pub start: usize,
24
+ pub end: usize,
25
+ #[allow(dead_code)]
26
+ pub pattern_id: usize,
27
+ pub payload: Payload,
28
+ }
29
+
30
+ impl Match {
31
+ pub fn new(start: usize, end: usize, pattern_id: usize, payload: Payload) -> Self {
32
+ Self {
33
+ start,
34
+ end,
35
+ pattern_id,
36
+ payload,
37
+ }
38
+ }
39
+
40
+ pub fn len(&self) -> usize {
41
+ self.end - self.start
42
+ }
43
+
44
+ pub fn overlaps(&self, other: &Match) -> bool {
45
+ !(self.end <= other.start || other.end <= self.start)
46
+ }
47
+ }
48
+
49
+ pub fn resolve_overlaps(mut matches: Vec<Match>, policy: MatchPolicy) -> Vec<Match> {
50
+ if matches.is_empty() {
51
+ return matches;
52
+ }
53
+
54
+ matches.sort_by_key(|m| m.start);
55
+
56
+ match policy {
57
+ MatchPolicy::LeftmostLongest => resolve_leftmost_longest(matches),
58
+ MatchPolicy::LeftmostFirst => resolve_leftmost_first(matches),
59
+ MatchPolicy::SalienceMax => resolve_salience_max(matches),
60
+ }
61
+ }
62
+
63
+ fn resolve_leftmost_longest(matches: Vec<Match>) -> Vec<Match> {
64
+ let mut result = Vec::new();
65
+ let mut current_end = 0;
66
+
67
+ for group_start in 0..matches.len() {
68
+ if matches[group_start].start < current_end {
69
+ continue;
70
+ }
71
+
72
+ let group_end = matches[group_start..]
73
+ .iter()
74
+ .position(|m| m.start != matches[group_start].start)
75
+ .map(|i| group_start + i)
76
+ .unwrap_or(matches.len());
77
+
78
+ let longest = matches[group_start..group_end]
79
+ .iter()
80
+ .max_by_key(|m| m.len())
81
+ .unwrap()
82
+ .clone();
83
+
84
+ current_end = longest.end;
85
+ result.push(longest);
86
+ }
87
+
88
+ result
89
+ }
90
+
91
+ fn resolve_leftmost_first(matches: Vec<Match>) -> Vec<Match> {
92
+ let mut result = Vec::new();
93
+ let mut current_end = 0;
94
+
95
+ for m in matches {
96
+ if m.start >= current_end {
97
+ current_end = m.end;
98
+ result.push(m);
99
+ }
100
+ }
101
+
102
+ result
103
+ }
104
+
105
+ fn resolve_salience_max(matches: Vec<Match>) -> Vec<Match> {
106
+ let mut result = Vec::new();
107
+ let mut i = 0;
108
+
109
+ while i < matches.len() {
110
+ let mut j = i + 1;
111
+ while j < matches.len() && matches[j].overlaps(&matches[i]) {
112
+ j += 1;
113
+ }
114
+
115
+ let best = matches[i..j]
116
+ .iter()
117
+ .max_by(|a, b| {
118
+ a.payload
119
+ .salience_score()
120
+ .partial_cmp(&b.payload.salience_score())
121
+ .unwrap_or(std::cmp::Ordering::Equal)
122
+ })
123
+ .unwrap()
124
+ .clone();
125
+
126
+ result.push(best.clone());
127
+ i = matches[i..]
128
+ .iter()
129
+ .position(|m| m.start >= best.end)
130
+ .map(|idx| i + idx)
131
+ .unwrap_or(matches.len());
132
+ }
133
+
134
+ result
135
+ }
136
+
137
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a match over `start..end` with pattern id 0 and a payload
    /// carrying the given salience and count.
    fn make_match(start: usize, end: usize, salience: f32, count: u32) -> Match {
        let width = (end - start) as u8;
        let payload = Payload::new(0, salience, count, width);
        Match::new(start, end, 0, payload)
    }

    /// Resolves `(start, end, salience, count)` tuples under `policy`.
    fn resolve(spans: &[(usize, usize, f32, u32)], policy: MatchPolicy) -> Vec<Match> {
        let matches = spans
            .iter()
            .map(|&(start, end, salience, count)| make_match(start, end, salience, count))
            .collect();
        resolve_overlaps(matches, policy)
    }

    #[test]
    fn test_leftmost_longest() {
        let resolved = resolve(
            &[(0, 2, 1.0, 100), (0, 3, 1.0, 100), (5, 7, 1.0, 100)],
            MatchPolicy::LeftmostLongest,
        );

        assert_eq!(resolved.len(), 2);
        assert_eq!(resolved[0].len(), 3);
        assert_eq!(resolved[1].start, 5);
    }

    #[test]
    fn test_leftmost_first() {
        let resolved = resolve(
            &[(0, 2, 1.0, 100), (1, 3, 1.0, 100), (3, 5, 1.0, 100)],
            MatchPolicy::LeftmostFirst,
        );

        assert_eq!(resolved.len(), 2);
        assert_eq!(resolved[0].end, 2);
        assert_eq!(resolved[1].start, 3);
    }

    #[test]
    fn test_salience_max() {
        let resolved = resolve(
            &[(0, 2, 1.0, 100), (0, 3, 2.0, 200), (5, 7, 1.0, 100)],
            MatchPolicy::SalienceMax,
        );

        assert_eq!(resolved.len(), 2);
        assert_eq!(resolved[0].len(), 3);
    }
}
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,74 @@
1
+ require "shellwords"
2
+
3
module PhraseKit
  # Thin wrapper around the external `phrasekit_mine` binary.
  class Miner
    # Raised when the binary cannot be found or exits unsuccessfully.
    class Error < StandardError; end

    class << self
      # Runs `phrasekit_mine` on +input_path+, writing to +output_path+.
      #
      # When +config_path+ is nil, a temporary JSON config holding +min_n+,
      # +max_n+ and +min_count+ is written and passed to the binary; it is
      # removed again before this method returns or raises.
      #
      # Returns a Hash of the integer statistics parsed from the binary's
      # combined stdout/stderr (see +parse_stats+).
      #
      # Raises Error if the binary is missing or exits with a failure status.
      def mine(input_path:, output_path:, min_n: 2, max_n: 5, min_count: 10, config_path: nil)
        config_file = nil
        binary_path = find_binary

        # Create temporary config if not provided
        if config_path.nil?
          require "tempfile"
          require "json"

          config_file = Tempfile.new(["mine_config", ".json"])
          config_file.write(JSON.generate({
            min_n: min_n,
            max_n: max_n,
            min_count: min_count
          }))
          config_file.flush
          config_path = config_file.path
        end

        # Run mining
        cmd = [binary_path, input_path.to_s, config_path.to_s, output_path.to_s]
        output = `#{cmd.shelljoin} 2>&1`

        raise Error, "Mining failed: #{output}" unless $?.success?

        # Parse stats from output
        parse_stats(output)
      ensure
        # Runs on every exit path, so the temp file is also removed when
        # writing the config or spawning the command raises.
        config_file.close! if config_file
      end

      private

      # Returns the path of the first existing, executable `phrasekit_mine`
      # candidate, or raises Error with build instructions.
      def find_binary
        # Search paths relative to this file
        # __dir__ is lib/phrasekit, so go up 2 levels to get to gem root
        base_dir = File.expand_path("../..", __dir__)

        candidates = [
          File.join(base_dir, "ext/phrasekit/target/release/phrasekit_mine"),
          File.join(base_dir, "ext/phrasekit/target/debug/phrasekit_mine"),
          # For installed gems
          File.join(base_dir, "bin/phrasekit_mine")
        ]

        candidates.each do |binary|
          return binary if File.exist?(binary) && File.executable?(binary)
        end

        raise Error, "phrasekit_mine binary not found. Run: cargo build --release --bin phrasekit_mine --manifest-path ext/phrasekit/Cargo.toml"
      end

      # Extracts integer statistics from the binary's output. Keys are only
      # present when the corresponding line appears; if a line appears more
      # than once, the last occurrence wins.
      def parse_stats(output)
        stats = {}

        output.scan(/Total documents:\s+(\d+)/) { stats[:total_docs] = $1.to_i }
        output.scan(/Total tokens:\s+(\d+)/) { stats[:total_tokens] = $1.to_i }
        output.scan(/Unique n-grams:\s+(\d+)/) { stats[:unique_ngrams] = $1.to_i }
        output.scan(/After min_count=\d+:\s+(\d+)/) { stats[:ngrams_after_filter] = $1.to_i }

        stats
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
+ require "shellwords"
2
+
3
module PhraseKit
  # Thin wrapper around the external `phrasekit_score` binary.
  class Scorer
    # Raised when the binary cannot be found or exits unsuccessfully.
    class Error < StandardError; end

    class << self
      # Runs `phrasekit_score` on +domain_path+ and +background_path+,
      # writing to +output_path+.
      #
      # When +config_path+ is nil, a temporary JSON config holding the scoring
      # options (+method+, +min_salience+, +min_domain_count+,
      # +assign_phrase_ids+, +starting_phrase_id+) is written and passed to the
      # binary; it is removed again before this method returns or raises.
      #
      # Returns a Hash of the integer statistics parsed from the binary's
      # combined stdout/stderr (see +parse_stats+).
      #
      # Raises Error if the binary is missing or exits with a failure status.
      def score(
        domain_path:,
        background_path:,
        output_path:,
        method: :ratio,
        min_salience: 2.0,
        min_domain_count: 10,
        assign_phrase_ids: true,
        starting_phrase_id: 1000,
        config_path: nil
      )
        config_file = nil
        binary_path = find_binary

        # Create temporary config if not provided
        if config_path.nil?
          require "tempfile"
          require "json"

          config_file = Tempfile.new(["score_config", ".json"])
          config_file.write(JSON.generate({
            method: method.to_s,
            min_salience: min_salience,
            min_domain_count: min_domain_count,
            assign_phrase_ids: assign_phrase_ids,
            starting_phrase_id: starting_phrase_id
          }))
          config_file.flush
          config_path = config_file.path
        end

        # Run scoring
        cmd = [
          binary_path,
          domain_path.to_s,
          background_path.to_s,
          config_path.to_s,
          output_path.to_s
        ]
        output = `#{cmd.shelljoin} 2>&1`

        raise Error, "Scoring failed: #{output}" unless $?.success?

        # Parse stats from output
        parse_stats(output)
      ensure
        # Runs on every exit path, so the temp file is also removed when
        # writing the config or spawning the command raises.
        config_file.close! if config_file
      end

      private

      # Returns the path of the first existing, executable `phrasekit_score`
      # candidate, or raises Error with build instructions.
      def find_binary
        # Search paths relative to this file
        # __dir__ is lib/phrasekit, so go up 2 levels to get to gem root
        base_dir = File.expand_path("../..", __dir__)

        candidates = [
          File.join(base_dir, "ext/phrasekit/target/release/phrasekit_score"),
          File.join(base_dir, "ext/phrasekit/target/debug/phrasekit_score"),
          # For installed gems
          File.join(base_dir, "bin/phrasekit_score")
        ]

        candidates.each do |binary|
          return binary if File.exist?(binary) && File.executable?(binary)
        end

        raise Error, "phrasekit_score binary not found. Run: cargo build --release --bin phrasekit_score --manifest-path ext/phrasekit/Cargo.toml"
      end

      # Extracts integer statistics from the binary's output. Keys are only
      # present when the corresponding line appears; if a line appears more
      # than once, the last occurrence wins.
      def parse_stats(output)
        stats = {}

        output.scan(/Domain phrases:\s+(\d+)/) { stats[:domain_phrases] = $1.to_i }
        output.scan(/Background phrases:\s+(\d+)/) { stats[:background_phrases] = $1.to_i }
        output.scan(/After domain filter:\s+(\d+)/) { stats[:after_domain_filter] = $1.to_i }
        output.scan(/After salience filter:\s+(\d+)/) { stats[:after_salience_filter] = $1.to_i }

        stats
      end
    end
  end
end