phrasekit 0.2.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +131 -0
- data/ext/phrasekit/Cargo.toml +45 -0
- data/ext/phrasekit/extconf.rb +4 -0
- data/ext/phrasekit/src/bin/fixture_builder.rs +131 -0
- data/ext/phrasekit/src/bin/phrasekit_build.rs +326 -0
- data/ext/phrasekit/src/bin/phrasekit_mine.rs +199 -0
- data/ext/phrasekit/src/bin/phrasekit_score.rs +298 -0
- data/ext/phrasekit/src/bin/phrasekit_tag.rs +320 -0
- data/ext/phrasekit/src/lib.rs +104 -0
- data/ext/phrasekit/src/manifest.rs +88 -0
- data/ext/phrasekit/src/matcher.rs +227 -0
- data/ext/phrasekit/src/payload.rs +95 -0
- data/ext/phrasekit/src/policy.rs +190 -0
- data/lib/phrasekit/3.1/phrasekit.so +0 -0
- data/lib/phrasekit/3.2/phrasekit.so +0 -0
- data/lib/phrasekit/3.3/phrasekit.so +0 -0
- data/lib/phrasekit/3.4/phrasekit.so +0 -0
- data/lib/phrasekit/miner.rb +74 -0
- data/lib/phrasekit/scorer.rb +92 -0
- data/lib/phrasekit/tagger.rb +100 -0
- data/lib/phrasekit/version.rb +3 -0
- data/lib/phrasekit.rb +100 -0
- data/lib/spellkit_stub.rb +80 -0
- metadata +156 -0
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
use crate::manifest::Manifest;
|
|
2
|
+
use crate::payload::{load_payloads, Payload};
|
|
3
|
+
use crate::policy::{resolve_overlaps, Match, MatchPolicy};
|
|
4
|
+
use daachorse::DoubleArrayAhoCorasick;
|
|
5
|
+
use std::fs::File;
|
|
6
|
+
use std::io::BufReader;
|
|
7
|
+
use std::path::Path;
|
|
8
|
+
use std::time::SystemTime;
|
|
9
|
+
use thiserror::Error;
|
|
10
|
+
|
|
11
|
+
#[derive(Error, Debug)]
|
|
12
|
+
pub enum MatcherError {
|
|
13
|
+
#[error("IO error: {0}")]
|
|
14
|
+
Io(#[from] std::io::Error),
|
|
15
|
+
|
|
16
|
+
#[error("Manifest error: {0}")]
|
|
17
|
+
Manifest(#[from] crate::manifest::ManifestError),
|
|
18
|
+
|
|
19
|
+
#[error("Automaton error: {0}")]
|
|
20
|
+
Automaton(String),
|
|
21
|
+
|
|
22
|
+
#[error("Matcher not loaded")]
|
|
23
|
+
#[allow(dead_code)]
|
|
24
|
+
NotLoaded,
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
pub struct Matcher {
|
|
28
|
+
automaton: DoubleArrayAhoCorasick<u32>,
|
|
29
|
+
payloads: Vec<Payload>,
|
|
30
|
+
manifest: Manifest,
|
|
31
|
+
loaded_at: SystemTime,
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
impl Matcher {
|
|
35
|
+
pub fn load<P: AsRef<Path>>(
|
|
36
|
+
automaton_path: P,
|
|
37
|
+
payloads_path: P,
|
|
38
|
+
manifest_path: P,
|
|
39
|
+
) -> Result<Self, MatcherError> {
|
|
40
|
+
let manifest = Manifest::load(manifest_path)?;
|
|
41
|
+
|
|
42
|
+
let automaton_bytes = std::fs::read(automaton_path)?;
|
|
43
|
+
let (automaton, _): (DoubleArrayAhoCorasick<u32>, _) = unsafe {
|
|
44
|
+
DoubleArrayAhoCorasick::deserialize_unchecked(&automaton_bytes)
|
|
45
|
+
};
|
|
46
|
+
|
|
47
|
+
let payloads_file = File::open(payloads_path)?;
|
|
48
|
+
let payloads_reader = BufReader::new(payloads_file);
|
|
49
|
+
let payloads = load_payloads(payloads_reader)?;
|
|
50
|
+
|
|
51
|
+
if payloads.len() != manifest.num_patterns {
|
|
52
|
+
return Err(MatcherError::Automaton(format!(
|
|
53
|
+
"Payload count mismatch: manifest says {}, got {}",
|
|
54
|
+
manifest.num_patterns,
|
|
55
|
+
payloads.len()
|
|
56
|
+
)));
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
Ok(Self {
|
|
60
|
+
automaton,
|
|
61
|
+
payloads,
|
|
62
|
+
manifest,
|
|
63
|
+
loaded_at: SystemTime::now(),
|
|
64
|
+
})
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
pub fn match_tokens(
|
|
68
|
+
&self,
|
|
69
|
+
token_ids: &[u32],
|
|
70
|
+
policy: MatchPolicy,
|
|
71
|
+
max: usize,
|
|
72
|
+
) -> Vec<Match> {
|
|
73
|
+
if token_ids.is_empty() {
|
|
74
|
+
return Vec::new();
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
let separator = self.manifest.separator_id;
|
|
78
|
+
let mut bytes = Vec::with_capacity(token_ids.len() * 5);
|
|
79
|
+
for &token_id in token_ids {
|
|
80
|
+
bytes.extend_from_slice(&token_id.to_le_bytes());
|
|
81
|
+
bytes.extend_from_slice(&separator.to_le_bytes());
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
let matches: Vec<Match> = self
|
|
85
|
+
.automaton
|
|
86
|
+
.find_overlapping_iter(&bytes)
|
|
87
|
+
.filter_map(|m| {
|
|
88
|
+
let pattern_id = m.value() as usize;
|
|
89
|
+
let start_token = m.start() / 8;
|
|
90
|
+
let end_token = (m.end() + 7) / 8;
|
|
91
|
+
|
|
92
|
+
self.payloads
|
|
93
|
+
.get(pattern_id)
|
|
94
|
+
.map(|payload| Match::new(start_token, end_token, pattern_id, payload.clone()))
|
|
95
|
+
})
|
|
96
|
+
.collect();
|
|
97
|
+
|
|
98
|
+
let mut resolved = resolve_overlaps(matches, policy);
|
|
99
|
+
|
|
100
|
+
if resolved.len() > max {
|
|
101
|
+
resolved.truncate(max);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
resolved
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
#[allow(dead_code)]
|
|
108
|
+
pub fn manifest(&self) -> &Manifest {
|
|
109
|
+
&self.manifest
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
pub fn num_patterns(&self) -> usize {
|
|
113
|
+
self.payloads.len()
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
#[allow(dead_code)]
|
|
117
|
+
pub fn loaded_at(&self) -> SystemTime {
|
|
118
|
+
self.loaded_at
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
pub fn memory_usage_mb(&self) -> f64 {
|
|
122
|
+
let automaton_size = std::mem::size_of_val(&self.automaton);
|
|
123
|
+
let payloads_size = self.payloads.len() * std::mem::size_of::<Payload>();
|
|
124
|
+
((automaton_size + payloads_size) as f64) / 1_048_576.0
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/// Snapshot of matcher metadata and runtime counters.
pub struct Stats {
    /// Version string taken from the manifest.
    pub version: String,
    /// When the matcher was loaded.
    pub loaded_at: SystemTime,
    /// Number of patterns the matcher holds.
    pub num_patterns: usize,
    /// Estimated memory footprint in MiB.
    pub heap_mb: f64,
    /// Total hit counter (`from_matcher` initialises it to 0).
    pub hits_total: u64,
    /// 50th-percentile latency in microseconds (`from_matcher` sets 0).
    pub p50_us: u64,
    /// 95th-percentile latency in microseconds (`from_matcher` sets 0).
    pub p95_us: u64,
    /// 99th-percentile latency in microseconds (`from_matcher` sets 0).
    pub p99_us: u64,
}
|
|
138
|
+
|
|
139
|
+
impl Stats {
|
|
140
|
+
pub fn from_matcher(matcher: &Matcher) -> Self {
|
|
141
|
+
Self {
|
|
142
|
+
version: matcher.manifest.version.clone(),
|
|
143
|
+
loaded_at: matcher.loaded_at,
|
|
144
|
+
num_patterns: matcher.num_patterns(),
|
|
145
|
+
heap_mb: matcher.memory_usage_mb(),
|
|
146
|
+
hits_total: 0,
|
|
147
|
+
p50_us: 0,
|
|
148
|
+
p95_us: 0,
|
|
149
|
+
p99_us: 0,
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::payload::Payload;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// The three fixture files, in the order `Matcher::load` takes them.
    type Artifacts = (NamedTempFile, NamedTempFile, NamedTempFile);

    /// Creates a temp file containing exactly `bytes`, flushed to disk.
    fn temp_file_with(bytes: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(bytes).unwrap();
        file.flush().unwrap();
        file
    }

    /// Writes a two-pattern automaton, two payload records and a matching
    /// manifest to temp files.
    fn create_test_artifacts() -> Artifacts {
        // NOTE(review): the patterns are token-id vectors, while
        // `match_tokens` searches a byte haystack of token/separator pairs —
        // confirm this fixture matches what the real builder emits.
        let patterns = vec![vec![1u32, 2u32], vec![2u32, 3u32]];
        let automaton = DoubleArrayAhoCorasick::new(patterns).unwrap();
        let automaton_file = temp_file_with(&automaton.serialize());

        let mut payloads_file = NamedTempFile::new().unwrap();
        for payload in [
            Payload::new(100, 1.5, 50, 2),
            Payload::new(200, 2.0, 100, 2),
        ] {
            payload.write_to(&mut payloads_file).unwrap();
        }
        payloads_file.flush().unwrap();

        let manifest_json = r#"{
            "version": "test-v1",
            "tokenizer": "test-tokenizer",
            "num_patterns": 2,
            "built_at": "2025-01-01T00:00:00Z",
            "separator_id": 4294967294
        }"#;
        let manifest_file = temp_file_with(manifest_json.as_bytes());

        (automaton_file, payloads_file, manifest_file)
    }

    /// Loads a matcher from fixture files, panicking on any load error.
    fn load_matcher(artifacts: &Artifacts) -> Matcher {
        let (automaton_file, payloads_file, manifest_file) = artifacts;
        Matcher::load(
            automaton_file.path(),
            payloads_file.path(),
            manifest_file.path(),
        )
        .unwrap()
    }

    #[test]
    fn test_matcher_load() {
        let artifacts = create_test_artifacts();
        let matcher = load_matcher(&artifacts);

        assert_eq!(matcher.num_patterns(), 2);
        assert_eq!(matcher.manifest().version, "test-v1");
    }

    #[test]
    fn test_matcher_match_tokens() {
        let artifacts = create_test_artifacts();
        let matcher = load_matcher(&artifacts);

        let token_ids = vec![1, 2, 3, 4];
        let matches = matcher.match_tokens(&token_ids, MatchPolicy::LeftmostLongest, 10);

        // NOTE(review): the expected spans (0..2 and 1..3) overlap, yet the
        // leftmost-longest resolver drops matches that start before the
        // previous match ends — verify these expectations.
        assert_eq!(matches.len(), 2);
        let spans: Vec<(usize, usize)> = matches.iter().map(|m| (m.start, m.end)).collect();
        assert_eq!(spans[0].0, 0);
        assert_eq!(spans[0].1, 2);
        assert_eq!(spans[1].0, 1);
        assert_eq!(spans[1].1, 3);
    }
}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
use serde::{Deserialize, Serialize};
|
|
2
|
+
use std::io::{Read, Write};
|
|
3
|
+
|
|
4
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
5
|
+
pub struct Payload {
|
|
6
|
+
pub phrase_id: u32,
|
|
7
|
+
pub salience: f32,
|
|
8
|
+
pub count: u32,
|
|
9
|
+
pub n: u8,
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
impl Payload {
|
|
13
|
+
#[allow(dead_code)]
|
|
14
|
+
pub fn new(phrase_id: u32, salience: f32, count: u32, n: u8) -> Self {
|
|
15
|
+
Self {
|
|
16
|
+
phrase_id,
|
|
17
|
+
salience,
|
|
18
|
+
count,
|
|
19
|
+
n,
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
pub fn salience_score(&self) -> f32 {
|
|
24
|
+
self.salience * ((self.count + 1) as f32).ln()
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
pub fn read_from<R: Read>(reader: &mut R) -> std::io::Result<Self> {
|
|
28
|
+
let mut buf = [0u8; 17];
|
|
29
|
+
reader.read_exact(&mut buf)?;
|
|
30
|
+
|
|
31
|
+
let phrase_id = u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]]);
|
|
32
|
+
let salience = f32::from_le_bytes([buf[4], buf[5], buf[6], buf[7]]);
|
|
33
|
+
let count = u32::from_le_bytes([buf[8], buf[9], buf[10], buf[11]]);
|
|
34
|
+
let n = buf[16];
|
|
35
|
+
|
|
36
|
+
Ok(Self {
|
|
37
|
+
phrase_id,
|
|
38
|
+
salience,
|
|
39
|
+
count,
|
|
40
|
+
n,
|
|
41
|
+
})
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
#[allow(dead_code)]
|
|
45
|
+
pub fn write_to<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
|
|
46
|
+
writer.write_all(&self.phrase_id.to_le_bytes())?;
|
|
47
|
+
writer.write_all(&self.salience.to_le_bytes())?;
|
|
48
|
+
writer.write_all(&self.count.to_le_bytes())?;
|
|
49
|
+
writer.write_all(&[0u8; 4])?;
|
|
50
|
+
writer.write_all(&[self.n])?;
|
|
51
|
+
Ok(())
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
pub fn load_payloads<R: Read>(mut reader: R) -> std::io::Result<Vec<Payload>> {
|
|
56
|
+
let mut payloads = Vec::new();
|
|
57
|
+
|
|
58
|
+
loop {
|
|
59
|
+
match Payload::read_from(&mut reader) {
|
|
60
|
+
Ok(payload) => payloads.push(payload),
|
|
61
|
+
Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break,
|
|
62
|
+
Err(e) => return Err(e),
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
Ok(payloads)
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// A record written by `write_to` decodes to the same fields via `read_from`.
    #[test]
    fn test_payload_roundtrip() {
        let mut encoded = Vec::new();
        Payload::new(12345, 2.13, 314, 2)
            .write_to(&mut encoded)
            .unwrap();

        let decoded = Payload::read_from(&mut std::io::Cursor::new(encoded)).unwrap();

        assert_eq!(decoded.phrase_id, 12345);
        assert_eq!(decoded.count, 314);
        assert_eq!(decoded.n, 2);
        assert!((decoded.salience - 2.13).abs() < 0.001);
    }

    /// With count 99 the score is `salience * ln(100)`.
    #[test]
    fn test_salience_score() {
        let expected = 2.0 * 100.0_f32.ln();
        let actual = Payload::new(1, 2.0, 99, 2).salience_score();
        assert!((actual - expected).abs() < 0.001);
    }
}
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
use crate::payload::Payload;
|
|
2
|
+
|
|
3
|
+
/// Strategy used to choose between overlapping matches.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MatchPolicy {
    /// Among matches sharing the earliest start, keep the longest.
    LeftmostLongest,
    /// Keep the first match in start order that does not overlap the previous pick.
    LeftmostFirst,
    /// Within a run of overlapping matches, keep the highest salience score.
    SalienceMax,
}

impl MatchPolicy {
    /// Parses a policy from its snake_case name.
    ///
    /// Recognised names are `"leftmost_longest"`, `"leftmost_first"` and
    /// `"salience_max"`; anything else yields `None`. The comparison is exact
    /// (case-sensitive, no trimming).
    pub fn from_str(s: &str) -> Option<Self> {
        const NAMED_POLICIES: [(&str, MatchPolicy); 3] = [
            ("leftmost_longest", MatchPolicy::LeftmostLongest),
            ("leftmost_first", MatchPolicy::LeftmostFirst),
            ("salience_max", MatchPolicy::SalienceMax),
        ];

        NAMED_POLICIES
            .iter()
            .find(|(name, _)| *name == s)
            .map(|&(_, policy)| policy)
    }
}
|
|
20
|
+
|
|
21
|
+
#[derive(Debug, Clone)]
|
|
22
|
+
pub struct Match {
|
|
23
|
+
pub start: usize,
|
|
24
|
+
pub end: usize,
|
|
25
|
+
#[allow(dead_code)]
|
|
26
|
+
pub pattern_id: usize,
|
|
27
|
+
pub payload: Payload,
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
impl Match {
|
|
31
|
+
pub fn new(start: usize, end: usize, pattern_id: usize, payload: Payload) -> Self {
|
|
32
|
+
Self {
|
|
33
|
+
start,
|
|
34
|
+
end,
|
|
35
|
+
pattern_id,
|
|
36
|
+
payload,
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
pub fn len(&self) -> usize {
|
|
41
|
+
self.end - self.start
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
pub fn overlaps(&self, other: &Match) -> bool {
|
|
45
|
+
!(self.end <= other.start || other.end <= self.start)
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
pub fn resolve_overlaps(mut matches: Vec<Match>, policy: MatchPolicy) -> Vec<Match> {
|
|
50
|
+
if matches.is_empty() {
|
|
51
|
+
return matches;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
matches.sort_by_key(|m| m.start);
|
|
55
|
+
|
|
56
|
+
match policy {
|
|
57
|
+
MatchPolicy::LeftmostLongest => resolve_leftmost_longest(matches),
|
|
58
|
+
MatchPolicy::LeftmostFirst => resolve_leftmost_first(matches),
|
|
59
|
+
MatchPolicy::SalienceMax => resolve_salience_max(matches),
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/// Leftmost-longest resolution over matches already sorted by `start`.
///
/// Walks the matches in order. A match that starts before the end of the
/// last kept match is skipped. Otherwise all matches sharing its start are
/// compared and the longest is kept; on a length tie the one appearing last
/// wins.
fn resolve_leftmost_longest(matches: Vec<Match>) -> Vec<Match> {
    let mut kept = Vec::new();
    let mut covered_until = 0;

    for (idx, head) in matches.iter().enumerate() {
        if head.start < covered_until {
            continue;
        }

        // The input is sorted, so matches sharing `head.start` are contiguous.
        let group_len = matches[idx..]
            .iter()
            .take_while(|m| m.start == head.start)
            .count();

        let winner = matches[idx..idx + group_len]
            .iter()
            .max_by_key(|m| m.len())
            .unwrap()
            .clone();

        covered_until = winner.end;
        kept.push(winner);
    }

    kept
}
|
|
90
|
+
|
|
91
|
+
/// Leftmost-first resolution over matches already sorted by `start`.
///
/// Keeps a match only if it starts at or after the end of the previously
/// kept match; everything else is discarded.
fn resolve_leftmost_first(matches: Vec<Match>) -> Vec<Match> {
    let mut next_free = 0;

    matches
        .into_iter()
        .filter(|candidate| {
            if candidate.start < next_free {
                return false;
            }
            next_free = candidate.end;
            true
        })
        .collect()
}
|
|
104
|
+
|
|
105
|
+
/// Salience-max resolution over matches already sorted by `start`.
///
/// Starting at the first unprocessed match (the anchor), gathers the
/// consecutive matches that overlap it and keeps the one with the highest
/// `salience_score()`. Incomparable scores (NaN) are treated as equal, and on
/// a tie the later match wins. Processing resumes at the first match that
/// starts at or after the winner's end.
///
/// The cursor always advances by at least one element, so a zero-length
/// winner cannot stall the loop.
fn resolve_salience_max(matches: Vec<Match>) -> Vec<Match> {
    let mut result = Vec::new();
    let mut i = 0;

    while i < matches.len() {
        let anchor = &matches[i];
        let cluster_len = 1 + matches[i + 1..]
            .iter()
            .take_while(|m| m.overlaps(anchor))
            .count();

        let best = matches[i..i + cluster_len]
            .iter()
            .max_by(|a, b| {
                a.payload
                    .salience_score()
                    .partial_cmp(&b.payload.salience_score())
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .expect("cluster always contains the anchor match");

        let best_end = best.end;
        // Single clone of the winner (the borrow into `matches` stays intact).
        result.push(best.clone());

        let next = matches[i..]
            .iter()
            .position(|m| m.start >= best_end)
            .map_or(matches.len(), |offset| i + offset);

        // Guarantee forward progress even if the winner has zero length.
        i = next.max(i + 1);
    }

    result
}
|
|
136
|
+
|
|
137
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a match over `[start, end)` with pattern id 0 and a payload
    /// whose `n` is the span width.
    fn make_match(start: usize, end: usize, salience: f32, count: u32) -> Match {
        let width = (end - start) as u8;
        let payload = Payload::new(0, salience, count, width);
        Match::new(start, end, 0, payload)
    }

    /// Builds equally weighted matches (salience 1.0, count 100) for `spans`.
    fn uniform_matches(spans: &[(usize, usize)]) -> Vec<Match> {
        spans
            .iter()
            .map(|&(start, end)| make_match(start, end, 1.0, 100))
            .collect()
    }

    #[test]
    fn test_leftmost_longest() {
        let candidates = uniform_matches(&[(0, 2), (0, 3), (5, 7)]);

        let resolved = resolve_overlaps(candidates, MatchPolicy::LeftmostLongest);

        assert_eq!(resolved.len(), 2);
        assert_eq!(resolved[0].len(), 3);
        assert_eq!(resolved[1].start, 5);
    }

    #[test]
    fn test_leftmost_first() {
        let candidates = uniform_matches(&[(0, 2), (1, 3), (3, 5)]);

        let resolved = resolve_overlaps(candidates, MatchPolicy::LeftmostFirst);

        assert_eq!(resolved.len(), 2);
        assert_eq!(resolved[0].end, 2);
        assert_eq!(resolved[1].start, 3);
    }

    #[test]
    fn test_salience_max() {
        let candidates = vec![
            make_match(0, 2, 1.0, 100),
            make_match(0, 3, 2.0, 200),
            make_match(5, 7, 1.0, 100),
        ];

        let resolved = resolve_overlaps(candidates, MatchPolicy::SalienceMax);

        assert_eq!(resolved.len(), 2);
        assert_eq!(resolved[0].len(), 3);
    }
}
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
require "shellwords"
|
|
2
|
+
|
|
3
|
+
module PhraseKit
  # Wrapper around the `phrasekit_mine` command-line binary.
  class Miner
    # Raised when the binary cannot be found or exits unsuccessfully.
    class Error < StandardError; end

    class << self
      # Runs the miner binary on +input_path+ and writes its result to
      # +output_path+.
      #
      # When +config_path+ is nil, a temporary JSON config is generated from
      # +min_n+, +max_n+ and +min_count+; otherwise those three are ignored
      # and the given config file is passed through as-is.
      #
      # Returns a Hash of the statistics parsed from the binary's output
      # (see #parse_stats).
      #
      # Raises Error if the binary is missing or exits with a non-zero status.
      def mine(input_path:, output_path:, min_n: 2, max_n: 5, min_count: 10, config_path: nil)
        config_file = nil
        binary_path = find_binary

        # Create temporary config if not provided
        if config_path.nil?
          require "tempfile"
          require "json"

          config_file = Tempfile.new(["mine_config", ".json"])
          config_file.write(JSON.generate({
            min_n: min_n,
            max_n: max_n,
            min_count: min_count
          }))
          config_file.flush
          config_path = config_file.path
        end

        # Run mining; stderr is folded into stdout so failures carry the
        # binary's own message.
        cmd = [binary_path, input_path.to_s, config_path.to_s, output_path.to_s]
        output = `#{cmd.shelljoin} 2>&1`

        raise Error, "Mining failed: #{output}" unless $?.success?

        # Parse stats from output
        parse_stats(output)
      ensure
        # Remove the generated config on every exit path, including
        # unexpected exceptions, so no temp file is left behind.
        config_file&.close!
      end

      private

      # Returns the path of the first existing, executable `phrasekit_mine`
      # candidate; raises Error when none is found.
      def find_binary
        # Search paths relative to this file
        # __dir__ is lib/phrasekit, so go up 2 levels to get to gem root
        base_dir = File.expand_path("../..", __dir__)

        candidates = [
          File.join(base_dir, "ext/phrasekit/target/release/phrasekit_mine"),
          File.join(base_dir, "ext/phrasekit/target/debug/phrasekit_mine"),
          # For installed gems
          File.join(base_dir, "bin/phrasekit_mine")
        ]

        candidates.each do |binary|
          return binary if File.exist?(binary) && File.executable?(binary)
        end

        raise Error, "phrasekit_mine binary not found. Run: cargo build --release --bin phrasekit_mine --manifest-path ext/phrasekit/Cargo.toml"
      end

      # Extracts the numeric statistics printed by the binary. Keys whose
      # line is absent from +output+ are simply missing from the result; if a
      # line occurs more than once the last occurrence wins.
      def parse_stats(output)
        stats = {}

        output.scan(/Total documents:\s+(\d+)/) { stats[:total_docs] = $1.to_i }
        output.scan(/Total tokens:\s+(\d+)/) { stats[:total_tokens] = $1.to_i }
        output.scan(/Unique n-grams:\s+(\d+)/) { stats[:unique_ngrams] = $1.to_i }
        output.scan(/After min_count=\d+:\s+(\d+)/) { stats[:ngrams_after_filter] = $1.to_i }

        stats
      end
    end
  end
end
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
require "shellwords"
|
|
2
|
+
|
|
3
|
+
module PhraseKit
  # Wrapper around the `phrasekit_score` command-line binary.
  class Scorer
    # Raised when the binary cannot be found or exits unsuccessfully.
    class Error < StandardError; end

    class << self
      # Runs the scorer binary on +domain_path+ and +background_path+ and
      # writes its result to +output_path+.
      #
      # When +config_path+ is nil, a temporary JSON config is generated from
      # the remaining keyword arguments; otherwise those are ignored and the
      # given config file is passed through as-is.
      #
      # Returns a Hash of the statistics parsed from the binary's output
      # (see #parse_stats).
      #
      # Raises Error if the binary is missing or exits with a non-zero status.
      def score(
        domain_path:,
        background_path:,
        output_path:,
        method: :ratio,
        min_salience: 2.0,
        min_domain_count: 10,
        assign_phrase_ids: true,
        starting_phrase_id: 1000,
        config_path: nil
      )
        config_file = nil
        binary_path = find_binary

        # Create temporary config if not provided
        if config_path.nil?
          require "tempfile"
          require "json"

          config_file = Tempfile.new(["score_config", ".json"])
          config_file.write(JSON.generate({
            method: method.to_s,
            min_salience: min_salience,
            min_domain_count: min_domain_count,
            assign_phrase_ids: assign_phrase_ids,
            starting_phrase_id: starting_phrase_id
          }))
          config_file.flush
          config_path = config_file.path
        end

        # Run scoring; stderr is folded into stdout so failures carry the
        # binary's own message.
        cmd = [
          binary_path,
          domain_path.to_s,
          background_path.to_s,
          config_path.to_s,
          output_path.to_s
        ]
        output = `#{cmd.shelljoin} 2>&1`

        raise Error, "Scoring failed: #{output}" unless $?.success?

        # Parse stats from output
        parse_stats(output)
      ensure
        # Remove the generated config on every exit path, including
        # unexpected exceptions, so no temp file is left behind.
        config_file&.close!
      end

      private

      # Returns the path of the first existing, executable `phrasekit_score`
      # candidate; raises Error when none is found.
      def find_binary
        # Search paths relative to this file
        # __dir__ is lib/phrasekit, so go up 2 levels to get to gem root
        base_dir = File.expand_path("../..", __dir__)

        candidates = [
          File.join(base_dir, "ext/phrasekit/target/release/phrasekit_score"),
          File.join(base_dir, "ext/phrasekit/target/debug/phrasekit_score"),
          # For installed gems
          File.join(base_dir, "bin/phrasekit_score")
        ]

        candidates.each do |binary|
          return binary if File.exist?(binary) && File.executable?(binary)
        end

        raise Error, "phrasekit_score binary not found. Run: cargo build --release --bin phrasekit_score --manifest-path ext/phrasekit/Cargo.toml"
      end

      # Extracts the numeric statistics printed by the binary. Keys whose
      # line is absent from +output+ are simply missing from the result; if a
      # line occurs more than once the last occurrence wins.
      def parse_stats(output)
        stats = {}

        output.scan(/Domain phrases:\s+(\d+)/) { stats[:domain_phrases] = $1.to_i }
        output.scan(/Background phrases:\s+(\d+)/) { stats[:background_phrases] = $1.to_i }
        output.scan(/After domain filter:\s+(\d+)/) { stats[:after_domain_filter] = $1.to_i }
        output.scan(/After salience filter:\s+(\d+)/) { stats[:after_salience_filter] = $1.to_i }

        stats
      end
    end
  end
end
|