phrasekit 0.2.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +131 -0
- data/ext/phrasekit/Cargo.toml +45 -0
- data/ext/phrasekit/extconf.rb +4 -0
- data/ext/phrasekit/src/bin/fixture_builder.rs +131 -0
- data/ext/phrasekit/src/bin/phrasekit_build.rs +326 -0
- data/ext/phrasekit/src/bin/phrasekit_mine.rs +199 -0
- data/ext/phrasekit/src/bin/phrasekit_score.rs +298 -0
- data/ext/phrasekit/src/bin/phrasekit_tag.rs +320 -0
- data/ext/phrasekit/src/lib.rs +104 -0
- data/ext/phrasekit/src/manifest.rs +88 -0
- data/ext/phrasekit/src/matcher.rs +227 -0
- data/ext/phrasekit/src/payload.rs +95 -0
- data/ext/phrasekit/src/policy.rs +190 -0
- data/lib/phrasekit/3.1/phrasekit.so +0 -0
- data/lib/phrasekit/3.2/phrasekit.so +0 -0
- data/lib/phrasekit/3.3/phrasekit.so +0 -0
- data/lib/phrasekit/3.4/phrasekit.so +0 -0
- data/lib/phrasekit/miner.rb +74 -0
- data/lib/phrasekit/scorer.rb +92 -0
- data/lib/phrasekit/tagger.rb +100 -0
- data/lib/phrasekit/version.rb +3 -0
- data/lib/phrasekit.rb +100 -0
- data/lib/spellkit_stub.rb +80 -0
- metadata +156 -0
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
use serde::{Deserialize, Serialize};
|
|
2
|
+
use std::collections::HashMap;
|
|
3
|
+
use std::fs::File;
|
|
4
|
+
use std::io::{BufRead, BufReader, BufWriter, Write};
|
|
5
|
+
|
|
6
|
+
/// One corpus record, deserialized from a single JSON line of the corpus file.
#[derive(Debug, Deserialize)]
|
|
7
|
+
struct Document {
|
|
8
|
+
/// Token sequence of the document; n-grams are sliced out of this list.
tokens: Vec<String>,
|
|
9
|
+
/// Optional document identifier; `None` when the key is absent from the JSON line.
#[serde(default)]
|
|
10
|
+
doc_id: Option<String>,
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
/// Mining parameters read from the JSON config file; every key is optional.
#[derive(Debug, Deserialize)]
|
|
14
|
+
struct MineConfig {
|
|
15
|
+
/// Shortest n-gram length to extract (falls back to `default_min_n`, i.e. 2).
#[serde(default = "default_min_n")]
|
|
16
|
+
min_n: usize,
|
|
17
|
+
/// Longest n-gram length to extract (falls back to `default_max_n`, i.e. 5).
#[serde(default = "default_max_n")]
|
|
18
|
+
max_n: usize,
|
|
19
|
+
/// N-grams seen fewer times than this are dropped when writing (falls back to 10).
#[serde(default = "default_min_count")]
|
|
20
|
+
min_count: u32,
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/// Serde fallback for `MineConfig::min_n` when the config omits the key.
fn default_min_n() -> usize {
    const FALLBACK_MIN_N: usize = 2;
    FALLBACK_MIN_N
}
|
|
26
|
+
|
|
27
|
+
/// Serde fallback for `MineConfig::max_n` when the config omits the key.
fn default_max_n() -> usize {
    const FALLBACK_MAX_N: usize = 5;
    FALLBACK_MAX_N
}
|
|
30
|
+
|
|
31
|
+
/// Serde fallback for `MineConfig::min_count` when the config omits the key.
fn default_min_count() -> u32 {
    const FALLBACK_MIN_COUNT: u32 = 10;
    FALLBACK_MIN_COUNT
}
|
|
34
|
+
|
|
35
|
+
/// One output record: an n-gram and how often it occurred, serialized as a JSON line.
#[derive(Debug, Serialize)]
|
|
36
|
+
struct Ngram {
|
|
37
|
+
/// Lowercased tokens that make up the n-gram.
tokens: Vec<String>,
|
|
38
|
+
/// Number of occurrences counted across the corpus.
count: u32,
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/// Counters gathered during a mining run and printed in the final summary.
#[derive(Debug)]
|
|
42
|
+
struct MiningStats {
|
|
43
|
+
/// Corpus lines that parsed successfully as documents.
total_docs: usize,
|
|
44
|
+
/// Sum of the token counts of all parsed documents.
total_tokens: usize,
|
|
45
|
+
/// Every n-gram occurrence extracted, duplicates included.
total_ngrams_extracted: usize,
|
|
46
|
+
/// Number of distinct n-grams seen.
unique_ngrams: usize,
|
|
47
|
+
/// Number of n-grams written after the `min_count` filter.
ngrams_after_filter: usize,
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
51
|
+
let args: Vec<String> = std::env::args().collect();
|
|
52
|
+
|
|
53
|
+
if args.len() < 4 {
|
|
54
|
+
eprintln!("Usage: phrasekit_mine <corpus.jsonl> <config.json> <output.jsonl>");
|
|
55
|
+
eprintln!("\nExample:");
|
|
56
|
+
eprintln!(" phrasekit_mine corpus.jsonl mine_config.json candidate_phrases.jsonl");
|
|
57
|
+
std::process::exit(1);
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
let corpus_path = &args[1];
|
|
61
|
+
let config_path = &args[2];
|
|
62
|
+
let output_path = &args[3];
|
|
63
|
+
|
|
64
|
+
println!("š PhraseKit N-gram Miner");
|
|
65
|
+
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
|
|
66
|
+
println!("Corpus: {}", corpus_path);
|
|
67
|
+
println!("Config: {}", config_path);
|
|
68
|
+
println!("Output: {}", output_path);
|
|
69
|
+
println!();
|
|
70
|
+
|
|
71
|
+
// Load config
|
|
72
|
+
let config = load_config(config_path)?;
|
|
73
|
+
println!("ā Loaded config:");
|
|
74
|
+
println!(" min_n: {}", config.min_n);
|
|
75
|
+
println!(" max_n: {}", config.max_n);
|
|
76
|
+
println!(" min_count: {}", config.min_count);
|
|
77
|
+
|
|
78
|
+
if config.min_n < 1 || config.max_n > 10 || config.min_n > config.max_n {
|
|
79
|
+
return Err("Invalid config: min_n must be >= 1, max_n must be <= 10, and min_n <= max_n".into());
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
// Mine n-grams
|
|
83
|
+
println!("\nš Mining n-grams...");
|
|
84
|
+
let (ngram_counts, mut stats) = mine_ngrams(corpus_path, &config)?;
|
|
85
|
+
|
|
86
|
+
// Write results
|
|
87
|
+
println!("\nš¾ Writing results...");
|
|
88
|
+
stats.ngrams_after_filter = write_ngrams(output_path, ngram_counts, config.min_count)?;
|
|
89
|
+
|
|
90
|
+
// Summary
|
|
91
|
+
println!("\nā
Mining complete!");
|
|
92
|
+
println!("\nš Statistics:");
|
|
93
|
+
println!(" Total documents: {}", stats.total_docs);
|
|
94
|
+
println!(" Total tokens: {}", stats.total_tokens);
|
|
95
|
+
println!(" N-grams extracted: {}", stats.total_ngrams_extracted);
|
|
96
|
+
println!(" Unique n-grams: {}", stats.unique_ngrams);
|
|
97
|
+
println!(" After min_count={}: {}", config.min_count, stats.ngrams_after_filter);
|
|
98
|
+
println!("\nš” Next step: Run salience scoring on {}", output_path);
|
|
99
|
+
|
|
100
|
+
Ok(())
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
fn load_config(path: &str) -> Result<MineConfig, Box<dyn std::error::Error>> {
|
|
104
|
+
let file = File::open(path)?;
|
|
105
|
+
let config: MineConfig = serde_json::from_reader(file)?;
|
|
106
|
+
Ok(config)
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
fn mine_ngrams(
|
|
110
|
+
corpus_path: &str,
|
|
111
|
+
config: &MineConfig,
|
|
112
|
+
) -> Result<(HashMap<Vec<String>, u32>, MiningStats), Box<dyn std::error::Error>> {
|
|
113
|
+
let file = File::open(corpus_path)?;
|
|
114
|
+
let reader = BufReader::new(file);
|
|
115
|
+
|
|
116
|
+
let mut ngram_counts: HashMap<Vec<String>, u32> = HashMap::new();
|
|
117
|
+
let mut stats = MiningStats {
|
|
118
|
+
total_docs: 0,
|
|
119
|
+
total_tokens: 0,
|
|
120
|
+
total_ngrams_extracted: 0,
|
|
121
|
+
unique_ngrams: 0,
|
|
122
|
+
ngrams_after_filter: 0,
|
|
123
|
+
};
|
|
124
|
+
|
|
125
|
+
for (line_num, line) in reader.lines().enumerate() {
|
|
126
|
+
let line = line?;
|
|
127
|
+
|
|
128
|
+
if line.trim().is_empty() {
|
|
129
|
+
continue;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
let doc: Document = match serde_json::from_str(&line) {
|
|
133
|
+
Ok(d) => d,
|
|
134
|
+
Err(e) => {
|
|
135
|
+
eprintln!("ā ļø Line {}: Failed to parse: {}", line_num + 1, e);
|
|
136
|
+
continue;
|
|
137
|
+
}
|
|
138
|
+
};
|
|
139
|
+
|
|
140
|
+
stats.total_docs += 1;
|
|
141
|
+
stats.total_tokens += doc.tokens.len();
|
|
142
|
+
|
|
143
|
+
// Extract n-grams from document
|
|
144
|
+
for n in config.min_n..=config.max_n {
|
|
145
|
+
if doc.tokens.len() < n {
|
|
146
|
+
continue;
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
for i in 0..=(doc.tokens.len() - n) {
|
|
150
|
+
let ngram: Vec<String> = doc.tokens[i..i + n]
|
|
151
|
+
.iter()
|
|
152
|
+
.map(|t| t.to_lowercase())
|
|
153
|
+
.collect();
|
|
154
|
+
|
|
155
|
+
*ngram_counts.entry(ngram).or_insert(0) += 1;
|
|
156
|
+
stats.total_ngrams_extracted += 1;
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
if stats.total_docs % 10000 == 0 {
|
|
161
|
+
println!(" Processed {} documents...", stats.total_docs);
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
stats.unique_ngrams = ngram_counts.len();
|
|
166
|
+
println!(" ā Processed {} documents", stats.total_docs);
|
|
167
|
+
println!(" ā Extracted {} unique n-grams", stats.unique_ngrams);
|
|
168
|
+
|
|
169
|
+
Ok((ngram_counts, stats))
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
fn write_ngrams(
|
|
173
|
+
output_path: &str,
|
|
174
|
+
ngram_counts: HashMap<Vec<String>, u32>,
|
|
175
|
+
min_count: u32,
|
|
176
|
+
) -> Result<usize, Box<dyn std::error::Error>> {
|
|
177
|
+
let file = File::create(output_path)?;
|
|
178
|
+
let mut writer = BufWriter::new(file);
|
|
179
|
+
|
|
180
|
+
// Sort by count (descending) for better readability
|
|
181
|
+
let mut ngrams: Vec<(Vec<String>, u32)> = ngram_counts
|
|
182
|
+
.into_iter()
|
|
183
|
+
.filter(|(_, count)| *count >= min_count)
|
|
184
|
+
.collect();
|
|
185
|
+
|
|
186
|
+
ngrams.sort_by(|a, b| b.1.cmp(&a.1));
|
|
187
|
+
|
|
188
|
+
let count = ngrams.len();
|
|
189
|
+
for (tokens, count) in ngrams {
|
|
190
|
+
let ngram = Ngram { tokens, count };
|
|
191
|
+
let json = serde_json::to_string(&ngram)?;
|
|
192
|
+
writeln!(writer, "{}", json)?;
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
writer.flush()?;
|
|
196
|
+
println!(" ā Wrote {} n-grams to {}", count, output_path);
|
|
197
|
+
|
|
198
|
+
Ok(count)
|
|
199
|
+
}
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
use serde::{Deserialize, Serialize};
|
|
2
|
+
use std::collections::HashMap;
|
|
3
|
+
use std::fs::File;
|
|
4
|
+
use std::io::{BufRead, BufReader, BufWriter, Write};
|
|
5
|
+
|
|
6
|
+
/// One input record: an n-gram with its count, parsed from a single JSON line.
#[derive(Debug, Deserialize)]
|
|
7
|
+
struct InputNgram {
|
|
8
|
+
/// Tokens of the n-gram; lowercased by `load_phrases` before use as a map key.
tokens: Vec<String>,
|
|
9
|
+
/// Occurrence count recorded for the n-gram.
count: u32,
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
/// Scoring parameters read from the JSON config file; every key is optional.
#[derive(Debug, Deserialize)]
|
|
13
|
+
struct ScoreConfig {
|
|
14
|
+
/// Salience formula: "ratio", "pmi" or "tfidf" (falls back to "ratio").
#[serde(default = "default_method")]
|
|
15
|
+
method: String,
|
|
16
|
+
/// Phrases scoring below this value are dropped (falls back to 2.0).
#[serde(default = "default_min_salience")]
|
|
17
|
+
min_salience: f32,
|
|
18
|
+
/// Phrases with a smaller domain count are dropped before scoring (falls back to 10).
#[serde(default = "default_min_domain_count")]
|
|
19
|
+
min_domain_count: u32,
|
|
20
|
+
/// Whether `write_phrases` assigns sequential phrase IDs (falls back to true).
#[serde(default = "default_assign_phrase_ids")]
|
|
21
|
+
assign_phrase_ids: bool,
|
|
22
|
+
/// ID given to the first written phrase when IDs are assigned (falls back to 1000).
#[serde(default = "default_starting_phrase_id")]
|
|
23
|
+
starting_phrase_id: u32,
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
/// Serde fallback for `ScoreConfig::method` when the config omits the key.
fn default_method() -> String {
    String::from("ratio")
}
|
|
29
|
+
|
|
30
|
+
/// Serde fallback for `ScoreConfig::min_salience` when the config omits the key.
fn default_min_salience() -> f32 {
    const FALLBACK_MIN_SALIENCE: f32 = 2.0;
    FALLBACK_MIN_SALIENCE
}
|
|
33
|
+
|
|
34
|
+
/// Serde fallback for `ScoreConfig::min_domain_count` when the config omits the key.
fn default_min_domain_count() -> u32 {
    const FALLBACK_MIN_DOMAIN_COUNT: u32 = 10;
    FALLBACK_MIN_DOMAIN_COUNT
}
|
|
37
|
+
|
|
38
|
+
/// Serde fallback for `ScoreConfig::assign_phrase_ids` when the config omits the key.
fn default_assign_phrase_ids() -> bool {
    const FALLBACK_ASSIGN_IDS: bool = true;
    FALLBACK_ASSIGN_IDS
}
|
|
41
|
+
|
|
42
|
+
/// Serde fallback for `ScoreConfig::starting_phrase_id` when the config omits the key.
fn default_starting_phrase_id() -> u32 {
    const FALLBACK_FIRST_ID: u32 = 1000;
    FALLBACK_FIRST_ID
}
|
|
45
|
+
|
|
46
|
+
/// One scored phrase as written to the output file, one JSON object per line.
#[derive(Debug, Serialize, Deserialize)]
|
|
47
|
+
struct OutputPhrase {
|
|
48
|
+
/// Lowercased tokens of the phrase.
tokens: Vec<String>,
|
|
49
|
+
/// Salience score computed by the configured method.
salience: f32,
|
|
50
|
+
/// Sequential ID set by `write_phrases`; left out of the JSON when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
|
|
51
|
+
phrase_id: Option<u32>,
|
|
52
|
+
/// Count of the phrase in the domain input.
domain_count: u32,
|
|
53
|
+
/// Count of the phrase in the background input (0 when it does not appear there).
background_count: u32,
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/// Counters gathered during a scoring run and printed in the final summary.
#[derive(Debug)]
|
|
57
|
+
struct ScoringStats {
|
|
58
|
+
/// Number of distinct phrases loaded from the domain input.
domain_phrases: usize,
|
|
59
|
+
/// Number of distinct phrases loaded from the background input.
background_phrases: usize,
|
|
60
|
+
/// Phrases remaining after the `min_domain_count` filter.
after_domain_filter: usize,
|
|
61
|
+
/// Phrases remaining after the `min_salience` filter.
after_salience_filter: usize,
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
65
|
+
let args: Vec<String> = std::env::args().collect();
|
|
66
|
+
|
|
67
|
+
if args.len() < 5 {
|
|
68
|
+
eprintln!("Usage: phrasekit_score <domain.jsonl> <background.jsonl> <config.json> <output.jsonl>");
|
|
69
|
+
eprintln!("\nExample:");
|
|
70
|
+
eprintln!(" phrasekit_score candidate_phrases.jsonl background_phrases.jsonl score_config.json phrases.jsonl");
|
|
71
|
+
std::process::exit(1);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
let domain_path = &args[1];
|
|
75
|
+
let background_path = &args[2];
|
|
76
|
+
let config_path = &args[3];
|
|
77
|
+
let output_path = &args[4];
|
|
78
|
+
|
|
79
|
+
println!("šÆ PhraseKit Salience Scoring");
|
|
80
|
+
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
|
|
81
|
+
println!("Domain: {}", domain_path);
|
|
82
|
+
println!("Background: {}", background_path);
|
|
83
|
+
println!("Config: {}", config_path);
|
|
84
|
+
println!("Output: {}", output_path);
|
|
85
|
+
println!();
|
|
86
|
+
|
|
87
|
+
// Load config
|
|
88
|
+
let config = load_config(config_path)?;
|
|
89
|
+
println!("ā Loaded config:");
|
|
90
|
+
println!(" method: {}", config.method);
|
|
91
|
+
println!(" min_salience: {}", config.min_salience);
|
|
92
|
+
println!(" min_domain_count: {}", config.min_domain_count);
|
|
93
|
+
|
|
94
|
+
// Validate method
|
|
95
|
+
if !["ratio", "pmi", "tfidf"].contains(&config.method.as_str()) {
|
|
96
|
+
return Err(format!("Invalid method: {}. Must be 'ratio', 'pmi', or 'tfidf'", config.method).into());
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
// Load phrases
|
|
100
|
+
println!("\nš Loading phrases...");
|
|
101
|
+
let domain_phrases = load_phrases(domain_path)?;
|
|
102
|
+
println!(" ā Loaded {} domain phrases", domain_phrases.len());
|
|
103
|
+
|
|
104
|
+
let background_phrases = load_phrases(background_path)?;
|
|
105
|
+
println!(" ā Loaded {} background phrases", background_phrases.len());
|
|
106
|
+
|
|
107
|
+
// Score and filter
|
|
108
|
+
println!("\nšÆ Scoring...");
|
|
109
|
+
let (scored_phrases, stats) = score_phrases(domain_phrases, background_phrases, &config)?;
|
|
110
|
+
|
|
111
|
+
// Write output
|
|
112
|
+
println!("\nš¾ Writing results...");
|
|
113
|
+
write_phrases(output_path, scored_phrases, &config)?;
|
|
114
|
+
|
|
115
|
+
// Summary
|
|
116
|
+
println!("\nā
Scoring complete!");
|
|
117
|
+
println!("\nš Statistics:");
|
|
118
|
+
println!(" Domain phrases: {}", stats.domain_phrases);
|
|
119
|
+
println!(" Background phrases: {}", stats.background_phrases);
|
|
120
|
+
println!(" After domain filter: {}", stats.after_domain_filter);
|
|
121
|
+
println!(" After salience filter: {}", stats.after_salience_filter);
|
|
122
|
+
|
|
123
|
+
if config.assign_phrase_ids && stats.after_salience_filter > 0 {
|
|
124
|
+
let end_id = config.starting_phrase_id + stats.after_salience_filter as u32 - 1;
|
|
125
|
+
println!(" Phrase IDs assigned: {} - {}", config.starting_phrase_id, end_id);
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
println!("\nš” Next step: Build matching artifacts with phrasekit_build");
|
|
129
|
+
|
|
130
|
+
Ok(())
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
fn load_config(path: &str) -> Result<ScoreConfig, Box<dyn std::error::Error>> {
|
|
134
|
+
let file = File::open(path)?;
|
|
135
|
+
let config: ScoreConfig = serde_json::from_reader(file)?;
|
|
136
|
+
Ok(config)
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
fn load_phrases(path: &str) -> Result<HashMap<Vec<String>, u32>, Box<dyn std::error::Error>> {
|
|
140
|
+
let file = File::open(path)?;
|
|
141
|
+
let reader = BufReader::new(file);
|
|
142
|
+
let mut phrases = HashMap::new();
|
|
143
|
+
|
|
144
|
+
for (line_num, line) in reader.lines().enumerate() {
|
|
145
|
+
let line = line?;
|
|
146
|
+
if line.trim().is_empty() {
|
|
147
|
+
continue;
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
let ngram: InputNgram = match serde_json::from_str(&line) {
|
|
151
|
+
Ok(n) => n,
|
|
152
|
+
Err(e) => {
|
|
153
|
+
eprintln!("ā ļø Line {}: Failed to parse: {}", line_num + 1, e);
|
|
154
|
+
continue;
|
|
155
|
+
}
|
|
156
|
+
};
|
|
157
|
+
|
|
158
|
+
// Normalize to lowercase
|
|
159
|
+
let tokens: Vec<String> = ngram.tokens.iter().map(|t| t.to_lowercase()).collect();
|
|
160
|
+
phrases.insert(tokens, ngram.count);
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
Ok(phrases)
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
fn score_phrases(
|
|
167
|
+
domain_phrases: HashMap<Vec<String>, u32>,
|
|
168
|
+
background_phrases: HashMap<Vec<String>, u32>,
|
|
169
|
+
config: &ScoreConfig,
|
|
170
|
+
) -> Result<(Vec<OutputPhrase>, ScoringStats), Box<dyn std::error::Error>> {
|
|
171
|
+
let mut scored = Vec::new();
|
|
172
|
+
let mut stats = ScoringStats {
|
|
173
|
+
domain_phrases: domain_phrases.len(),
|
|
174
|
+
background_phrases: background_phrases.len(),
|
|
175
|
+
after_domain_filter: 0,
|
|
176
|
+
after_salience_filter: 0,
|
|
177
|
+
};
|
|
178
|
+
|
|
179
|
+
// Compute total counts for PMI
|
|
180
|
+
let total_domain: u64 = domain_phrases.values().map(|&c| c as u64).sum();
|
|
181
|
+
let total_background: u64 = background_phrases.values().map(|&c| c as u64).sum();
|
|
182
|
+
|
|
183
|
+
for (tokens, domain_count) in domain_phrases {
|
|
184
|
+
// Filter by minimum domain count
|
|
185
|
+
if domain_count < config.min_domain_count {
|
|
186
|
+
continue;
|
|
187
|
+
}
|
|
188
|
+
stats.after_domain_filter += 1;
|
|
189
|
+
|
|
190
|
+
// Get background count (default to 0 if not found)
|
|
191
|
+
let background_count = background_phrases.get(&tokens).copied().unwrap_or(0);
|
|
192
|
+
|
|
193
|
+
// Compute salience based on method
|
|
194
|
+
let salience = match config.method.as_str() {
|
|
195
|
+
"ratio" => compute_ratio_salience(domain_count, background_count),
|
|
196
|
+
"pmi" => compute_pmi_salience(
|
|
197
|
+
domain_count,
|
|
198
|
+
background_count,
|
|
199
|
+
total_domain,
|
|
200
|
+
total_background,
|
|
201
|
+
),
|
|
202
|
+
"tfidf" => compute_tfidf_salience(domain_count, background_count, total_domain),
|
|
203
|
+
_ => unreachable!(),
|
|
204
|
+
};
|
|
205
|
+
|
|
206
|
+
// Filter by minimum salience
|
|
207
|
+
if salience < config.min_salience {
|
|
208
|
+
continue;
|
|
209
|
+
}
|
|
210
|
+
stats.after_salience_filter += 1;
|
|
211
|
+
|
|
212
|
+
scored.push(OutputPhrase {
|
|
213
|
+
tokens,
|
|
214
|
+
salience,
|
|
215
|
+
phrase_id: None, // Will be assigned later if needed
|
|
216
|
+
domain_count,
|
|
217
|
+
background_count,
|
|
218
|
+
});
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
// Sort by salience (descending)
|
|
222
|
+
scored.sort_by(|a, b| b.salience.partial_cmp(&a.salience).unwrap());
|
|
223
|
+
|
|
224
|
+
Ok((scored, stats))
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
/// Ratio salience: `domain_count / (background_count + 1)`.
///
/// The `+ 1` keeps the divisor non-zero for phrases absent from the background. The sum is
/// formed in u64 so a `background_count` of `u32::MAX` cannot overflow (which would panic in
/// debug builds and wrap to a zero divisor in release builds).
fn compute_ratio_salience(domain_count: u32, background_count: u32) -> f32 {
    let smoothed_background = u64::from(background_count) + 1;
    domain_count as f32 / smoothed_background as f32
}
|
|
230
|
+
|
|
231
|
+
/// PMI-style salience: log2 of the phrase's share of the domain total divided by its share
/// of the background total.
///
/// A phrase that never occurs in the background gets the fixed score 10.0 instead of a
/// division by zero.
fn compute_pmi_salience(
    domain_count: u32,
    background_count: u32,
    total_domain: u64,
    total_background: u64,
) -> f32 {
    match background_count {
        // High salience for phrases not in background
        0 => 10.0,
        seen_in_background => {
            let domain_share = f64::from(domain_count) / total_domain as f64;
            let background_share = f64::from(seen_in_background) / total_background as f64;
            (domain_share / background_share).log2() as f32
        }
    }
}
|
|
247
|
+
|
|
248
|
+
/// TF-IDF-style salience: `tf * idf`, where `tf = domain_count / total_domain` and
/// `idf = ln((total_domain + 1) / (background_count + 1))`.
///
/// Returns 0.0 when `total_domain` is 0, since the term frequency would otherwise be the
/// NaN result of 0/0. The `+ 1` terms are formed without overflow, so a `background_count`
/// of `u32::MAX` neither panics nor wraps to a zero divisor.
fn compute_tfidf_salience(domain_count: u32, background_count: u32, total_domain: u64) -> f32 {
    if total_domain == 0 {
        return 0.0;
    }

    let tf = domain_count as f32 / total_domain as f32;
    let smoothed_total = total_domain.saturating_add(1);
    let smoothed_background = u64::from(background_count) + 1;
    let idf = (smoothed_total as f32 / smoothed_background as f32).ln();
    tf * idf
}
|
|
253
|
+
|
|
254
|
+
fn write_phrases(
|
|
255
|
+
output_path: &str,
|
|
256
|
+
mut phrases: Vec<OutputPhrase>,
|
|
257
|
+
config: &ScoreConfig,
|
|
258
|
+
) -> Result<(), Box<dyn std::error::Error>> {
|
|
259
|
+
let file = File::create(output_path)?;
|
|
260
|
+
let mut writer = BufWriter::new(file);
|
|
261
|
+
|
|
262
|
+
// Assign phrase IDs if requested
|
|
263
|
+
if config.assign_phrase_ids {
|
|
264
|
+
for (i, phrase) in phrases.iter_mut().enumerate() {
|
|
265
|
+
phrase.phrase_id = Some(config.starting_phrase_id + i as u32);
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
let count = phrases.len();
|
|
270
|
+
for phrase in phrases {
|
|
271
|
+
let json = serde_json::to_string(&phrase)?;
|
|
272
|
+
writeln!(writer, "{}", json)?;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
writer.flush()?;
|
|
276
|
+
println!(" ā Wrote {} phrases to {}", count, output_path);
|
|
277
|
+
|
|
278
|
+
// Print top 10 phrases
|
|
279
|
+
if count > 0 {
|
|
280
|
+
println!("\nš Top phrases by salience:");
|
|
281
|
+
let output_file = File::open(output_path)?;
|
|
282
|
+
let reader = BufReader::new(output_file);
|
|
283
|
+
for (i, line) in reader.lines().enumerate().take(10) {
|
|
284
|
+
let line = line?;
|
|
285
|
+
let phrase: OutputPhrase = serde_json::from_str(&line)?;
|
|
286
|
+
println!(
|
|
287
|
+
" {}. {} ā salience={:.2}, domain={}, background={}",
|
|
288
|
+
i + 1,
|
|
289
|
+
phrase.tokens.join(" "),
|
|
290
|
+
phrase.salience,
|
|
291
|
+
phrase.domain_count,
|
|
292
|
+
phrase.background_count
|
|
293
|
+
);
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
Ok(())
|
|
298
|
+
}
|