phrasekit 0.2.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,199 @@
1
+ use serde::{Deserialize, Serialize};
2
+ use std::collections::HashMap;
3
+ use std::fs::File;
4
+ use std::io::{BufRead, BufReader, BufWriter, Write};
5
+
6
+ #[derive(Debug, Deserialize)]
7
+ struct Document {
8
+ tokens: Vec<String>,
9
+ #[serde(default)]
10
+ doc_id: Option<String>,
11
+ }
12
+
13
+ #[derive(Debug, Deserialize)]
14
+ struct MineConfig {
15
+ #[serde(default = "default_min_n")]
16
+ min_n: usize,
17
+ #[serde(default = "default_max_n")]
18
+ max_n: usize,
19
+ #[serde(default = "default_min_count")]
20
+ min_count: u32,
21
+ }
22
+
23
/// Fallback for `MineConfig::min_n` when the config file omits it.
fn default_min_n() -> usize {
    const FALLBACK_MIN_N: usize = 2;
    FALLBACK_MIN_N
}
26
+
27
/// Fallback for `MineConfig::max_n` when the config file omits it.
fn default_max_n() -> usize {
    const FALLBACK_MAX_N: usize = 5;
    FALLBACK_MAX_N
}
30
+
31
/// Fallback for `MineConfig::min_count` when the config file omits it.
fn default_min_count() -> u32 {
    const FALLBACK_MIN_COUNT: u32 = 10;
    FALLBACK_MIN_COUNT
}
34
+
35
+ #[derive(Debug, Serialize)]
36
+ struct Ngram {
37
+ tokens: Vec<String>,
38
+ count: u32,
39
+ }
40
+
41
/// Counters collected during a mining run and printed in the final summary.
#[derive(Debug)]
struct MiningStats {
    /// Corpus lines that parsed successfully as documents.
    total_docs: usize,

    /// Sum of the token counts of all parsed documents.
    total_tokens: usize,

    /// Every n-gram occurrence counted, duplicates included.
    total_ngrams_extracted: usize,

    /// Distinct n-grams seen, before the `min_count` filter is applied.
    unique_ngrams: usize,

    /// Distinct n-grams that passed the `min_count` filter and were written.
    ngrams_after_filter: usize,
}
49
+
50
+ fn main() -> Result<(), Box<dyn std::error::Error>> {
51
+ let args: Vec<String> = std::env::args().collect();
52
+
53
+ if args.len() < 4 {
54
+ eprintln!("Usage: phrasekit_mine <corpus.jsonl> <config.json> <output.jsonl>");
55
+ eprintln!("\nExample:");
56
+ eprintln!(" phrasekit_mine corpus.jsonl mine_config.json candidate_phrases.jsonl");
57
+ std::process::exit(1);
58
+ }
59
+
60
+ let corpus_path = &args[1];
61
+ let config_path = &args[2];
62
+ let output_path = &args[3];
63
+
64
+ println!("šŸ” PhraseKit N-gram Miner");
65
+ println!("════════════════════════════════════════");
66
+ println!("Corpus: {}", corpus_path);
67
+ println!("Config: {}", config_path);
68
+ println!("Output: {}", output_path);
69
+ println!();
70
+
71
+ // Load config
72
+ let config = load_config(config_path)?;
73
+ println!("āœ“ Loaded config:");
74
+ println!(" min_n: {}", config.min_n);
75
+ println!(" max_n: {}", config.max_n);
76
+ println!(" min_count: {}", config.min_count);
77
+
78
+ if config.min_n < 1 || config.max_n > 10 || config.min_n > config.max_n {
79
+ return Err("Invalid config: min_n must be >= 1, max_n must be <= 10, and min_n <= max_n".into());
80
+ }
81
+
82
+ // Mine n-grams
83
+ println!("\nšŸ“Š Mining n-grams...");
84
+ let (ngram_counts, mut stats) = mine_ngrams(corpus_path, &config)?;
85
+
86
+ // Write results
87
+ println!("\nšŸ’¾ Writing results...");
88
+ stats.ngrams_after_filter = write_ngrams(output_path, ngram_counts, config.min_count)?;
89
+
90
+ // Summary
91
+ println!("\nāœ… Mining complete!");
92
+ println!("\nšŸ“ˆ Statistics:");
93
+ println!(" Total documents: {}", stats.total_docs);
94
+ println!(" Total tokens: {}", stats.total_tokens);
95
+ println!(" N-grams extracted: {}", stats.total_ngrams_extracted);
96
+ println!(" Unique n-grams: {}", stats.unique_ngrams);
97
+ println!(" After min_count={}: {}", config.min_count, stats.ngrams_after_filter);
98
+ println!("\nšŸ’” Next step: Run salience scoring on {}", output_path);
99
+
100
+ Ok(())
101
+ }
102
+
103
+ fn load_config(path: &str) -> Result<MineConfig, Box<dyn std::error::Error>> {
104
+ let file = File::open(path)?;
105
+ let config: MineConfig = serde_json::from_reader(file)?;
106
+ Ok(config)
107
+ }
108
+
109
+ fn mine_ngrams(
110
+ corpus_path: &str,
111
+ config: &MineConfig,
112
+ ) -> Result<(HashMap<Vec<String>, u32>, MiningStats), Box<dyn std::error::Error>> {
113
+ let file = File::open(corpus_path)?;
114
+ let reader = BufReader::new(file);
115
+
116
+ let mut ngram_counts: HashMap<Vec<String>, u32> = HashMap::new();
117
+ let mut stats = MiningStats {
118
+ total_docs: 0,
119
+ total_tokens: 0,
120
+ total_ngrams_extracted: 0,
121
+ unique_ngrams: 0,
122
+ ngrams_after_filter: 0,
123
+ };
124
+
125
+ for (line_num, line) in reader.lines().enumerate() {
126
+ let line = line?;
127
+
128
+ if line.trim().is_empty() {
129
+ continue;
130
+ }
131
+
132
+ let doc: Document = match serde_json::from_str(&line) {
133
+ Ok(d) => d,
134
+ Err(e) => {
135
+ eprintln!("āš ļø Line {}: Failed to parse: {}", line_num + 1, e);
136
+ continue;
137
+ }
138
+ };
139
+
140
+ stats.total_docs += 1;
141
+ stats.total_tokens += doc.tokens.len();
142
+
143
+ // Extract n-grams from document
144
+ for n in config.min_n..=config.max_n {
145
+ if doc.tokens.len() < n {
146
+ continue;
147
+ }
148
+
149
+ for i in 0..=(doc.tokens.len() - n) {
150
+ let ngram: Vec<String> = doc.tokens[i..i + n]
151
+ .iter()
152
+ .map(|t| t.to_lowercase())
153
+ .collect();
154
+
155
+ *ngram_counts.entry(ngram).or_insert(0) += 1;
156
+ stats.total_ngrams_extracted += 1;
157
+ }
158
+ }
159
+
160
+ if stats.total_docs % 10000 == 0 {
161
+ println!(" Processed {} documents...", stats.total_docs);
162
+ }
163
+ }
164
+
165
+ stats.unique_ngrams = ngram_counts.len();
166
+ println!(" āœ“ Processed {} documents", stats.total_docs);
167
+ println!(" āœ“ Extracted {} unique n-grams", stats.unique_ngrams);
168
+
169
+ Ok((ngram_counts, stats))
170
+ }
171
+
172
+ fn write_ngrams(
173
+ output_path: &str,
174
+ ngram_counts: HashMap<Vec<String>, u32>,
175
+ min_count: u32,
176
+ ) -> Result<usize, Box<dyn std::error::Error>> {
177
+ let file = File::create(output_path)?;
178
+ let mut writer = BufWriter::new(file);
179
+
180
+ // Sort by count (descending) for better readability
181
+ let mut ngrams: Vec<(Vec<String>, u32)> = ngram_counts
182
+ .into_iter()
183
+ .filter(|(_, count)| *count >= min_count)
184
+ .collect();
185
+
186
+ ngrams.sort_by(|a, b| b.1.cmp(&a.1));
187
+
188
+ let count = ngrams.len();
189
+ for (tokens, count) in ngrams {
190
+ let ngram = Ngram { tokens, count };
191
+ let json = serde_json::to_string(&ngram)?;
192
+ writeln!(writer, "{}", json)?;
193
+ }
194
+
195
+ writer.flush()?;
196
+ println!(" āœ“ Wrote {} n-grams to {}", count, output_path);
197
+
198
+ Ok(count)
199
+ }
@@ -0,0 +1,298 @@
1
+ use serde::{Deserialize, Serialize};
2
+ use std::collections::HashMap;
3
+ use std::fs::File;
4
+ use std::io::{BufRead, BufReader, BufWriter, Write};
5
+
6
+ #[derive(Debug, Deserialize)]
7
+ struct InputNgram {
8
+ tokens: Vec<String>,
9
+ count: u32,
10
+ }
11
+
12
+ #[derive(Debug, Deserialize)]
13
+ struct ScoreConfig {
14
+ #[serde(default = "default_method")]
15
+ method: String,
16
+ #[serde(default = "default_min_salience")]
17
+ min_salience: f32,
18
+ #[serde(default = "default_min_domain_count")]
19
+ min_domain_count: u32,
20
+ #[serde(default = "default_assign_phrase_ids")]
21
+ assign_phrase_ids: bool,
22
+ #[serde(default = "default_starting_phrase_id")]
23
+ starting_phrase_id: u32,
24
+ }
25
+
26
/// Fallback for `ScoreConfig::method` when the config file omits it.
fn default_method() -> String {
    String::from("ratio")
}
29
+
30
/// Fallback for `ScoreConfig::min_salience` when the config file omits it.
fn default_min_salience() -> f32 {
    const FALLBACK_MIN_SALIENCE: f32 = 2.0;
    FALLBACK_MIN_SALIENCE
}
33
+
34
/// Fallback for `ScoreConfig::min_domain_count` when the config file omits it.
fn default_min_domain_count() -> u32 {
    const FALLBACK_MIN_DOMAIN_COUNT: u32 = 10;
    FALLBACK_MIN_DOMAIN_COUNT
}
37
+
38
/// Fallback for `ScoreConfig::assign_phrase_ids` when the config file omits it.
fn default_assign_phrase_ids() -> bool {
    const FALLBACK_ASSIGN_IDS: bool = true;
    FALLBACK_ASSIGN_IDS
}
41
+
42
/// Fallback for `ScoreConfig::starting_phrase_id` when the config file omits it.
fn default_starting_phrase_id() -> u32 {
    const FALLBACK_FIRST_ID: u32 = 1000;
    FALLBACK_FIRST_ID
}
45
+
46
+ #[derive(Debug, Serialize, Deserialize)]
47
+ struct OutputPhrase {
48
+ tokens: Vec<String>,
49
+ salience: f32,
50
+ #[serde(skip_serializing_if = "Option::is_none")]
51
+ phrase_id: Option<u32>,
52
+ domain_count: u32,
53
+ background_count: u32,
54
+ }
55
+
56
/// Counters collected during a scoring run and printed in the final summary.
#[derive(Debug)]
struct ScoringStats {
    /// Number of distinct phrases loaded from the domain file.
    domain_phrases: usize,

    /// Number of distinct phrases loaded from the background file.
    background_phrases: usize,

    /// Domain phrases left after the `min_domain_count` filter.
    after_domain_filter: usize,

    /// Phrases left after the `min_salience` filter; these are the ones written.
    after_salience_filter: usize,
}
63
+
64
+ fn main() -> Result<(), Box<dyn std::error::Error>> {
65
+ let args: Vec<String> = std::env::args().collect();
66
+
67
+ if args.len() < 5 {
68
+ eprintln!("Usage: phrasekit_score <domain.jsonl> <background.jsonl> <config.json> <output.jsonl>");
69
+ eprintln!("\nExample:");
70
+ eprintln!(" phrasekit_score candidate_phrases.jsonl background_phrases.jsonl score_config.json phrases.jsonl");
71
+ std::process::exit(1);
72
+ }
73
+
74
+ let domain_path = &args[1];
75
+ let background_path = &args[2];
76
+ let config_path = &args[3];
77
+ let output_path = &args[4];
78
+
79
+ println!("šŸŽÆ PhraseKit Salience Scoring");
80
+ println!("════════════════════════════════════════");
81
+ println!("Domain: {}", domain_path);
82
+ println!("Background: {}", background_path);
83
+ println!("Config: {}", config_path);
84
+ println!("Output: {}", output_path);
85
+ println!();
86
+
87
+ // Load config
88
+ let config = load_config(config_path)?;
89
+ println!("āœ“ Loaded config:");
90
+ println!(" method: {}", config.method);
91
+ println!(" min_salience: {}", config.min_salience);
92
+ println!(" min_domain_count: {}", config.min_domain_count);
93
+
94
+ // Validate method
95
+ if !["ratio", "pmi", "tfidf"].contains(&config.method.as_str()) {
96
+ return Err(format!("Invalid method: {}. Must be 'ratio', 'pmi', or 'tfidf'", config.method).into());
97
+ }
98
+
99
+ // Load phrases
100
+ println!("\nšŸ“Š Loading phrases...");
101
+ let domain_phrases = load_phrases(domain_path)?;
102
+ println!(" āœ“ Loaded {} domain phrases", domain_phrases.len());
103
+
104
+ let background_phrases = load_phrases(background_path)?;
105
+ println!(" āœ“ Loaded {} background phrases", background_phrases.len());
106
+
107
+ // Score and filter
108
+ println!("\nšŸŽÆ Scoring...");
109
+ let (scored_phrases, stats) = score_phrases(domain_phrases, background_phrases, &config)?;
110
+
111
+ // Write output
112
+ println!("\nšŸ’¾ Writing results...");
113
+ write_phrases(output_path, scored_phrases, &config)?;
114
+
115
+ // Summary
116
+ println!("\nāœ… Scoring complete!");
117
+ println!("\nšŸ“ˆ Statistics:");
118
+ println!(" Domain phrases: {}", stats.domain_phrases);
119
+ println!(" Background phrases: {}", stats.background_phrases);
120
+ println!(" After domain filter: {}", stats.after_domain_filter);
121
+ println!(" After salience filter: {}", stats.after_salience_filter);
122
+
123
+ if config.assign_phrase_ids && stats.after_salience_filter > 0 {
124
+ let end_id = config.starting_phrase_id + stats.after_salience_filter as u32 - 1;
125
+ println!(" Phrase IDs assigned: {} - {}", config.starting_phrase_id, end_id);
126
+ }
127
+
128
+ println!("\nšŸ’” Next step: Build matching artifacts with phrasekit_build");
129
+
130
+ Ok(())
131
+ }
132
+
133
+ fn load_config(path: &str) -> Result<ScoreConfig, Box<dyn std::error::Error>> {
134
+ let file = File::open(path)?;
135
+ let config: ScoreConfig = serde_json::from_reader(file)?;
136
+ Ok(config)
137
+ }
138
+
139
+ fn load_phrases(path: &str) -> Result<HashMap<Vec<String>, u32>, Box<dyn std::error::Error>> {
140
+ let file = File::open(path)?;
141
+ let reader = BufReader::new(file);
142
+ let mut phrases = HashMap::new();
143
+
144
+ for (line_num, line) in reader.lines().enumerate() {
145
+ let line = line?;
146
+ if line.trim().is_empty() {
147
+ continue;
148
+ }
149
+
150
+ let ngram: InputNgram = match serde_json::from_str(&line) {
151
+ Ok(n) => n,
152
+ Err(e) => {
153
+ eprintln!("āš ļø Line {}: Failed to parse: {}", line_num + 1, e);
154
+ continue;
155
+ }
156
+ };
157
+
158
+ // Normalize to lowercase
159
+ let tokens: Vec<String> = ngram.tokens.iter().map(|t| t.to_lowercase()).collect();
160
+ phrases.insert(tokens, ngram.count);
161
+ }
162
+
163
+ Ok(phrases)
164
+ }
165
+
166
+ fn score_phrases(
167
+ domain_phrases: HashMap<Vec<String>, u32>,
168
+ background_phrases: HashMap<Vec<String>, u32>,
169
+ config: &ScoreConfig,
170
+ ) -> Result<(Vec<OutputPhrase>, ScoringStats), Box<dyn std::error::Error>> {
171
+ let mut scored = Vec::new();
172
+ let mut stats = ScoringStats {
173
+ domain_phrases: domain_phrases.len(),
174
+ background_phrases: background_phrases.len(),
175
+ after_domain_filter: 0,
176
+ after_salience_filter: 0,
177
+ };
178
+
179
+ // Compute total counts for PMI
180
+ let total_domain: u64 = domain_phrases.values().map(|&c| c as u64).sum();
181
+ let total_background: u64 = background_phrases.values().map(|&c| c as u64).sum();
182
+
183
+ for (tokens, domain_count) in domain_phrases {
184
+ // Filter by minimum domain count
185
+ if domain_count < config.min_domain_count {
186
+ continue;
187
+ }
188
+ stats.after_domain_filter += 1;
189
+
190
+ // Get background count (default to 0 if not found)
191
+ let background_count = background_phrases.get(&tokens).copied().unwrap_or(0);
192
+
193
+ // Compute salience based on method
194
+ let salience = match config.method.as_str() {
195
+ "ratio" => compute_ratio_salience(domain_count, background_count),
196
+ "pmi" => compute_pmi_salience(
197
+ domain_count,
198
+ background_count,
199
+ total_domain,
200
+ total_background,
201
+ ),
202
+ "tfidf" => compute_tfidf_salience(domain_count, background_count, total_domain),
203
+ _ => unreachable!(),
204
+ };
205
+
206
+ // Filter by minimum salience
207
+ if salience < config.min_salience {
208
+ continue;
209
+ }
210
+ stats.after_salience_filter += 1;
211
+
212
+ scored.push(OutputPhrase {
213
+ tokens,
214
+ salience,
215
+ phrase_id: None, // Will be assigned later if needed
216
+ domain_count,
217
+ background_count,
218
+ });
219
+ }
220
+
221
+ // Sort by salience (descending)
222
+ scored.sort_by(|a, b| b.salience.partial_cmp(&a.salience).unwrap());
223
+
224
+ Ok((scored, stats))
225
+ }
226
+
227
/// Ratio salience: `domain_count / (background_count + 1)`.
///
/// The `+ 1` keeps the denominator non-zero for phrases that never occur in
/// the background. The increment is done in `u64` so that a background count
/// of `u32::MAX` cannot overflow.
fn compute_ratio_salience(domain_count: u32, background_count: u32) -> f32 {
    let smoothed_background = u64::from(background_count) + 1;
    domain_count as f32 / smoothed_background as f32
}
230
+
231
/// PMI-style salience: `log2` of the phrase's share of the domain total
/// divided by its share of the background total.
///
/// A phrase that never occurs in the background gets a fixed score of 10.0
/// instead of a division by zero.
fn compute_pmi_salience(
    domain_count: u32,
    background_count: u32,
    total_domain: u64,
    total_background: u64,
) -> f32 {
    // Score given to phrases that are absent from the background.
    const ABSENT_FROM_BACKGROUND: f32 = 10.0;

    match background_count {
        0 => ABSENT_FROM_BACKGROUND,
        seen => {
            let domain_share = f64::from(domain_count) / total_domain as f64;
            let background_share = f64::from(seen) / total_background as f64;
            (domain_share / background_share).log2() as f32
        }
    }
}
247
+
248
/// TF-IDF-style salience: `tf * idf`, where
/// `tf = domain_count / total_domain` and
/// `idf = ln((total_domain + 1) / (background_count + 1))`.
///
/// Returns 0.0 when `total_domain` is 0, where the term frequency is
/// undefined and the plain formula would produce NaN. The arithmetic is done
/// in `f64`, which also keeps `background_count + 1` from overflowing at
/// `u32::MAX`.
fn compute_tfidf_salience(domain_count: u32, background_count: u32, total_domain: u64) -> f32 {
    if total_domain == 0 {
        return 0.0;
    }

    let total = total_domain as f64;
    let tf = f64::from(domain_count) / total;
    let idf = ((total + 1.0) / (f64::from(background_count) + 1.0)).ln();
    (tf * idf) as f32
}
253
+
254
+ fn write_phrases(
255
+ output_path: &str,
256
+ mut phrases: Vec<OutputPhrase>,
257
+ config: &ScoreConfig,
258
+ ) -> Result<(), Box<dyn std::error::Error>> {
259
+ let file = File::create(output_path)?;
260
+ let mut writer = BufWriter::new(file);
261
+
262
+ // Assign phrase IDs if requested
263
+ if config.assign_phrase_ids {
264
+ for (i, phrase) in phrases.iter_mut().enumerate() {
265
+ phrase.phrase_id = Some(config.starting_phrase_id + i as u32);
266
+ }
267
+ }
268
+
269
+ let count = phrases.len();
270
+ for phrase in phrases {
271
+ let json = serde_json::to_string(&phrase)?;
272
+ writeln!(writer, "{}", json)?;
273
+ }
274
+
275
+ writer.flush()?;
276
+ println!(" āœ“ Wrote {} phrases to {}", count, output_path);
277
+
278
+ // Print top 10 phrases
279
+ if count > 0 {
280
+ println!("\nšŸ† Top phrases by salience:");
281
+ let output_file = File::open(output_path)?;
282
+ let reader = BufReader::new(output_file);
283
+ for (i, line) in reader.lines().enumerate().take(10) {
284
+ let line = line?;
285
+ let phrase: OutputPhrase = serde_json::from_str(&line)?;
286
+ println!(
287
+ " {}. {} → salience={:.2}, domain={}, background={}",
288
+ i + 1,
289
+ phrase.tokens.join(" "),
290
+ phrase.salience,
291
+ phrase.domain_count,
292
+ phrase.background_count
293
+ );
294
+ }
295
+ }
296
+
297
+ Ok(())
298
+ }