@sjcrh/proteinpaint-rust 2.129.2 → 2.129.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +3 -1
- package/package.json +1 -1
- package/src/cerno.rs +181 -115
- package/src/gdcGRIN2.rs +402 -133
- package/src/stats_functions.rs +91 -17
- package/src/test_cerno.rs +131 -0
- package/src/wilcoxon.rs +5 -16
package/src/gdcGRIN2.rs
CHANGED
@@ -13,7 +13,7 @@
 Output mutations as JSON array.
 
 Example of usage:
-echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}},"mafOptions": {"minTotalDepth": …
+echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000}}' | ./target/release/gdcGRIN2
 */
 
 use flate2::read::GzDecoder;
@@ -40,21 +40,6 @@ struct ErrorEntry {
     attempts_made: u32,
 }
 
-// Struct for the final output that includes both successful data and errors
-#[derive(serde::Serialize)]
-struct GdcOutput {
-    successful_data: Vec<Vec<Vec<String>>>, // Array of successful file data arrays
-    failed_files: Vec<ErrorEntry>,
-    summary: OutputSummary,
-}
-
-#[derive(serde::Serialize)]
-struct OutputSummary {
-    total_files: usize,
-    successful_files: usize,
-    failed_files: usize,
-}
-
 // Define the structure for datadd
 #[derive(Deserialize, Debug)]
 struct DataType {
@@ -69,6 +54,39 @@ struct MafOptions {
     min_total_depth: i32,
     #[serde(rename = "minAltAlleleCount")]
     min_alt_allele_count: i32,
+    consequences: Option<Vec<String>>, // Optional list of consequences to filter MAF files
+}
+
+// Define the structure for cnvOptions
+#[derive(Deserialize, Debug)]
+struct CnvOptions {
+    #[serde(rename = "lossThreshold")]
+    loss_threshold: f32,
+    #[serde(rename = "gainThreshold")]
+    gain_threshold: f32,
+    #[serde(rename = "segLength")]
+    seg_length: i32,
+}
+
+// Individual successful file output (JSONL format)
+#[derive(serde::Serialize)]
+struct SuccessfulFileOutput {
+    #[serde(rename = "type")]
+    output_type: String, // Always "data"
+    case_id: String,
+    data_type: String,
+    data: Vec<Vec<String>>,
+}
+
+// Final summary output (JSONL format)
+#[derive(serde::Serialize)]
+struct FinalSummary {
+    #[serde(rename = "type")]
+    output_type: String, // Always "summary"
+    total_files: usize,
+    successful_files: usize,
+    failed_files: usize,
+    errors: Vec<ErrorEntry>,
 }
 
 // Define the top-level input structure
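Together these two `Serialize` structs define the tool's JSONL output protocol: one `data` line per successfully parsed file, followed by exactly one `summary` line. The sketch below shows the resulting line shapes with made-up values; it uses `serde_json::json!` rather than the structs themselves, and the `errors` array is left empty because `ErrorEntry`'s full field list is not visible in this diff.

```rust
use serde_json::json;

fn main() {
    // One "data" line per successfully parsed file (shape of SuccessfulFileOutput;
    // the row contents here are hypothetical).
    let data_line = json!({
        "type": "data",
        "case_id": "MP2PRT-PATFJE",
        "data_type": "maf",
        "data": [["MP2PRT-PATFJE", "chr1", "114713908", "114713909", "mutation"]]
    });

    // Exactly one "summary" line, emitted last (shape of FinalSummary).
    let summary_line = json!({
        "type": "summary",
        "total_files": 2,
        "successful_files": 1,
        "failed_files": 1,
        "errors": []
    });

    println!("{data_line}");
    println!("{summary_line}");
}
```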
@@ -78,104 +96,114 @@ struct InputData {
     case_files: HashMap<String, DataType>,
     #[serde(rename = "mafOptions")]
     maf_options: Option<MafOptions>,
+    #[serde(rename = "cnvOptions")]
+    cnv_options: Option<CnvOptions>,
+}
+
+// Configuration for different data types
+#[derive(Deserialize, Debug)]
+struct DataTypeConfig {
+    header_marker: &'static str,
+    output_columns: Vec<&'static str>,
+}
+
+// Function to check if CNV file has Segment_Mean column
+fn has_segment_mean_column(content: &str) -> bool {
+    for line in content.lines() {
+        // Check if this line contains Segment_Mean (likely the header)
+        if line.contains("Segment_Mean") {
+            return true;
+        }
+        // Stop checking after a few non-comment lines to avoid parsing entire file
+        if !line.trim().is_empty() {
+            break;
+        }
+    }
+    false
 }
 
 // Function to parse TSV content
+// Updated parse_content function with better consequence filtering
 fn parse_content(
     content: &str,
     case_id: &str,
     data_type: &str,
     min_total_depth: i32,
     min_alt_allele_count: i32,
+    consequences: &Option<Vec<String>>,
+    gain_threshold: f32,
+    loss_threshold: f32,
+    seg_length: i32,
 ) -> Result<Vec<Vec<String>>, (String, String, String)> {
+    // Early filter for CNV files - only process files with Segment_Mean
+    if data_type == "cnv" && !has_segment_mean_column(content) {
+        return Ok(Vec::new()); // Return empty result, no error
+    }
+
+    let config = match data_type {
+        "cnv" => DataTypeConfig {
+            header_marker: "Segment_Mean",
+            output_columns: vec!["Chromosome", "Start", "End", "Segment_Mean"],
+        },
+        "maf" => DataTypeConfig {
+            header_marker: "Hugo_Symbol",
+            output_columns: vec!["Chromosome", "Start_Position", "End_Position", "t_depth", "t_alt_count"],
+        },
+        _ => {
+            return Err((
+                case_id.to_string(),
+                data_type.to_string(),
+                "Invalid data type".to_string(),
+            ));
+        }
+    };
+
     let lines = content.lines();
     let mut parsed_data = Vec::new();
     let mut columns_indices: Vec<usize> = Vec::new();
-    let mut header_mk: &str = "";
-    let mut columns = Vec::new();
-
-    if data_type == "cnv" {
-        header_mk = "GDC_Aliquot_ID";
-        columns = vec!["Chromosome", "Start", "End", "Segment_Mean"]
-    } else if data_type == "maf" {
-        header_mk = "Hugo_Symbol";
-        columns = vec!["Chromosome", "Start_Position", "End_Position", "t_depth", "t_alt_count"]
-    };
+    let mut variant_classification_index: Option<usize> = None;
+    //let mut header_mk: &str = "";
+    //let mut columns = Vec::new();
 
     let mut header: Vec<String> = Vec::new();
 
     for line in lines {
         if line.starts_with("#") {
             continue;
-        }
+        };
+        if line.contains(config.header_marker) {
             header = line.split("\t").map(|s| s.to_string()).collect();
-            … [nine removed lines not rendered in the published diff]
-            }
-        }
-        } else {
-            let mut keep_ck: bool = true;
-            let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
-            let mut out_lst: Vec<String> = Vec::new();
-            out_lst.push(case_id.to_string());
-
-            for x in columns_indices.iter() {
-                let mut element = cont_lst[*x].to_string();
-
-                if data_type == "cnv" && &header[*x] == "Segment_Mean" {
-                    let seg_mean = match element.parse::<f32>() {
-                        Ok(val) => val,
-                        Err(_e) => {
-                            let error_msg = "Segment_Mean in cnv file is not float".to_string();
-                            return Err((case_id.to_string(), data_type.to_string(), error_msg));
-                        }
-                    };
-                    if seg_mean >= 0.3 {
-                        element = "gain".to_string();
-                    } else if seg_mean <= -0.4 {
-                        element = "loss".to_string();
-                    } else {
-                        keep_ck = false;
-                    }
-                }
-                out_lst.push(element);
-            }
-
-            if data_type == "maf" {
-                let alle_depth = match out_lst[4].parse::<i32>() {
-                    Ok(value) => value,
-                    Err(_) => {
-                        let error_msg = "Failed to convert t_depth to i32.".to_string();
-                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
-                    }
-                };
-                let alt_count = match out_lst[5].parse::<i32>() {
-                    Ok(value) => value,
-                    Err(_) => {
-                        let error_msg = "Failed to convert t_alt_count to i32.".to_string();
-                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
-                    }
-                };
-
-                if alle_depth >= min_total_depth && alt_count >= min_alt_allele_count {
-                    out_lst = out_lst[0..4].to_vec();
-                    out_lst.push("mutation".to_string());
-                } else {
-                    keep_ck = false;
-                }
+            if let Err(err) = setup_columns(
+                &header,
+                &config,
+                &mut columns_indices,
+                &mut variant_classification_index,
+                case_id,
+                data_type,
+            ) {
+                return Err(err);
             }
+            continue;
+        };
 
-        … [four removed lines not rendered in the published diff]
+        let row = process_row(
+            line,
+            case_id,
+            data_type,
+            &header,
+            &columns_indices,
+            variant_classification_index,
+            consequences,
+            min_total_depth,
+            min_alt_allele_count,
+            gain_threshold,
+            loss_threshold,
+            seg_length,
+        )?;
+
+        if let Some(out_lst) = row {
+            parsed_data.push(out_lst);
+        };
     }
 
     if columns_indices.is_empty() {
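The early CNV filter introduced above silently skips any CNV file that lacks a `Segment_Mean` column instead of raising an error. Below is a standalone sketch of that check with made-up TSV headers; because the loop breaks at the first non-empty line, only lines up to and including the header row are ever inspected.

```rust
// Copy of has_segment_mean_column from the hunk above, for illustration only.
fn has_segment_mean_column(content: &str) -> bool {
    for line in content.lines() {
        if line.contains("Segment_Mean") {
            return true;
        }
        // First non-empty line without the marker ends the scan.
        if !line.trim().is_empty() {
            break;
        }
    }
    false
}

fn main() {
    let with_col = "GDC_Aliquot\tChromosome\tStart\tEnd\tSegment_Mean\nX\tchr1\t1\t100\t0.5";
    let without_col = "GDC_Aliquot\tChromosome\tStart\tEnd\tCopy_Number\nX\tchr1\t1\t100\t2";
    assert!(has_segment_mean_column(with_col));
    assert!(!has_segment_mean_column(without_col));
}
```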
@@ -189,6 +217,204 @@ fn parse_content(
     Ok(parsed_data)
 }
 
+// Set up column indices for processing
+fn setup_columns(
+    header: &[String],
+    config: &DataTypeConfig,
+    columns_indices: &mut Vec<usize>,
+    variant_classification_index: &mut Option<usize>,
+    case_id: &str,
+    data_type: &str,
+) -> Result<(), (String, String, String)> {
+    for col in &config.output_columns {
+        match header.iter().position(|x| x == col) {
+            Some(index) => columns_indices.push(index),
+            None => {
+                return Err((
+                    case_id.to_string(),
+                    data_type.to_string(),
+                    format!("Column {} was not found", col),
+                ));
+            }
+        }
+    }
+
+    if data_type == "maf" {
+        *variant_classification_index = header.iter().position(|x| x == "Variant_Classification");
+        if variant_classification_index.is_none() {
+            return Err((
+                case_id.to_string(),
+                data_type.to_string(),
+                "Column Variant_Classification was not found".to_string(),
+            ));
+        }
+    }
+
+    Ok(())
+}
+
+// Process a single row of data
+fn process_row(
+    line: &str,
+    case_id: &str,
+    data_type: &str,
+    header: &[String],
+    columns_indices: &[usize],
+    variant_classification_index: Option<usize>,
+    consequences: &Option<Vec<String>>,
+    min_total_depth: i32,
+    min_alt_allele_count: i32,
+    gain_threshold: f32,
+    loss_threshold: f32,
+    seg_length: i32,
+) -> Result<Option<Vec<String>>, (String, String, String)> {
+    let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
+    let mut out_lst = vec![case_id.to_string()];
+
+    // Check consequence filtering for MAF files
+    if data_type == "maf" && !is_valid_consequence(&cont_lst, variant_classification_index, consequences) {
+        return Ok(None);
+    }
+
+    // Extract relevant columns
+    for &x in columns_indices {
+        if x >= cont_lst.len() {
+            return Ok(None); // Invalid row
+        }
+
+        let mut element = cont_lst[x].to_string();
+        if data_type == "cnv" && header[x] == "Segment_Mean" {
+            element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
+            if element.is_empty() {
+                return Ok(None);
+            }
+        }
+        out_lst.push(element);
+    }
+
+    // Additional MAF-specific processing
+    if data_type == "maf" {
+        if out_lst.len() < 6 {
+            return Ok(None); // Not enough columns
+        }
+
+        let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert t_depth to integer.".to_string(),
+            )
+        })?;
+
+        let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert t_alt_count to integer.".to_string(),
+            )
+        })?;
+
+        if alle_depth < min_total_depth || alt_count < min_alt_allele_count {
+            return Ok(None);
+        }
+
+        // Keep case_id, chr, start, end, and add "mutation"
+        out_lst = out_lst[0..4].to_vec();
+        out_lst.push("mutation".to_string());
+    }
+
+    // filter cnvs based on segment length. Default: 2000000
+    if data_type == "cnv" {
+        // calculate segment length (End_Position - Start_Position)
+        let end_position = out_lst[3].parse::<i32>().map_err(|_| {
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert End Position of cnv to integer.".to_string(),
+            )
+        })?;
+
+        let start_position = out_lst[2].parse::<i32>().map_err(|_| {
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert Start Position of cnv to integer.".to_string(),
+            )
+        })?;
+        let cnv_length = end_position - start_position;
+        if cnv_length > seg_length {
+            return Ok(None);
+        }
+    }
+
+    Ok(Some(out_lst))
+}
+
+// Check if the row meets consequence filtering criteria
+fn is_valid_consequence(
+    cont_lst: &[String],
+    variant_classification_index: Option<usize>,
+    consequences: &Option<Vec<String>>,
+) -> bool {
+    if let Some(consequence_filter) = consequences {
+        if !consequence_filter.is_empty() {
+            if let Some(var_class_idx) = variant_classification_index {
+                if var_class_idx < cont_lst.len() {
+                    let variant_classification = &cont_lst[var_class_idx];
+                    if let Some(normalized_consequence) = normalize_consequence(variant_classification) {
+                        return consequence_filter.contains(&normalized_consequence);
+                    }
+                }
+                return false; // Invalid row or unknown consequence
+            }
+        }
+    }
+    true // No filtering or empty filter
+}
+
+// Process Segment_Mean for CNV files
+fn process_segment_mean(
+    element: &str,
+    case_id: &str,
+    data_type: &str,
+    gain_threshold: f32,
+    loss_threshold: f32,
+) -> Result<String, (String, String, String)> {
+    let seg_mean = element.parse::<f32>().map_err(|_| {
+        (
+            case_id.to_string(),
+            data_type.to_string(),
+            "Segment_Mean in cnv file is not float".to_string(),
+        )
+    })?;
+
+    if seg_mean >= gain_threshold {
+        Ok("gain".to_string())
+    } else if seg_mean <= loss_threshold {
+        Ok("loss".to_string())
+    } else {
+        Ok(String::new())
+    }
+}
+
+/// Updated helper function to normalize MAF consequence types to frontend format
+/// Returns None for unknown consequence types (which will be filtered out)
+fn normalize_consequence(maf_consequence: &str) -> Option<String> {
+    match maf_consequence.to_lowercase().as_str() {
+        // Only map the consequence types we actually support
+        "missense_mutation" => Some("missense".to_string()),
+        "nonsense_mutation" | "stop_gained" | "stop_lost" => Some("nonsense".to_string()),
+        "frame_shift_del" | "frame_shift_ins" | "frameshift_variant" => Some("frameshift".to_string()),
+        "silent" | "synonymous_variant" => Some("silent".to_string()),
+        "in_frame_del" => Some("deletion".to_string()),
+        "in_frame_ins" => Some("insertion".to_string()),
+        "splice_site" | "splice_acceptor_variant" | "splice_donor_variant" => Some("splice_site".to_string()),
+        "tandem_duplication" | "duplication" => Some("duplication".to_string()),
+        "inversion" => Some("inversion".to_string()),
+        // Return None for all unknown consequence types - they will be filtered out
+        _ => None,
+    }
+}
 /// Downloads a single file with minimal retry logic for transient failures
 async fn download_single_file(
     case_id: String,
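`process_segment_mean` reduces each CNV segment to a categorical `gain`/`loss` call and signals neutral segments with an empty string, which `process_row` then drops. A self-contained sketch of the same decision rule, exercised with the default thresholds that `main` applies when `cnvOptions` is omitted (gain at >= 0.3, loss at <= -0.4):

```rust
// Mirrors the decision rule in process_segment_mean; None stands in for the
// empty string that makes process_row skip the segment.
fn classify(seg_mean: f32, gain_threshold: f32, loss_threshold: f32) -> Option<&'static str> {
    if seg_mean >= gain_threshold {
        Some("gain")
    } else if seg_mean <= loss_threshold {
        Some("loss")
    } else {
        None
    }
}

fn main() {
    for seg_mean in [0.9_f32, 0.1, -0.7] {
        println!("{seg_mean}: {:?}", classify(seg_mean, 0.3, -0.4));
    }
    // Prints: 0.9: Some("gain"), 0.1: None, -0.7: Some("loss")
}
```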
@@ -291,14 +517,19 @@ async fn download_single_file(
     ))
 }
 
-///
-async fn download_data(
+/// NEW: Phase 1 streaming download function
+/// Outputs JSONL format: one JSON object per line
+/// Node.js will read this line-by-line but still wait for completion
+async fn download_data_streaming(
     data4dl: HashMap<String, DataType>,
     host: &str,
     min_total_depth: i32,
     min_alt_allele_count: i32,
+    consequences: &Option<Vec<String>>,
+    gain_threshold: f32,
+    loss_threshold: f32,
+    seg_length: i32,
 ) {
-    // Generate URLs from data4dl, handling optional cnv and maf
     let data_urls: Vec<(String, String, String)> = data4dl
         .into_iter()
         .flat_map(|(case_id, data_types)| {
@@ -315,42 +546,63 @@ async fn download_data(
 
     let total_files = data_urls.len();
 
-    //
+    // Counters for final summary
     let successful_downloads = Arc::new(AtomicUsize::new(0));
     let failed_downloads = Arc::new(AtomicUsize::new(0));
 
-    //
-    let successful_data = Arc::new(Mutex::new(Vec::<Vec<Vec<String>>>::new()));
+    // Only collect errors (successful data is output immediately)
     let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
 
-    … [removed lines not rendered in the published diff]
-        }
-    }));
+    let download_futures = futures::stream::iter(
+        data_urls
+            .into_iter()
+            .map(|(case_id, data_type, url)| async move { download_single_file(case_id, data_type, url, 2).await }),
+    );
 
-    //
+    // Process downloads and output results immediately as JSONL
     download_futures
-        .buffer_unordered(…
+        .buffer_unordered(20) // Increased concurrency for better performance
         .for_each(|download_result| {
             let successful_downloads = Arc::clone(&successful_downloads);
             let failed_downloads = Arc::clone(&failed_downloads);
-            let successful_data = Arc::clone(&successful_data);
             let errors = Arc::clone(&errors);
 
             async move {
                 match download_result {
                     Ok((case_id, data_type, content)) => {
-                        //
-                        match parse_content(…
+                        // Try to parse the content
+                        match parse_content(
+                            &content,
+                            &case_id,
+                            &data_type,
+                            min_total_depth,
+                            min_alt_allele_count,
+                            &consequences,
+                            gain_threshold,
+                            loss_threshold,
+                            seg_length,
+                        ) {
                             Ok(parsed_data) => {
-                                //
-                                … [removed line not rendered in the published diff]
+                                // SUCCESS: Output immediately as JSONL
+                                let success_output = SuccessfulFileOutput {
+                                    output_type: "data".to_string(),
+                                    case_id: case_id.clone(),
+                                    data_type: data_type.clone(),
+                                    data: parsed_data,
+                                };
+
+                                // Output this successful result immediately - Node.js will see this in real-time
+                                if let Ok(json) = serde_json::to_string(&success_output) {
+                                    println!("{}", json); // IMMEDIATE output to stdout
+                                    // Force flush to ensure Node.js sees it immediately
+                                    use std::io::Write;
+                                    let _ = std::io::stdout().flush();
+                                }
+
                                 successful_downloads.fetch_add(1, Ordering::Relaxed);
                             }
                             Err((cid, dtp, error)) => {
+                                // Parsing failed - add to errors
                                 failed_downloads.fetch_add(1, Ordering::Relaxed);
                                 let error = ErrorEntry {
                                     case_id: cid,
@@ -364,9 +616,9 @@ async fn download_data(
                         }
                     }
                     Err((case_id, data_type, error_details, attempts)) => {
+                        // Download failed - add to errors
                        failed_downloads.fetch_add(1, Ordering::Relaxed);
 
-                        // Parse error type from error details
                         let (error_type, clean_details) = if error_details.contains(":") {
                             let parts: Vec<&str> = error_details.splitn(2, ": ").collect();
                             (parts[0].to_string(), parts[1].to_string())
@@ -388,27 +640,23 @@ async fn download_data(
         })
         .await;
 
-    //
+    // Output final summary as the last line
     let success_count = successful_downloads.load(Ordering::Relaxed);
     let failed_count = failed_downloads.load(Ordering::Relaxed);
 
-    let …
-    … [removed lines not rendered in the published diff]
-        failed_files: failed_count,
-    },
+    let summary = FinalSummary {
+        output_type: "summary".to_string(),
+        total_files,
+        successful_files: success_count,
+        failed_files: failed_count,
+        errors: errors.lock().await.clone(),
     };
 
-    // Output …
-    … [removed lines not rendered in the published diff]
-    std::process::exit(1);
-    }
+    // Output final summary - Node.js will know processing is complete when it sees this
+    if let Ok(json) = serde_json::to_string(&summary) {
+        println!("{}", json);
+        use std::io::Write;
+        let _ = std::io::stdout().flush();
     }
 }
 
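The comments in this function spell out the consumer contract: read stdout line by line, act on `data` records as they arrive, and treat the single `summary` line as end-of-run. The real consumer is Node.js and is not part of this diff; below is a hypothetical equivalent in Rust to illustrate the protocol.

```rust
use std::io::{self, BufRead};

fn main() -> io::Result<()> {
    for line in io::stdin().lock().lines() {
        let line = line?;
        // Each stdout line from gdcGRIN2 is a self-contained JSON object.
        let value: serde_json::Value = match serde_json::from_str(&line) {
            Ok(v) => v,
            Err(_) => continue,
        };
        match value["type"].as_str() {
            Some("data") => {
                let rows = value["data"].as_array().map_or(0, |rows| rows.len());
                eprintln!("case {} ({}): {rows} rows", value["case_id"], value["data_type"]);
            }
            Some("summary") => {
                eprintln!(
                    "done: {} of {} files succeeded",
                    value["successful_files"], value["total_files"]
                );
                break; // the summary is always the last line
            }
            _ => {}
        }
    }
    Ok(())
}
```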
@@ -455,13 +703,34 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let case_files = input_js.case_files;
 
     // Set default maf_options
-    let (min_total_depth, min_alt_allele_count) = match input_js.maf_options {
-        Some(options) => (…
-        … [removed line not rendered in the published diff]
+    let (min_total_depth, min_alt_allele_count, consequences) = match input_js.maf_options {
+        Some(options) => (
+            options.min_total_depth,
+            options.min_alt_allele_count,
+            options.consequences.clone(),
+        ),
+        None => (10, 2, None), // Default values
+    };
+
+    // Set default cnv_options
+    let (gain_threshold, loss_threshold, seg_length) = match input_js.cnv_options {
+        Some(options) => (options.gain_threshold, options.loss_threshold, options.seg_length),
+        None => (0.3, -0.4, 2000000), // Default values
     };
 
     // Download data - this will now handle errors gracefully
-    download_data(case_files, HOST, min_total_depth, min_alt_allele_count).await;
+    // download_data(case_files, HOST, min_total_depth, min_alt_allele_count, &consequences).await;
+    download_data_streaming(
+        case_files,
+        HOST,
+        min_total_depth,
+        min_alt_allele_count,
+        &consequences,
+        gain_threshold,
+        loss_threshold,
+        seg_length,
+    )
+    .await;
 
     // Always exit successfully - individual file failures are logged but don't stop the process
     Ok(())