@sjcrh/proteinpaint-rust 2.129.6-2b2fdc7ee.0 → 2.130.0
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- package/README.md +8 -3
- package/package.json +1 -1
- package/src/gdcGRIN2.rs +482 -136
- package/src/test_cerno.rs +102 -21185
- package/src/wilcoxon.rs +5 -16
package/src/gdcGRIN2.rs
CHANGED
```diff
@@ -13,18 +13,19 @@
 Output mutations as JSON array.
 
 Example of usage:
-echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}},"mafOptions": {"minTotalDepth":
+echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000}}' | ./target/release/gdcGRIN2
 */
 
 use flate2::read::GzDecoder;
 use futures::StreamExt;
 use memchr::memchr;
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};
 use serde_json;
 use std::collections::HashMap;
 use std::io::{self, Read};
 use std::sync::Arc;
 use std::sync::atomic::{AtomicUsize, Ordering};
+use std::thread::sleep;
 use std::time::Duration;
 use tokio::io::{AsyncReadExt, BufReader};
 use tokio::sync::Mutex;
```
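The usage line above feeds a single JSON object on stdin. As an illustration only (the crate's real `InputData` and `DataType` structs appear later in this diff), here is a minimal serde sketch of that input shape; the `Input` struct name and the loose `serde_json::Value` option types are stand-ins, and it assumes `serde` (with the `derive` feature) and `serde_json` as dependencies:

```rust
use serde::Deserialize;
use std::collections::HashMap;

// Hypothetical reduced stand-in: each case maps a data type ("maf" or
// "cnv") to a GDC file UUID, as in the echo example above.
#[derive(Deserialize, Debug)]
struct Input {
    #[serde(rename = "caseFiles")]
    case_files: HashMap<String, HashMap<String, String>>,
    #[serde(rename = "mafOptions")]
    maf_options: Option<serde_json::Value>,
    #[serde(rename = "cnvOptions")]
    cnv_options: Option<serde_json::Value>,
}

fn main() {
    let raw = r#"{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"}},
                  "mafOptions": {"minTotalDepth": 100, "minAltAlleleCount": 20}}"#;
    let input: Input = serde_json::from_str(raw).expect("valid input JSON");
    assert!(input.case_files.contains_key("MP2PRT-PATFJE"));
    assert!(input.cnv_options.is_none()); // cnvOptions is optional
}
```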
```diff
@@ -40,21 +41,6 @@ struct ErrorEntry {
     attempts_made: u32,
 }
 
-// Struct for the final output that includes both successful data and errors
-#[derive(serde::Serialize)]
-struct GdcOutput {
-    successful_data: Vec<Vec<Vec<String>>>, // Array of successful file data arrays
-    failed_files: Vec<ErrorEntry>,
-    summary: OutputSummary,
-}
-
-#[derive(serde::Serialize)]
-struct OutputSummary {
-    total_files: usize,
-    successful_files: usize,
-    failed_files: usize,
-}
-
 // Define the structure for datadd
 #[derive(Deserialize, Debug)]
 struct DataType {
@@ -69,6 +55,67 @@ struct MafOptions {
     min_total_depth: i32,
     #[serde(rename = "minAltAlleleCount")]
     min_alt_allele_count: i32,
+    consequences: Option<Vec<String>>, // Optional list of consequences to filter MAF files
+}
+
+// Define the structure for cnvOptions
+#[derive(Deserialize, Debug)]
+struct CnvOptions {
+    #[serde(rename = "lossThreshold")]
+    loss_threshold: f32,
+    #[serde(rename = "gainThreshold")]
+    gain_threshold: f32,
+    #[serde(rename = "segLength")]
+    seg_length: i32,
+}
+
+// Individual successful file output (JSONL format)
+#[derive(serde::Serialize)]
+struct SuccessfulFileOutput {
+    #[serde(rename = "type")]
+    output_type: String, // Always "data"
+    case_id: String,
+    data_type: String,
+    data: Vec<Vec<String>>,
+}
+
+// struct for MAF filter details
+#[derive(Clone, Serialize, Default)]
+struct FilteredMafDetails {
+    invalid_consequences: usize,
+    t_alt_count: usize,
+    t_depth: usize,
+    invalid_rows: usize,
+}
+
+// struct for CNV filter details
+#[derive(Clone, Serialize, Default)]
+struct FilteredCnvDetails {
+    segment_mean: usize,
+    seg_length: usize,
+    invalid_rows: usize,
+}
+
+// struct for per-case filter details
+#[derive(Clone, Serialize)]
+struct FilteredCaseDetails {
+    maf: FilteredMafDetails,
+    cnv: FilteredCnvDetails,
+}
+
+// Final summary output (JSONL format)
+#[derive(serde::Serialize)]
+struct FinalSummary {
+    #[serde(rename = "type")]
+    output_type: String, // Always "summary"
+    total_files: usize,
+    successful_files: usize,
+    failed_files: usize,
+    errors: Vec<ErrorEntry>,
+    filtered_records: usize,
+    filtered_maf_records: usize,
+    filtered_cnv_records: usize,
+    filtered_records_by_case: HashMap<String, FilteredCaseDetails>,
 }
 
 // Define the top-level input structure
```
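Both new output structs (`SuccessfulFileOutput` and `FinalSummary`) rename their `output_type` field to `type` via serde, so every JSONL line carries a discriminator a consumer can dispatch on. A minimal sketch of that tagging pattern, using a hypothetical reduced struct rather than the package's own:

```rust
use serde::Serialize;

// Reduced stand-in: only the renamed "type" tag and one payload field.
#[derive(Serialize)]
struct TaggedLine {
    #[serde(rename = "type")]
    output_type: String,
    case_id: String,
}

fn main() {
    let line = TaggedLine {
        output_type: "data".to_string(),
        case_id: "MP2PRT-PATFJE".to_string(),
    };
    // Prints: {"type":"data","case_id":"MP2PRT-PATFJE"}
    println!("{}", serde_json::to_string(&line).unwrap());
}
```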
```diff
@@ -78,104 +125,100 @@ struct InputData {
     case_files: HashMap<String, DataType>,
     #[serde(rename = "mafOptions")]
     maf_options: Option<MafOptions>,
+    #[serde(rename = "cnvOptions")]
+    cnv_options: Option<CnvOptions>,
+}
+
+// Configuration for different data types
+#[derive(Deserialize, Debug)]
+struct DataTypeConfig {
+    header_marker: &'static str,
+    output_columns: Vec<&'static str>,
 }
 
 // Function to parse TSV content
-fn parse_content(
+async fn parse_content(
     content: &str,
     case_id: &str,
     data_type: &str,
     min_total_depth: i32,
     min_alt_allele_count: i32,
+    consequences: &Option<Vec<String>>,
+    gain_threshold: f32,
+    loss_threshold: f32,
+    seg_length: i32,
+    filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
+    filtered_maf_records: &AtomicUsize,
+    filtered_cnv_records: &AtomicUsize,
 ) -> Result<Vec<Vec<String>>, (String, String, String)> {
+    let config = match data_type {
+        "cnv" => DataTypeConfig {
+            header_marker: "Segment_Mean",
+            output_columns: vec!["Chromosome", "Start", "End", "Segment_Mean"],
+        },
+        "maf" => DataTypeConfig {
+            header_marker: "Hugo_Symbol",
+            output_columns: vec!["Chromosome", "Start_Position", "End_Position", "t_depth", "t_alt_count"],
+        },
+        _ => {
+            return Err((
+                case_id.to_string(),
+                data_type.to_string(),
+                "Invalid data type".to_string(),
+            ));
+        }
+    };
+
     let lines = content.lines();
     let mut parsed_data = Vec::new();
     let mut columns_indices: Vec<usize> = Vec::new();
-    let mut header_mk: &str = "";
-    let mut columns = Vec::new();
-
-    if data_type == "cnv" {
-        header_mk = "GDC_Aliquot_ID";
-        columns = vec!["Chromosome", "Start", "End", "Segment_Mean"]
-    } else if data_type == "maf" {
-        header_mk = "Hugo_Symbol";
-        columns = vec!["Chromosome", "Start_Position", "End_Position", "t_depth", "t_alt_count"]
-    };
+    let mut variant_classification_index: Option<usize> = None;
+    //let mut header_mk: &str = "";
+    //let mut columns = Vec::new();
 
     let mut header: Vec<String> = Vec::new();
 
     for line in lines {
         if line.starts_with("#") {
             continue;
-        }
+        };
+        if line.contains(config.header_marker) {
             header = line.split("\t").map(|s| s.to_string()).collect();
-
-
-
-
-
-
-
-
-
-            }
-        }
-        } else {
-            let mut keep_ck: bool = true;
-            let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
-            let mut out_lst: Vec<String> = Vec::new();
-            out_lst.push(case_id.to_string());
-
-            for x in columns_indices.iter() {
-                let mut element = cont_lst[*x].to_string();
-
-                if data_type == "cnv" && &header[*x] == "Segment_Mean" {
-                    let seg_mean = match element.parse::<f32>() {
-                        Ok(val) => val,
-                        Err(_e) => {
-                            let error_msg = "Segment_Mean in cnv file is not float".to_string();
-                            return Err((case_id.to_string(), data_type.to_string(), error_msg));
-                        }
-                    };
-                    if seg_mean >= 0.3 {
-                        element = "gain".to_string();
-                    } else if seg_mean <= -0.4 {
-                        element = "loss".to_string();
-                    } else {
-                        keep_ck = false;
-                    }
-                }
-                out_lst.push(element);
-            }
-
-            if data_type == "maf" {
-                let alle_depth = match out_lst[4].parse::<i32>() {
-                    Ok(value) => value,
-                    Err(_) => {
-                        let error_msg = "Failed to convert t_depth to i32.".to_string();
-                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
-                    }
-                };
-                let alt_count = match out_lst[5].parse::<i32>() {
-                    Ok(value) => value,
-                    Err(_) => {
-                        let error_msg = "Failed to convert t_alt_count to i32.".to_string();
-                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
-                    }
-                };
-
-                if alle_depth >= min_total_depth && alt_count >= min_alt_allele_count {
-                    out_lst = out_lst[0..4].to_vec();
-                    out_lst.push("mutation".to_string());
-                } else {
-                    keep_ck = false;
-                }
+            if let Err(err) = setup_columns(
+                &header,
+                &config,
+                &mut columns_indices,
+                &mut variant_classification_index,
+                case_id,
+                data_type,
+            ) {
+                return Err(err);
             }
+            continue;
+        };
 
-
-
-
-
+        let row = process_row(
+            line,
+            case_id,
+            data_type,
+            &header,
+            &columns_indices,
+            variant_classification_index,
+            consequences,
+            min_total_depth,
+            min_alt_allele_count,
+            gain_threshold,
+            loss_threshold,
+            seg_length,
+            filtered_records,
+            filtered_maf_records,
+            filtered_cnv_records,
+        )
+        .await?;
+
+        if let Some(out_lst) = row {
+            parsed_data.push(out_lst);
+        };
     }
 
     if columns_indices.is_empty() {
```
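The restructured `parse_content` replaces the old mutable `header_mk`/`columns` variables with a `match` that evaluates directly to a per-type config, so an unrecognized data type becomes an explicit `Err` instead of leaving placeholder values behind. A standalone sketch of that dispatch pattern (names mirror the diff, but `config_for` is illustrative, not the package code):

```rust
// Same field names as the diff's DataTypeConfig.
struct DataTypeConfig {
    header_marker: &'static str,
    output_columns: Vec<&'static str>,
}

// Match-as-expression: every data type either yields a config or an error.
fn config_for(data_type: &str) -> Result<DataTypeConfig, String> {
    match data_type {
        "cnv" => Ok(DataTypeConfig {
            header_marker: "Segment_Mean",
            output_columns: vec!["Chromosome", "Start", "End", "Segment_Mean"],
        }),
        "maf" => Ok(DataTypeConfig {
            header_marker: "Hugo_Symbol",
            output_columns: vec!["Chromosome", "Start_Position", "End_Position", "t_depth", "t_alt_count"],
        }),
        other => Err(format!("Invalid data type: {}", other)),
    }
}

fn main() {
    assert_eq!(config_for("cnv").unwrap().header_marker, "Segment_Mean");
    assert_eq!(config_for("maf").unwrap().output_columns.len(), 5);
    assert!(config_for("vcf").is_err()); // unknown types fail loudly
}
```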
```diff
@@ -189,6 +232,248 @@ fn parse_content(
     Ok(parsed_data)
 }
 
+// Set up column indices for processing
+fn setup_columns(
+    header: &[String],
+    config: &DataTypeConfig,
+    columns_indices: &mut Vec<usize>,
+    variant_classification_index: &mut Option<usize>,
+    case_id: &str,
+    data_type: &str,
+) -> Result<(), (String, String, String)> {
+    for col in &config.output_columns {
+        match header.iter().position(|x| x == col) {
+            Some(index) => columns_indices.push(index),
+            None => {
+                return Err((
+                    case_id.to_string(),
+                    data_type.to_string(),
+                    format!("Column {} was not found", col),
+                ));
+            }
+        }
+    }
+
+    if data_type == "maf" {
+        *variant_classification_index = header.iter().position(|x| x == "Variant_Classification");
+        if variant_classification_index.is_none() {
+            return Err((
+                case_id.to_string(),
+                data_type.to_string(),
+                "Column Variant_Classification was not found".to_string(),
+            ));
+        }
+    }
+
+    Ok(())
+}
+
+// Process a single row of data
+async fn process_row(
+    line: &str,
+    case_id: &str,
+    data_type: &str,
+    header: &[String],
+    columns_indices: &[usize],
+    variant_classification_index: Option<usize>,
+    consequences: &Option<Vec<String>>,
+    min_total_depth: i32,
+    min_alt_allele_count: i32,
+    gain_threshold: f32,
+    loss_threshold: f32,
+    seg_length: i32,
+    filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
+    filtered_maf_records: &AtomicUsize,
+    filtered_cnv_records: &AtomicUsize,
+) -> Result<Option<Vec<String>>, (String, String, String)> {
+    let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
+    let mut out_lst = vec![case_id.to_string()];
+
+    // Initialize or update case details
+    let mut filtered_map = filtered_records.lock().await;
+    filtered_map
+        .entry(case_id.to_string())
+        .or_insert_with(|| FilteredCaseDetails {
+            maf: FilteredMafDetails::default(),
+            cnv: FilteredCnvDetails::default(),
+        });
+
+    let case_details = filtered_map.get_mut(case_id).unwrap();
+
+    // Check consequence filtering for MAF files
+    if data_type == "maf" && !is_valid_consequence(&cont_lst, variant_classification_index, consequences) {
+        case_details.maf.invalid_consequences += 1;
+        filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+        return Ok(None);
+    }
+
+    // Extract relevant columns
+    for &x in columns_indices {
+        if x >= cont_lst.len() {
+            if data_type == "maf" {
+                case_details.maf.invalid_rows += 1;
+                filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            } else if data_type == "cnv" {
+                case_details.cnv.invalid_rows += 1;
+                filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+            }
+            return Ok(None); // Invalid row
+        }
+
+        let mut element = cont_lst[x].to_string();
+        if data_type == "cnv" && header[x] == "Segment_Mean" {
+            element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
+            if element.is_empty() {
+                case_details.cnv.segment_mean += 1;
+                filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+                return Ok(None);
+            }
+        }
+        out_lst.push(element);
+    }
+
+    // Additional MAF-specific processing
+    if data_type == "maf" {
+        if out_lst.len() < 6 {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None); // Not enough columns
+        }
+
+        let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert t_depth to integer.".to_string(),
+            )
+        })?;
+
+        let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert t_alt_count to integer.".to_string(),
+            )
+        })?;
+
+        if alle_depth < min_total_depth {
+            case_details.maf.t_depth += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None);
+        }
+        if alt_count < min_alt_allele_count {
+            case_details.maf.t_alt_count += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None);
+        }
+
+        // Keep case_id, chr, start, end, and add "mutation"
+        out_lst = out_lst[0..4].to_vec();
+        out_lst.push("mutation".to_string());
+    }
+
+    // filter cnvs based on segment length. Default: 0 (no filtering)
+    if data_type == "cnv" {
+        // calculate segment length (End_Position - Start_Position)
+        let end_position = out_lst[3].parse::<i32>().map_err(|_| {
+            case_details.cnv.invalid_rows += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert End Position of cnv to integer.".to_string(),
+            )
+        })?;
+
+        let start_position = out_lst[2].parse::<i32>().map_err(|_| {
+            case_details.cnv.invalid_rows += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert Start Position of cnv to integer.".to_string(),
+            )
+        })?;
+        let cnv_length = end_position - start_position;
+        if seg_length > 0 && cnv_length > seg_length {
+            case_details.cnv.seg_length += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None);
+        }
+    }
+
+    Ok(Some(out_lst))
+}
+
+// Check if the row meets consequence filtering criteria
+fn is_valid_consequence(
+    cont_lst: &[String],
+    variant_classification_index: Option<usize>,
+    consequences: &Option<Vec<String>>,
+) -> bool {
+    if let Some(consequence_filter) = consequences {
+        if !consequence_filter.is_empty() {
+            if let Some(var_class_idx) = variant_classification_index {
+                if var_class_idx < cont_lst.len() {
+                    let variant_classification = &cont_lst[var_class_idx];
+                    if let Some(normalized_consequence) = normalize_consequence(variant_classification) {
+                        return consequence_filter.contains(&normalized_consequence);
+                    }
+                }
+                return false; // Invalid row or unknown consequence
+            }
+        }
+    }
+    true // No filtering or empty filter
+}
+
+// Process Segment_Mean for CNV files
+fn process_segment_mean(
+    element: &str,
+    case_id: &str,
+    data_type: &str,
+    gain_threshold: f32,
+    loss_threshold: f32,
+) -> Result<String, (String, String, String)> {
+    let seg_mean = element.parse::<f32>().map_err(|_| {
+        (
+            case_id.to_string(),
+            data_type.to_string(),
+            "Segment_Mean in cnv file is not float".to_string(),
+        )
+    })?;
+
+    if seg_mean >= gain_threshold {
+        Ok("gain".to_string())
+    } else if seg_mean <= loss_threshold {
+        Ok("loss".to_string())
+    } else {
+        Ok(String::new())
+    }
+}
+
+/// Updated helper function to normalize MAF consequence types to frontend format
+/// Returns None for unknown consequence types (which will be filtered out)
+fn normalize_consequence(maf_consequence: &str) -> Option<String> {
+    match maf_consequence.to_lowercase().as_str() {
+        // Only map the consequence types we actually support
+        "missense_mutation" => Some("missense".to_string()),
+        "nonsense_mutation" | "stop_gained" | "stop_lost" => Some("nonsense".to_string()),
+        "frame_shift_del" | "frame_shift_ins" | "frameshift_variant" => Some("frameshift".to_string()),
+        "silent" | "synonymous_variant" => Some("silent".to_string()),
+        "in_frame_del" => Some("deletion".to_string()),
+        "in_frame_ins" => Some("insertion".to_string()),
+        "splice_site" | "splice_acceptor_variant" | "splice_donor_variant" => Some("splice_site".to_string()),
+        "tandem_duplication" | "duplication" => Some("duplication".to_string()),
+        "inversion" => Some("inversion".to_string()),
+        // Return None for all unknown consequence types - they will be filtered out
+        _ => None,
+    }
+}
 /// Downloads a single file with minimal retry logic for transient failures
 async fn download_single_file(
     case_id: String,
```
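`process_segment_mean` and `normalize_consequence` are pure functions, which makes the new threshold and mapping behavior easy to check in isolation. An illustrative test sketch (not part of the package), assuming it sits in the same module as those helpers and using the 0.3/-0.4 default thresholds seen elsewhere in this diff; an empty-string return means "neither gain nor loss", which `process_row` counts as a filtered record:

```rust
#[cfg(test)]
mod sketch_tests {
    use super::*;

    #[test]
    fn segment_mean_classification() {
        // gain at or above the gain threshold, loss at or below the loss threshold
        assert_eq!(process_segment_mean("0.5", "c1", "cnv", 0.3, -0.4).unwrap(), "gain");
        assert_eq!(process_segment_mean("-0.9", "c1", "cnv", 0.3, -0.4).unwrap(), "loss");
        // values in between come back empty and are filtered by the caller
        assert_eq!(process_segment_mean("0.1", "c1", "cnv", 0.3, -0.4).unwrap(), "");
        // non-numeric Segment_Mean is a hard error
        assert!(process_segment_mean("abc", "c1", "cnv", 0.3, -0.4).is_err());
    }

    #[test]
    fn consequence_normalization() {
        // matching is case-insensitive via to_lowercase()
        assert_eq!(normalize_consequence("Missense_Mutation"), Some("missense".to_string()));
        assert_eq!(normalize_consequence("Frame_Shift_Del"), Some("frameshift".to_string()));
        // unknown consequence types are filtered out
        assert_eq!(normalize_consequence("Unknown_Type"), None);
    }
}
```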
```diff
@@ -291,14 +576,19 @@ async fn download_single_file(
     ))
 }
 
-///
-async fn download_data(
+/// Streaming download function
+/// Outputs JSONL format: one JSON object per line
+/// Node.js will read this line-by-line but still wait for completion
+async fn download_data_streaming(
     data4dl: HashMap<String, DataType>,
     host: &str,
     min_total_depth: i32,
     min_alt_allele_count: i32,
+    consequences: &Option<Vec<String>>,
+    gain_threshold: f32,
+    loss_threshold: f32,
+    seg_length: i32,
 ) {
-    // Generate URLs from data4dl, handling optional cnv and maf
     let data_urls: Vec<(String, String, String)> = data4dl
         .into_iter()
         .flat_map(|(case_id, data_types)| {
@@ -315,42 +605,76 @@ async fn download_data(
 
     let total_files = data_urls.len();
 
-    //
+    // Counters for final summary
     let successful_downloads = Arc::new(AtomicUsize::new(0));
     let failed_downloads = Arc::new(AtomicUsize::new(0));
+    let filtered_maf_records = Arc::new(AtomicUsize::new(0));
+    let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
+    let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
 
-    //
-    let successful_data = Arc::new(Mutex::new(Vec::<Vec<Vec<String>>>::new()));
+    // Only collect errors (successful data is output immediately)
     let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
 
-
-
-
-
-
-    }
-    }));
+    let download_futures = futures::stream::iter(
+        data_urls
+            .into_iter()
+            .map(|(case_id, data_type, url)| async move { download_single_file(case_id, data_type, url, 2).await }),
+    );
 
-    //
+    // Process downloads and output results immediately as JSONL
     download_futures
-        .buffer_unordered(
+        .buffer_unordered(20) // Increased concurrency for better performance
        .for_each(|download_result| {
            let successful_downloads = Arc::clone(&successful_downloads);
            let failed_downloads = Arc::clone(&failed_downloads);
-            let successful_data = Arc::clone(&successful_data);
+            let filtered_maf_records = Arc::clone(&filtered_maf_records);
+            let filtered_cnv_records = Arc::clone(&filtered_cnv_records);
+            let filtered_records = Arc::clone(&filtered_records);
             let errors = Arc::clone(&errors);
 
             async move {
                 match download_result {
                     Ok((case_id, data_type, content)) => {
-                        //
-                        match parse_content(
+                        // Try to parse the content
+                        match parse_content(
+                            &content,
+                            &case_id,
+                            &data_type,
+                            min_total_depth,
+                            min_alt_allele_count,
+                            &consequences,
+                            gain_threshold,
+                            loss_threshold,
+                            seg_length,
+                            &filtered_records,
+                            &filtered_maf_records,
+                            &filtered_cnv_records,
+                        )
+                        .await
+                        {
                             Ok(parsed_data) => {
-                                //
-
+                                // SUCCESS: Output immediately as JSONL
+                                let success_output = SuccessfulFileOutput {
+                                    output_type: "data".to_string(),
+                                    case_id: case_id.clone(),
+                                    data_type: data_type.clone(),
+                                    data: parsed_data,
+                                };
+
+                                // Output this successful result immediately - Node.js will see this in real-time
+                                if let Ok(json) = serde_json::to_string(&success_output) {
+                                    println!("{}", json); // IMMEDIATE output to stdout
+                                    // Force flush to ensure Node.js sees it immediately
+                                    use std::io::Write;
+                                    let _ = std::io::stdout().flush();
+                                    // Optional: Add small delay to separate lines
+                                    sleep(Duration::from_millis(10));
+                                }
+
                                 successful_downloads.fetch_add(1, Ordering::Relaxed);
                             }
                             Err((cid, dtp, error)) => {
+                                // Parsing failed - add to errors
                                 failed_downloads.fetch_add(1, Ordering::Relaxed);
                                 let error = ErrorEntry {
                                     case_id: cid,
```
```diff
@@ -364,9 +688,9 @@ async fn download_data(
                             }
                         }
                     Err((case_id, data_type, error_details, attempts)) => {
+                        // Download failed - add to errors
                         failed_downloads.fetch_add(1, Ordering::Relaxed);
 
-                        // Parse error type from error details
                         let (error_type, clean_details) = if error_details.contains(":") {
                             let parts: Vec<&str> = error_details.splitn(2, ": ").collect();
                             (parts[0].to_string(), parts[1].to_string())
@@ -388,27 +712,29 @@ async fn download_data(
         })
         .await;
 
-    //
+    // Output final summary as the last line
     let success_count = successful_downloads.load(Ordering::Relaxed);
     let failed_count = failed_downloads.load(Ordering::Relaxed);
-
-    let
-
-
-
-
-
-
+    let filtered_maf_count = filtered_maf_records.load(Ordering::Relaxed);
+    let filtered_cnv_count = filtered_cnv_records.load(Ordering::Relaxed);
+
+    let summary = FinalSummary {
+        output_type: "summary".to_string(),
+        total_files,
+        successful_files: success_count,
+        failed_files: failed_count,
+        errors: errors.lock().await.clone(),
+        filtered_records: filtered_maf_count + filtered_cnv_count,
+        filtered_maf_records: filtered_maf_count,
+        filtered_cnv_records: filtered_cnv_count,
+        filtered_records_by_case: filtered_records.lock().await.clone(),
     };
 
-    // Output
-
-
-
-
-    std::process::exit(1);
-    }
+    // Output final summary - Node.js will know processing is complete when it sees this
+    if let Ok(json) = serde_json::to_string(&summary) {
+        println!("{}", json);
+        use std::io::Write;
+        let _ = std::io::stdout().flush();
     }
 }
 
```
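The download loop's shape is `stream::iter(futures).buffer_unordered(20).for_each(...)`: at most 20 downloads are in flight at once, and each completion is handled (and printed as a JSONL line) as soon as it finishes, in arbitrary order. A self-contained sketch of that concurrency pattern, assuming `futures` and `tokio` (with the `macros` and `rt-multi-thread` features) as dependencies; the closures here are stand-ins for `download_single_file` and the JSONL `println!` in the diff:

```rust
use futures::StreamExt;

#[tokio::main]
async fn main() {
    let jobs = vec!["file-a", "file-b", "file-c"];

    // Turn the job list into a stream of futures...
    futures::stream::iter(jobs.into_iter().map(|id| async move {
        // stand-in for download_single_file(case_id, data_type, url, 2)
        (id, id.len())
    }))
    // ...run up to 20 of them concurrently, yielding results as they finish...
    .buffer_unordered(20)
    // ...and emit one JSONL line per completed job.
    .for_each(|(id, size)| async move {
        println!("{{\"type\":\"data\",\"id\":\"{}\",\"size\":{}}}", id, size);
    })
    .await;
}
```

The design trade-off this makes, per the diff's own comments, is bounded parallelism with incremental output: the consumer sees each file's data as soon as it is parsed rather than waiting for one large JSON array at the end.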
```diff
@@ -455,13 +781,33 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let case_files = input_js.case_files;
 
     // Set default maf_options
-    let (min_total_depth, min_alt_allele_count) = match input_js.maf_options {
-        Some(options) => (
-
+    let (min_total_depth, min_alt_allele_count, consequences) = match input_js.maf_options {
+        Some(options) => (
+            options.min_total_depth,
+            options.min_alt_allele_count,
+            options.consequences.clone(),
+        ),
+        None => (10, 2, None), // Default values
+    };
+
+    // Set default cnv_options
+    let (gain_threshold, loss_threshold, seg_length) = match input_js.cnv_options {
+        Some(options) => (options.gain_threshold, options.loss_threshold, options.seg_length),
+        None => (0.3, -0.4, 0), // Default values
     };
 
     // Download data - this will now handle errors gracefully
-
+    download_data_streaming(
+        case_files,
+        HOST,
+        min_total_depth,
+        min_alt_allele_count,
+        &consequences,
+        gain_threshold,
+        loss_threshold,
+        seg_length,
+    )
+    .await;
 
     // Always exit successfully - individual file failures are logged but don't stop the process
     Ok(())
```