@sjcrh/proteinpaint-rust 2.129.6 → 2.132.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -3
- package/package.json +1 -1
- package/src/gdcGRIN2.rs +209 -78
package/README.md
CHANGED
|
@@ -36,13 +36,18 @@ const out = await run_rust('indel', input_data)
|
|
|
36
36
|
|
|
37
37
|
## Test
|
|
38
38
|
|
|
39
|
-
|
|
39
|
+
To run the tests written in Node.js, run the following from the `proteinpaint` directory:
|
|
40
40
|
|
|
41
41
|
```bash
|
|
42
|
-
|
|
43
|
-
npx tsc
|
|
42
|
+
npm run test:unit --workspace="rust"
|
|
44
43
|
```
|
|
45
44
|
|
|
45
|
+
To run the tests written in native Rust, run the following from the `proteinpaint/rust` directory:
|
|
46
|
+
```bash
|
|
47
|
+
cargo test
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
|
|
46
51
|
## Build
|
|
47
52
|
|
|
48
53
|
```bash
|
package/package.json
CHANGED
package/src/gdcGRIN2.rs
CHANGED
|
@@ -13,18 +13,19 @@
|
|
|
13
13
|
Output mutations as JSON array.
|
|
14
14
|
|
|
15
15
|
Example of usage:
|
|
16
|
-
echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000}}' | ./target/release/gdcGRIN2
|
|
16
|
+
echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":1000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000}}' | ./target/release/gdcGRIN2
|
|
17
17
|
*/
|
|
18
18
|
|
|
19
19
|
use flate2::read::GzDecoder;
|
|
20
20
|
use futures::StreamExt;
|
|
21
21
|
use memchr::memchr;
|
|
22
|
-
use serde::Deserialize;
|
|
22
|
+
use serde::{Deserialize, Serialize};
|
|
23
23
|
use serde_json;
|
|
24
24
|
use std::collections::HashMap;
|
|
25
25
|
use std::io::{self, Read};
|
|
26
26
|
use std::sync::Arc;
|
|
27
27
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
28
|
+
use std::thread::sleep;
|
|
28
29
|
use std::time::Duration;
|
|
29
30
|
use tokio::io::{AsyncReadExt, BufReader};
|
|
30
31
|
use tokio::sync::Mutex;
|
|
@@ -54,6 +55,8 @@ struct MafOptions {
|
|
|
54
55
|
min_total_depth: i32,
|
|
55
56
|
#[serde(rename = "minAltAlleleCount")]
|
|
56
57
|
min_alt_allele_count: i32,
|
|
58
|
+
#[serde(rename = "hyperMutator")]
|
|
59
|
+
hyper_mutator: i32,
|
|
57
60
|
consequences: Option<Vec<String>>, // Optional list of consequences to filter MAF files
|
|
58
61
|
}
|
|
59
62
|
|
|
@@ -78,6 +81,31 @@ struct SuccessfulFileOutput {
|
|
|
78
81
|
data: Vec<Vec<String>>,
|
|
79
82
|
}
|
|
80
83
|
|
|
84
|
+
// struct for MAF filter details
|
|
85
|
+
#[derive(Clone, Serialize, Default)]
|
|
86
|
+
struct FilteredMafDetails {
|
|
87
|
+
matched_consequences: HashMap<String, usize>,
|
|
88
|
+
rejected_consequences: HashMap<String, usize>,
|
|
89
|
+
t_alt_count: usize,
|
|
90
|
+
t_depth: usize,
|
|
91
|
+
invalid_rows: usize,
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
// struct for CNV filter details
|
|
95
|
+
#[derive(Clone, Serialize, Default)]
|
|
96
|
+
struct FilteredCnvDetails {
|
|
97
|
+
segment_mean: usize,
|
|
98
|
+
seg_length: usize,
|
|
99
|
+
invalid_rows: usize,
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
// struct for per-case filter details
|
|
103
|
+
#[derive(Clone, Serialize)]
|
|
104
|
+
struct FilteredCaseDetails {
|
|
105
|
+
maf: FilteredMafDetails,
|
|
106
|
+
cnv: FilteredCnvDetails,
|
|
107
|
+
}
|
|
108
|
+
|
|
81
109
|
// Final summary output (JSONL format)
|
|
82
110
|
#[derive(serde::Serialize)]
|
|
83
111
|
struct FinalSummary {
|
|
@@ -87,6 +115,13 @@ struct FinalSummary {
|
|
|
87
115
|
successful_files: usize,
|
|
88
116
|
failed_files: usize,
|
|
89
117
|
errors: Vec<ErrorEntry>,
|
|
118
|
+
filtered_records: usize,
|
|
119
|
+
filtered_maf_records: usize,
|
|
120
|
+
filtered_cnv_records: usize,
|
|
121
|
+
included_maf_records: usize,
|
|
122
|
+
included_cnv_records: usize,
|
|
123
|
+
filtered_records_by_case: HashMap<String, FilteredCaseDetails>,
|
|
124
|
+
hyper_mutator_records: Vec<String>,
|
|
90
125
|
}
|
|
91
126
|
|
|
92
127
|
// Define the top-level input structure
|
|
@@ -107,39 +142,25 @@ struct DataTypeConfig {
|
|
|
107
142
|
output_columns: Vec<&'static str>,
|
|
108
143
|
}
|
|
109
144
|
|
|
110
|
-
// Function to check if CNV file has Segment_Mean column
|
|
111
|
-
fn has_segment_mean_column(content: &str) -> bool {
|
|
112
|
-
for line in content.lines() {
|
|
113
|
-
// Check if this line contains Segment_Mean (likely the header)
|
|
114
|
-
if line.contains("Segment_Mean") {
|
|
115
|
-
return true;
|
|
116
|
-
}
|
|
117
|
-
// Stop checking after a few non-comment lines to avoid parsing entire file
|
|
118
|
-
if !line.trim().is_empty() {
|
|
119
|
-
break;
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
false
|
|
123
|
-
}
|
|
124
|
-
|
|
125
145
|
// Function to parse TSV content
|
|
126
|
-
|
|
127
|
-
fn parse_content(
|
|
146
|
+
async fn parse_content(
|
|
128
147
|
content: &str,
|
|
129
148
|
case_id: &str,
|
|
130
149
|
data_type: &str,
|
|
131
150
|
min_total_depth: i32,
|
|
132
151
|
min_alt_allele_count: i32,
|
|
152
|
+
hyper_mutator: i32,
|
|
133
153
|
consequences: &Option<Vec<String>>,
|
|
134
154
|
gain_threshold: f32,
|
|
135
155
|
loss_threshold: f32,
|
|
136
156
|
seg_length: i32,
|
|
157
|
+
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
158
|
+
filtered_maf_records: &AtomicUsize,
|
|
159
|
+
filtered_cnv_records: &AtomicUsize,
|
|
160
|
+
included_maf_records: &AtomicUsize,
|
|
161
|
+
included_cnv_records: &AtomicUsize,
|
|
162
|
+
hyper_mutator_records: &Arc<Mutex<Vec<String>>>,
|
|
137
163
|
) -> Result<Vec<Vec<String>>, (String, String, String)> {
|
|
138
|
-
// Early filter for CNV files - only process files with Segment_Mean
|
|
139
|
-
if data_type == "cnv" && !has_segment_mean_column(content) {
|
|
140
|
-
return Ok(Vec::new()); // Return empty result, no error
|
|
141
|
-
}
|
|
142
|
-
|
|
143
164
|
let config = match data_type {
|
|
144
165
|
"cnv" => DataTypeConfig {
|
|
145
166
|
header_marker: "Segment_Mean",
|
|
@@ -158,6 +179,18 @@ fn parse_content(
|
|
|
158
179
|
}
|
|
159
180
|
};
|
|
160
181
|
|
|
182
|
+
// check hyperMutator for MAF files
|
|
183
|
+
if data_type == "maf" && hyper_mutator > 0 {
|
|
184
|
+
let line_count = content.lines().count();
|
|
185
|
+
if line_count as i32 > hyper_mutator {
|
|
186
|
+
let mut hyper_records = hyper_mutator_records.lock().await;
|
|
187
|
+
if !hyper_records.contains(&case_id.to_string()) {
|
|
188
|
+
hyper_records.push(case_id.to_string());
|
|
189
|
+
}
|
|
190
|
+
return Ok(Vec::new());
|
|
191
|
+
}
|
|
192
|
+
};
|
|
193
|
+
|
|
161
194
|
let lines = content.lines();
|
|
162
195
|
let mut parsed_data = Vec::new();
|
|
163
196
|
let mut columns_indices: Vec<usize> = Vec::new();
|
|
@@ -199,7 +232,13 @@ fn parse_content(
|
|
|
199
232
|
gain_threshold,
|
|
200
233
|
loss_threshold,
|
|
201
234
|
seg_length,
|
|
202
|
-
|
|
235
|
+
filtered_records,
|
|
236
|
+
filtered_maf_records,
|
|
237
|
+
filtered_cnv_records,
|
|
238
|
+
included_maf_records,
|
|
239
|
+
included_cnv_records,
|
|
240
|
+
)
|
|
241
|
+
.await?;
|
|
203
242
|
|
|
204
243
|
if let Some(out_lst) = row {
|
|
205
244
|
parsed_data.push(out_lst);
|
|
@@ -240,7 +279,7 @@ fn setup_columns(
|
|
|
240
279
|
}
|
|
241
280
|
|
|
242
281
|
if data_type == "maf" {
|
|
243
|
-
*variant_classification_index = header.iter().position(|x| x == "
|
|
282
|
+
*variant_classification_index = header.iter().position(|x| x == "One_Consequence");
|
|
244
283
|
if variant_classification_index.is_none() {
|
|
245
284
|
return Err((
|
|
246
285
|
case_id.to_string(),
|
|
@@ -254,7 +293,7 @@ fn setup_columns(
|
|
|
254
293
|
}
|
|
255
294
|
|
|
256
295
|
// Process a single row of data
|
|
257
|
-
fn process_row(
|
|
296
|
+
async fn process_row(
|
|
258
297
|
line: &str,
|
|
259
298
|
case_id: &str,
|
|
260
299
|
data_type: &str,
|
|
@@ -267,18 +306,88 @@ fn process_row(
|
|
|
267
306
|
gain_threshold: f32,
|
|
268
307
|
loss_threshold: f32,
|
|
269
308
|
seg_length: i32,
|
|
309
|
+
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
310
|
+
filtered_maf_records: &AtomicUsize,
|
|
311
|
+
filtered_cnv_records: &AtomicUsize,
|
|
312
|
+
included_maf_records: &AtomicUsize,
|
|
313
|
+
included_cnv_records: &AtomicUsize,
|
|
270
314
|
) -> Result<Option<Vec<String>>, (String, String, String)> {
|
|
271
315
|
let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
272
316
|
let mut out_lst = vec![case_id.to_string()];
|
|
273
317
|
|
|
274
|
-
//
|
|
275
|
-
|
|
276
|
-
|
|
318
|
+
// Initialize or update case details
|
|
319
|
+
let mut filtered_map = filtered_records.lock().await;
|
|
320
|
+
filtered_map
|
|
321
|
+
.entry(case_id.to_string())
|
|
322
|
+
.or_insert_with(|| FilteredCaseDetails {
|
|
323
|
+
maf: FilteredMafDetails::default(),
|
|
324
|
+
cnv: FilteredCnvDetails::default(),
|
|
325
|
+
});
|
|
326
|
+
|
|
327
|
+
let case_details = filtered_map.get_mut(case_id).unwrap();
|
|
328
|
+
|
|
329
|
+
// Handle consequence filtering and counting for MAF files
|
|
330
|
+
if data_type == "maf" {
|
|
331
|
+
if let Some(var_class_idx) = variant_classification_index {
|
|
332
|
+
if var_class_idx < cont_lst.len() {
|
|
333
|
+
let variant_classification = &cont_lst[var_class_idx];
|
|
334
|
+
if let Some(consequence_filter) = consequences {
|
|
335
|
+
if !consequence_filter.is_empty() {
|
|
336
|
+
if consequence_filter.contains(variant_classification) {
|
|
337
|
+
// Matched consequence
|
|
338
|
+
*case_details
|
|
339
|
+
.maf
|
|
340
|
+
.matched_consequences
|
|
341
|
+
.entry(variant_classification.to_string())
|
|
342
|
+
.or_insert(0) += 1;
|
|
343
|
+
} else {
|
|
344
|
+
// Unmatched consequence
|
|
345
|
+
*case_details
|
|
346
|
+
.maf
|
|
347
|
+
.rejected_consequences
|
|
348
|
+
.entry(variant_classification.to_string())
|
|
349
|
+
.or_insert(0) += 1;
|
|
350
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
351
|
+
return Ok(None);
|
|
352
|
+
}
|
|
353
|
+
} else {
|
|
354
|
+
// Empty filter, count as matched
|
|
355
|
+
*case_details
|
|
356
|
+
.maf
|
|
357
|
+
.matched_consequences
|
|
358
|
+
.entry(variant_classification.to_string())
|
|
359
|
+
.or_insert(0) += 1;
|
|
360
|
+
}
|
|
361
|
+
} else {
|
|
362
|
+
// No filter, count as matched
|
|
363
|
+
*case_details
|
|
364
|
+
.maf
|
|
365
|
+
.matched_consequences
|
|
366
|
+
.entry(variant_classification.to_string())
|
|
367
|
+
.or_insert(0) += 1;
|
|
368
|
+
}
|
|
369
|
+
} else {
|
|
370
|
+
case_details.maf.invalid_rows += 1;
|
|
371
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
372
|
+
return Ok(None);
|
|
373
|
+
}
|
|
374
|
+
} else {
|
|
375
|
+
case_details.maf.invalid_rows += 1;
|
|
376
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
377
|
+
return Ok(None);
|
|
378
|
+
}
|
|
277
379
|
}
|
|
278
380
|
|
|
279
381
|
// Extract relevant columns
|
|
280
382
|
for &x in columns_indices {
|
|
281
383
|
if x >= cont_lst.len() {
|
|
384
|
+
if data_type == "maf" {
|
|
385
|
+
case_details.maf.invalid_rows += 1;
|
|
386
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
387
|
+
} else if data_type == "cnv" {
|
|
388
|
+
case_details.cnv.invalid_rows += 1;
|
|
389
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
390
|
+
}
|
|
282
391
|
return Ok(None); // Invalid row
|
|
283
392
|
}
|
|
284
393
|
|
|
@@ -286,6 +395,8 @@ fn process_row(
|
|
|
286
395
|
if data_type == "cnv" && header[x] == "Segment_Mean" {
|
|
287
396
|
element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
|
|
288
397
|
if element.is_empty() {
|
|
398
|
+
case_details.cnv.segment_mean += 1;
|
|
399
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
289
400
|
return Ok(None);
|
|
290
401
|
}
|
|
291
402
|
}
|
|
@@ -295,10 +406,14 @@ fn process_row(
|
|
|
295
406
|
// Additional MAF-specific processing
|
|
296
407
|
if data_type == "maf" {
|
|
297
408
|
if out_lst.len() < 6 {
|
|
409
|
+
case_details.maf.invalid_rows += 1;
|
|
410
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
298
411
|
return Ok(None); // Not enough columns
|
|
299
412
|
}
|
|
300
413
|
|
|
301
414
|
let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
|
|
415
|
+
case_details.maf.invalid_rows += 1;
|
|
416
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
302
417
|
(
|
|
303
418
|
case_id.to_string(),
|
|
304
419
|
data_type.to_string(),
|
|
@@ -307,6 +422,8 @@ fn process_row(
|
|
|
307
422
|
})?;
|
|
308
423
|
|
|
309
424
|
let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
|
|
425
|
+
case_details.maf.invalid_rows += 1;
|
|
426
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
310
427
|
(
|
|
311
428
|
case_id.to_string(),
|
|
312
429
|
data_type.to_string(),
|
|
@@ -314,19 +431,31 @@ fn process_row(
|
|
|
314
431
|
)
|
|
315
432
|
})?;
|
|
316
433
|
|
|
317
|
-
if alle_depth < min_total_depth
|
|
434
|
+
if alle_depth < min_total_depth {
|
|
435
|
+
case_details.maf.t_depth += 1;
|
|
436
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
437
|
+
return Ok(None);
|
|
438
|
+
}
|
|
439
|
+
if alt_count < min_alt_allele_count {
|
|
440
|
+
case_details.maf.t_alt_count += 1;
|
|
441
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
318
442
|
return Ok(None);
|
|
319
443
|
}
|
|
320
444
|
|
|
321
445
|
// Keep case_id, chr, start, end, and add "mutation"
|
|
322
446
|
out_lst = out_lst[0..4].to_vec();
|
|
323
447
|
out_lst.push("mutation".to_string());
|
|
448
|
+
|
|
449
|
+
// Update counters for included MAF records
|
|
450
|
+
included_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
324
451
|
}
|
|
325
452
|
|
|
326
|
-
// filter cnvs based on segment length. Default:
|
|
453
|
+
// filter cnvs based on segment length. Default: 0 (no filtering)
|
|
327
454
|
if data_type == "cnv" {
|
|
328
455
|
// calculate segment length (End_Position - Start_Position)
|
|
329
456
|
let end_position = out_lst[3].parse::<i32>().map_err(|_| {
|
|
457
|
+
case_details.cnv.invalid_rows += 1;
|
|
458
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
330
459
|
(
|
|
331
460
|
case_id.to_string(),
|
|
332
461
|
data_type.to_string(),
|
|
@@ -335,6 +464,8 @@ fn process_row(
|
|
|
335
464
|
})?;
|
|
336
465
|
|
|
337
466
|
let start_position = out_lst[2].parse::<i32>().map_err(|_| {
|
|
467
|
+
case_details.cnv.invalid_rows += 1;
|
|
468
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
338
469
|
(
|
|
339
470
|
case_id.to_string(),
|
|
340
471
|
data_type.to_string(),
|
|
@@ -342,36 +473,17 @@ fn process_row(
|
|
|
342
473
|
)
|
|
343
474
|
})?;
|
|
344
475
|
let cnv_length = end_position - start_position;
|
|
345
|
-
if cnv_length > seg_length {
|
|
476
|
+
if seg_length > 0 && cnv_length > seg_length {
|
|
477
|
+
case_details.cnv.seg_length += 1;
|
|
478
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
346
479
|
return Ok(None);
|
|
347
480
|
}
|
|
481
|
+
included_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
348
482
|
}
|
|
349
483
|
|
|
350
484
|
Ok(Some(out_lst))
|
|
351
485
|
}
|
|
352
486
|
|
|
353
|
-
// Check if the row meets consequence filtering criteria
|
|
354
|
-
fn is_valid_consequence(
|
|
355
|
-
cont_lst: &[String],
|
|
356
|
-
variant_classification_index: Option<usize>,
|
|
357
|
-
consequences: &Option<Vec<String>>,
|
|
358
|
-
) -> bool {
|
|
359
|
-
if let Some(consequence_filter) = consequences {
|
|
360
|
-
if !consequence_filter.is_empty() {
|
|
361
|
-
if let Some(var_class_idx) = variant_classification_index {
|
|
362
|
-
if var_class_idx < cont_lst.len() {
|
|
363
|
-
let variant_classification = &cont_lst[var_class_idx];
|
|
364
|
-
if let Some(normalized_consequence) = normalize_consequence(variant_classification) {
|
|
365
|
-
return consequence_filter.contains(&normalized_consequence);
|
|
366
|
-
}
|
|
367
|
-
}
|
|
368
|
-
return false; // Invalid row or unknown consequence
|
|
369
|
-
}
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
true // No filtering or empty filter
|
|
373
|
-
}
|
|
374
|
-
|
|
375
487
|
// Process Segment_Mean for CNV files
|
|
376
488
|
fn process_segment_mean(
|
|
377
489
|
element: &str,
|
|
@@ -398,23 +510,6 @@ fn process_segment_mean(
|
|
|
398
510
|
}
|
|
399
511
|
|
|
400
512
|
/// Updated helper function to normalize MAF consequence types to frontend format
|
|
401
|
-
/// Returns None for unknown consequence types (which will be filtered out)
|
|
402
|
-
fn normalize_consequence(maf_consequence: &str) -> Option<String> {
|
|
403
|
-
match maf_consequence.to_lowercase().as_str() {
|
|
404
|
-
// Only map the consequence types we actually support
|
|
405
|
-
"missense_mutation" => Some("missense".to_string()),
|
|
406
|
-
"nonsense_mutation" | "stop_gained" | "stop_lost" => Some("nonsense".to_string()),
|
|
407
|
-
"frame_shift_del" | "frame_shift_ins" | "frameshift_variant" => Some("frameshift".to_string()),
|
|
408
|
-
"silent" | "synonymous_variant" => Some("silent".to_string()),
|
|
409
|
-
"in_frame_del" => Some("deletion".to_string()),
|
|
410
|
-
"in_frame_ins" => Some("insertion".to_string()),
|
|
411
|
-
"splice_site" | "splice_acceptor_variant" | "splice_donor_variant" => Some("splice_site".to_string()),
|
|
412
|
-
"tandem_duplication" | "duplication" => Some("duplication".to_string()),
|
|
413
|
-
"inversion" => Some("inversion".to_string()),
|
|
414
|
-
// Return None for all unknown consequence types - they will be filtered out
|
|
415
|
-
_ => None,
|
|
416
|
-
}
|
|
417
|
-
}
|
|
418
513
|
/// Downloads a single file with minimal retry logic for transient failures
|
|
419
514
|
async fn download_single_file(
|
|
420
515
|
case_id: String,
|
|
@@ -517,7 +612,7 @@ async fn download_single_file(
|
|
|
517
612
|
))
|
|
518
613
|
}
|
|
519
614
|
|
|
520
|
-
///
|
|
615
|
+
/// Streaming download function
|
|
521
616
|
/// Outputs JSONL format: one JSON object per line
|
|
522
617
|
/// Node.js will read this line-by-line but still wait for completion
|
|
523
618
|
async fn download_data_streaming(
|
|
@@ -525,6 +620,7 @@ async fn download_data_streaming(
|
|
|
525
620
|
host: &str,
|
|
526
621
|
min_total_depth: i32,
|
|
527
622
|
min_alt_allele_count: i32,
|
|
623
|
+
hyper_mutator: i32,
|
|
528
624
|
consequences: &Option<Vec<String>>,
|
|
529
625
|
gain_threshold: f32,
|
|
530
626
|
loss_threshold: f32,
|
|
@@ -549,6 +645,12 @@ async fn download_data_streaming(
|
|
|
549
645
|
// Counters for final summary
|
|
550
646
|
let successful_downloads = Arc::new(AtomicUsize::new(0));
|
|
551
647
|
let failed_downloads = Arc::new(AtomicUsize::new(0));
|
|
648
|
+
let filtered_maf_records = Arc::new(AtomicUsize::new(0));
|
|
649
|
+
let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
|
|
650
|
+
let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
|
|
651
|
+
let hyper_mutator_records = Arc::new(Mutex::new(Vec::<String>::new()));
|
|
652
|
+
let included_maf_records = Arc::new(AtomicUsize::new(0));
|
|
653
|
+
let included_cnv_records = Arc::new(AtomicUsize::new(0));
|
|
552
654
|
|
|
553
655
|
// Only collect errors (successful data is output immediately)
|
|
554
656
|
let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
|
|
@@ -565,6 +667,12 @@ async fn download_data_streaming(
|
|
|
565
667
|
.for_each(|download_result| {
|
|
566
668
|
let successful_downloads = Arc::clone(&successful_downloads);
|
|
567
669
|
let failed_downloads = Arc::clone(&failed_downloads);
|
|
670
|
+
let filtered_maf_records = Arc::clone(&filtered_maf_records);
|
|
671
|
+
let filtered_cnv_records = Arc::clone(&filtered_cnv_records);
|
|
672
|
+
let filtered_records = Arc::clone(&filtered_records);
|
|
673
|
+
let included_maf_records = Arc::clone(&included_maf_records);
|
|
674
|
+
let included_cnv_records = Arc::clone(&included_cnv_records);
|
|
675
|
+
let hyper_mutator_records = Arc::clone(&hyper_mutator_records);
|
|
568
676
|
let errors = Arc::clone(&errors);
|
|
569
677
|
|
|
570
678
|
async move {
|
|
@@ -577,11 +685,20 @@ async fn download_data_streaming(
|
|
|
577
685
|
&data_type,
|
|
578
686
|
min_total_depth,
|
|
579
687
|
min_alt_allele_count,
|
|
688
|
+
hyper_mutator,
|
|
580
689
|
&consequences,
|
|
581
690
|
gain_threshold,
|
|
582
691
|
loss_threshold,
|
|
583
692
|
seg_length,
|
|
584
|
-
|
|
693
|
+
&filtered_records,
|
|
694
|
+
&filtered_maf_records,
|
|
695
|
+
&filtered_cnv_records,
|
|
696
|
+
&included_maf_records,
|
|
697
|
+
&included_cnv_records,
|
|
698
|
+
&hyper_mutator_records,
|
|
699
|
+
)
|
|
700
|
+
.await
|
|
701
|
+
{
|
|
585
702
|
Ok(parsed_data) => {
|
|
586
703
|
// SUCCESS: Output immediately as JSONL
|
|
587
704
|
let success_output = SuccessfulFileOutput {
|
|
@@ -597,6 +714,8 @@ async fn download_data_streaming(
|
|
|
597
714
|
// Force flush to ensure Node.js sees it immediately
|
|
598
715
|
use std::io::Write;
|
|
599
716
|
let _ = std::io::stdout().flush();
|
|
717
|
+
// Optional: Add small delay to separate lines
|
|
718
|
+
sleep(Duration::from_millis(10));
|
|
600
719
|
}
|
|
601
720
|
|
|
602
721
|
successful_downloads.fetch_add(1, Ordering::Relaxed);
|
|
@@ -643,6 +762,10 @@ async fn download_data_streaming(
|
|
|
643
762
|
// Output final summary as the last line
|
|
644
763
|
let success_count = successful_downloads.load(Ordering::Relaxed);
|
|
645
764
|
let failed_count = failed_downloads.load(Ordering::Relaxed);
|
|
765
|
+
let filtered_maf_count = filtered_maf_records.load(Ordering::Relaxed);
|
|
766
|
+
let filtered_cnv_count = filtered_cnv_records.load(Ordering::Relaxed);
|
|
767
|
+
let included_maf_count = included_maf_records.load(Ordering::Relaxed);
|
|
768
|
+
let included_cnv_count = included_cnv_records.load(Ordering::Relaxed);
|
|
646
769
|
|
|
647
770
|
let summary = FinalSummary {
|
|
648
771
|
output_type: "summary".to_string(),
|
|
@@ -650,6 +773,13 @@ async fn download_data_streaming(
|
|
|
650
773
|
successful_files: success_count,
|
|
651
774
|
failed_files: failed_count,
|
|
652
775
|
errors: errors.lock().await.clone(),
|
|
776
|
+
filtered_records: filtered_maf_count + filtered_cnv_count,
|
|
777
|
+
filtered_maf_records: filtered_maf_count,
|
|
778
|
+
filtered_cnv_records: filtered_cnv_count,
|
|
779
|
+
filtered_records_by_case: filtered_records.lock().await.clone(),
|
|
780
|
+
included_maf_records: included_maf_count,
|
|
781
|
+
included_cnv_records: included_cnv_count,
|
|
782
|
+
hyper_mutator_records: hyper_mutator_records.lock().await.clone(),
|
|
653
783
|
};
|
|
654
784
|
|
|
655
785
|
// Output final summary - Node.js will know processing is complete when it sees this
|
|
@@ -703,28 +833,29 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
703
833
|
let case_files = input_js.case_files;
|
|
704
834
|
|
|
705
835
|
// Set default maf_options
|
|
706
|
-
let (min_total_depth, min_alt_allele_count, consequences) = match input_js.maf_options {
|
|
836
|
+
let (min_total_depth, min_alt_allele_count, hyper_mutator, consequences) = match input_js.maf_options {
|
|
707
837
|
Some(options) => (
|
|
708
838
|
options.min_total_depth,
|
|
709
839
|
options.min_alt_allele_count,
|
|
840
|
+
options.hyper_mutator,
|
|
710
841
|
options.consequences.clone(),
|
|
711
842
|
),
|
|
712
|
-
None => (10, 2, None), // Default values
|
|
843
|
+
None => (10, 2, 8000, None), // Default values
|
|
713
844
|
};
|
|
714
845
|
|
|
715
846
|
// Set default cnv_options
|
|
716
847
|
let (gain_threshold, loss_threshold, seg_length) = match input_js.cnv_options {
|
|
717
848
|
Some(options) => (options.gain_threshold, options.loss_threshold, options.seg_length),
|
|
718
|
-
None => (0.3, -0.4,
|
|
849
|
+
None => (0.3, -0.4, 0), // Default values
|
|
719
850
|
};
|
|
720
851
|
|
|
721
852
|
// Download data - this will now handle errors gracefully
|
|
722
|
-
// download_data(case_files, HOST, min_total_depth, min_alt_allele_count, &consequences).await;
|
|
723
853
|
download_data_streaming(
|
|
724
854
|
case_files,
|
|
725
855
|
HOST,
|
|
726
856
|
min_total_depth,
|
|
727
857
|
min_alt_allele_count,
|
|
858
|
+
hyper_mutator,
|
|
728
859
|
&consequences,
|
|
729
860
|
gain_threshold,
|
|
730
861
|
loss_threshold,
|