@sjcrh/proteinpaint-rust 2.130.0 → 2.132.1-0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/gdcGRIN2.rs +135 -49
package/package.json
CHANGED
package/src/gdcGRIN2.rs
CHANGED
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
Output mutations as JSON array.
|
|
14
14
|
|
|
15
15
|
Example of usage:
|
|
16
|
-
echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000}}' | ./target/release/gdcGRIN2
|
|
16
|
+
echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":1000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000}}' | ./target/release/gdcGRIN2
|
|
17
17
|
*/
|
|
18
18
|
|
|
19
19
|
use flate2::read::GzDecoder;
|
|
@@ -55,6 +55,8 @@ struct MafOptions {
|
|
|
55
55
|
min_total_depth: i32,
|
|
56
56
|
#[serde(rename = "minAltAlleleCount")]
|
|
57
57
|
min_alt_allele_count: i32,
|
|
58
|
+
#[serde(rename = "hyperMutator")]
|
|
59
|
+
hyper_mutator: i32,
|
|
58
60
|
consequences: Option<Vec<String>>, // Optional list of consequences to filter MAF files
|
|
59
61
|
}
|
|
60
62
|
|
|
@@ -82,10 +84,16 @@ struct SuccessfulFileOutput {
|
|
|
82
84
|
// struct for MAF filter details
|
|
83
85
|
#[derive(Clone, Serialize, Default)]
|
|
84
86
|
struct FilteredMafDetails {
|
|
85
|
-
|
|
87
|
+
matched_consequences: HashMap<String, usize>,
|
|
88
|
+
rejected_consequences: HashMap<String, usize>,
|
|
86
89
|
t_alt_count: usize,
|
|
87
90
|
t_depth: usize,
|
|
88
91
|
invalid_rows: usize,
|
|
92
|
+
excluded_by_min_depth: usize,
|
|
93
|
+
excluded_by_min_alt_count: usize,
|
|
94
|
+
excluded_by_consequence_type: usize,
|
|
95
|
+
total_processed: usize,
|
|
96
|
+
total_included: usize,
|
|
89
97
|
}
|
|
90
98
|
|
|
91
99
|
// struct for CNV filter details
|
|
@@ -94,6 +102,11 @@ struct FilteredCnvDetails {
|
|
|
94
102
|
segment_mean: usize,
|
|
95
103
|
seg_length: usize,
|
|
96
104
|
invalid_rows: usize,
|
|
105
|
+
excluded_by_loss_threshold: usize,
|
|
106
|
+
excluded_by_gain_threshold: usize,
|
|
107
|
+
excluded_by_segment_length: usize,
|
|
108
|
+
total_processed: usize,
|
|
109
|
+
total_included: usize,
|
|
97
110
|
}
|
|
98
111
|
|
|
99
112
|
// struct for per-case filter details
|
|
@@ -115,7 +128,10 @@ struct FinalSummary {
|
|
|
115
128
|
filtered_records: usize,
|
|
116
129
|
filtered_maf_records: usize,
|
|
117
130
|
filtered_cnv_records: usize,
|
|
131
|
+
included_maf_records: usize,
|
|
132
|
+
included_cnv_records: usize,
|
|
118
133
|
filtered_records_by_case: HashMap<String, FilteredCaseDetails>,
|
|
134
|
+
hyper_mutator_records: Vec<String>,
|
|
119
135
|
}
|
|
120
136
|
|
|
121
137
|
// Define the top-level input structure
|
|
@@ -143,6 +159,7 @@ async fn parse_content(
|
|
|
143
159
|
data_type: &str,
|
|
144
160
|
min_total_depth: i32,
|
|
145
161
|
min_alt_allele_count: i32,
|
|
162
|
+
hyper_mutator: i32,
|
|
146
163
|
consequences: &Option<Vec<String>>,
|
|
147
164
|
gain_threshold: f32,
|
|
148
165
|
loss_threshold: f32,
|
|
@@ -150,6 +167,9 @@ async fn parse_content(
|
|
|
150
167
|
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
151
168
|
filtered_maf_records: &AtomicUsize,
|
|
152
169
|
filtered_cnv_records: &AtomicUsize,
|
|
170
|
+
included_maf_records: &AtomicUsize,
|
|
171
|
+
included_cnv_records: &AtomicUsize,
|
|
172
|
+
hyper_mutator_records: &Arc<Mutex<Vec<String>>>,
|
|
153
173
|
) -> Result<Vec<Vec<String>>, (String, String, String)> {
|
|
154
174
|
let config = match data_type {
|
|
155
175
|
"cnv" => DataTypeConfig {
|
|
@@ -169,6 +189,18 @@ async fn parse_content(
|
|
|
169
189
|
}
|
|
170
190
|
};
|
|
171
191
|
|
|
192
|
+
// check hyperMutator for MAF files
|
|
193
|
+
if data_type == "maf" && hyper_mutator > 0 {
|
|
194
|
+
let line_count = content.lines().count();
|
|
195
|
+
if line_count as i32 > hyper_mutator {
|
|
196
|
+
let mut hyper_records = hyper_mutator_records.lock().await;
|
|
197
|
+
if !hyper_records.contains(&case_id.to_string()) {
|
|
198
|
+
hyper_records.push(case_id.to_string());
|
|
199
|
+
}
|
|
200
|
+
return Ok(Vec::new());
|
|
201
|
+
}
|
|
202
|
+
};
|
|
203
|
+
|
|
172
204
|
let lines = content.lines();
|
|
173
205
|
let mut parsed_data = Vec::new();
|
|
174
206
|
let mut columns_indices: Vec<usize> = Vec::new();
|
|
@@ -213,6 +245,8 @@ async fn parse_content(
|
|
|
213
245
|
filtered_records,
|
|
214
246
|
filtered_maf_records,
|
|
215
247
|
filtered_cnv_records,
|
|
248
|
+
included_maf_records,
|
|
249
|
+
included_cnv_records,
|
|
216
250
|
)
|
|
217
251
|
.await?;
|
|
218
252
|
|
|
@@ -255,7 +289,7 @@ fn setup_columns(
|
|
|
255
289
|
}
|
|
256
290
|
|
|
257
291
|
if data_type == "maf" {
|
|
258
|
-
*variant_classification_index = header.iter().position(|x| x == "
|
|
292
|
+
*variant_classification_index = header.iter().position(|x| x == "One_Consequence");
|
|
259
293
|
if variant_classification_index.is_none() {
|
|
260
294
|
return Err((
|
|
261
295
|
case_id.to_string(),
|
|
@@ -285,6 +319,8 @@ async fn process_row(
|
|
|
285
319
|
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
286
320
|
filtered_maf_records: &AtomicUsize,
|
|
287
321
|
filtered_cnv_records: &AtomicUsize,
|
|
322
|
+
included_maf_records: &AtomicUsize,
|
|
323
|
+
included_cnv_records: &AtomicUsize,
|
|
288
324
|
) -> Result<Option<Vec<String>>, (String, String, String)> {
|
|
289
325
|
let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
290
326
|
let mut out_lst = vec![case_id.to_string()];
|
|
@@ -300,11 +336,64 @@ async fn process_row(
|
|
|
300
336
|
|
|
301
337
|
let case_details = filtered_map.get_mut(case_id).unwrap();
|
|
302
338
|
|
|
303
|
-
//
|
|
304
|
-
if data_type == "maf"
|
|
305
|
-
case_details.maf.
|
|
306
|
-
|
|
307
|
-
|
|
339
|
+
// Track total processed records
|
|
340
|
+
if data_type == "maf" {
|
|
341
|
+
case_details.maf.total_processed += 1;
|
|
342
|
+
} else if data_type == "cnv" {
|
|
343
|
+
case_details.cnv.total_processed += 1;
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
// Handle consequence filtering and counting for MAF files
|
|
347
|
+
if data_type == "maf" {
|
|
348
|
+
if let Some(var_class_idx) = variant_classification_index {
|
|
349
|
+
if var_class_idx < cont_lst.len() {
|
|
350
|
+
let variant_classification = &cont_lst[var_class_idx];
|
|
351
|
+
if let Some(consequence_filter) = consequences {
|
|
352
|
+
if !consequence_filter.is_empty() {
|
|
353
|
+
if consequence_filter.contains(variant_classification) {
|
|
354
|
+
// Matched consequence
|
|
355
|
+
*case_details
|
|
356
|
+
.maf
|
|
357
|
+
.matched_consequences
|
|
358
|
+
.entry(variant_classification.to_string())
|
|
359
|
+
.or_insert(0) += 1;
|
|
360
|
+
} else {
|
|
361
|
+
// Unmatched consequence
|
|
362
|
+
*case_details
|
|
363
|
+
.maf
|
|
364
|
+
.rejected_consequences
|
|
365
|
+
.entry(variant_classification.to_string())
|
|
366
|
+
.or_insert(0) += 1;
|
|
367
|
+
case_details.maf.excluded_by_consequence_type += 1;
|
|
368
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
369
|
+
return Ok(None);
|
|
370
|
+
}
|
|
371
|
+
} else {
|
|
372
|
+
// Empty filter, count as matched
|
|
373
|
+
*case_details
|
|
374
|
+
.maf
|
|
375
|
+
.matched_consequences
|
|
376
|
+
.entry(variant_classification.to_string())
|
|
377
|
+
.or_insert(0) += 1;
|
|
378
|
+
}
|
|
379
|
+
} else {
|
|
380
|
+
// No filter, count as matched
|
|
381
|
+
*case_details
|
|
382
|
+
.maf
|
|
383
|
+
.matched_consequences
|
|
384
|
+
.entry(variant_classification.to_string())
|
|
385
|
+
.or_insert(0) += 1;
|
|
386
|
+
}
|
|
387
|
+
} else {
|
|
388
|
+
case_details.maf.invalid_rows += 1;
|
|
389
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
390
|
+
return Ok(None);
|
|
391
|
+
}
|
|
392
|
+
} else {
|
|
393
|
+
case_details.maf.invalid_rows += 1;
|
|
394
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
395
|
+
return Ok(None);
|
|
396
|
+
}
|
|
308
397
|
}
|
|
309
398
|
|
|
310
399
|
// Extract relevant columns
|
|
@@ -325,6 +414,15 @@ async fn process_row(
|
|
|
325
414
|
element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
|
|
326
415
|
if element.is_empty() {
|
|
327
416
|
case_details.cnv.segment_mean += 1;
|
|
417
|
+
let seg_mean = cont_lst[x].parse::<f32>().unwrap_or(0.0);
|
|
418
|
+
if seg_mean > loss_threshold && seg_mean < gain_threshold {
|
|
419
|
+
// Between thresholds - not a significant gain or loss
|
|
420
|
+
if seg_mean >= 0.0 {
|
|
421
|
+
case_details.cnv.excluded_by_gain_threshold += 1;
|
|
422
|
+
} else {
|
|
423
|
+
case_details.cnv.excluded_by_loss_threshold += 1;
|
|
424
|
+
}
|
|
425
|
+
}
|
|
328
426
|
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
329
427
|
return Ok(None);
|
|
330
428
|
}
|
|
@@ -362,11 +460,13 @@ async fn process_row(
|
|
|
362
460
|
|
|
363
461
|
if alle_depth < min_total_depth {
|
|
364
462
|
case_details.maf.t_depth += 1;
|
|
463
|
+
case_details.maf.excluded_by_min_depth += 1;
|
|
365
464
|
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
366
465
|
return Ok(None);
|
|
367
466
|
}
|
|
368
467
|
if alt_count < min_alt_allele_count {
|
|
369
468
|
case_details.maf.t_alt_count += 1;
|
|
469
|
+
case_details.maf.excluded_by_min_alt_count += 1;
|
|
370
470
|
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
371
471
|
return Ok(None);
|
|
372
472
|
}
|
|
@@ -374,6 +474,10 @@ async fn process_row(
|
|
|
374
474
|
// Keep case_id, chr, start, end, and add "mutation"
|
|
375
475
|
out_lst = out_lst[0..4].to_vec();
|
|
376
476
|
out_lst.push("mutation".to_string());
|
|
477
|
+
|
|
478
|
+
// Update counters for included MAF records
|
|
479
|
+
case_details.maf.total_included += 1;
|
|
480
|
+
included_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
377
481
|
}
|
|
378
482
|
|
|
379
483
|
// filter cnvs based on segment length. Default: 0 (no filtering)
|
|
@@ -401,36 +505,17 @@ async fn process_row(
|
|
|
401
505
|
let cnv_length = end_position - start_position;
|
|
402
506
|
if seg_length > 0 && cnv_length > seg_length {
|
|
403
507
|
case_details.cnv.seg_length += 1;
|
|
508
|
+
case_details.cnv.excluded_by_segment_length += 1;
|
|
404
509
|
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
405
510
|
return Ok(None);
|
|
406
511
|
}
|
|
512
|
+
case_details.cnv.total_included += 1;
|
|
513
|
+
included_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
407
514
|
}
|
|
408
515
|
|
|
409
516
|
Ok(Some(out_lst))
|
|
410
517
|
}
|
|
411
518
|
|
|
412
|
-
// Check if the row meets consequence filtering criteria
|
|
413
|
-
fn is_valid_consequence(
|
|
414
|
-
cont_lst: &[String],
|
|
415
|
-
variant_classification_index: Option<usize>,
|
|
416
|
-
consequences: &Option<Vec<String>>,
|
|
417
|
-
) -> bool {
|
|
418
|
-
if let Some(consequence_filter) = consequences {
|
|
419
|
-
if !consequence_filter.is_empty() {
|
|
420
|
-
if let Some(var_class_idx) = variant_classification_index {
|
|
421
|
-
if var_class_idx < cont_lst.len() {
|
|
422
|
-
let variant_classification = &cont_lst[var_class_idx];
|
|
423
|
-
if let Some(normalized_consequence) = normalize_consequence(variant_classification) {
|
|
424
|
-
return consequence_filter.contains(&normalized_consequence);
|
|
425
|
-
}
|
|
426
|
-
}
|
|
427
|
-
return false; // Invalid row or unknown consequence
|
|
428
|
-
}
|
|
429
|
-
}
|
|
430
|
-
}
|
|
431
|
-
true // No filtering or empty filter
|
|
432
|
-
}
|
|
433
|
-
|
|
434
519
|
// Process Segment_Mean for CNV files
|
|
435
520
|
fn process_segment_mean(
|
|
436
521
|
element: &str,
|
|
@@ -457,23 +542,6 @@ fn process_segment_mean(
|
|
|
457
542
|
}
|
|
458
543
|
|
|
459
544
|
/// Updated helper function to normalize MAF consequence types to frontend format
|
|
460
|
-
/// Returns None for unknown consequence types (which will be filtered out)
|
|
461
|
-
fn normalize_consequence(maf_consequence: &str) -> Option<String> {
|
|
462
|
-
match maf_consequence.to_lowercase().as_str() {
|
|
463
|
-
// Only map the consequence types we actually support
|
|
464
|
-
"missense_mutation" => Some("missense".to_string()),
|
|
465
|
-
"nonsense_mutation" | "stop_gained" | "stop_lost" => Some("nonsense".to_string()),
|
|
466
|
-
"frame_shift_del" | "frame_shift_ins" | "frameshift_variant" => Some("frameshift".to_string()),
|
|
467
|
-
"silent" | "synonymous_variant" => Some("silent".to_string()),
|
|
468
|
-
"in_frame_del" => Some("deletion".to_string()),
|
|
469
|
-
"in_frame_ins" => Some("insertion".to_string()),
|
|
470
|
-
"splice_site" | "splice_acceptor_variant" | "splice_donor_variant" => Some("splice_site".to_string()),
|
|
471
|
-
"tandem_duplication" | "duplication" => Some("duplication".to_string()),
|
|
472
|
-
"inversion" => Some("inversion".to_string()),
|
|
473
|
-
// Return None for all unknown consequence types - they will be filtered out
|
|
474
|
-
_ => None,
|
|
475
|
-
}
|
|
476
|
-
}
|
|
477
545
|
/// Downloads a single file with minimal retry logic for transient failures
|
|
478
546
|
async fn download_single_file(
|
|
479
547
|
case_id: String,
|
|
@@ -584,6 +652,7 @@ async fn download_data_streaming(
|
|
|
584
652
|
host: &str,
|
|
585
653
|
min_total_depth: i32,
|
|
586
654
|
min_alt_allele_count: i32,
|
|
655
|
+
hyper_mutator: i32,
|
|
587
656
|
consequences: &Option<Vec<String>>,
|
|
588
657
|
gain_threshold: f32,
|
|
589
658
|
loss_threshold: f32,
|
|
@@ -611,6 +680,9 @@ async fn download_data_streaming(
|
|
|
611
680
|
let filtered_maf_records = Arc::new(AtomicUsize::new(0));
|
|
612
681
|
let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
|
|
613
682
|
let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
|
|
683
|
+
let hyper_mutator_records = Arc::new(Mutex::new(Vec::<String>::new()));
|
|
684
|
+
let included_maf_records = Arc::new(AtomicUsize::new(0));
|
|
685
|
+
let included_cnv_records = Arc::new(AtomicUsize::new(0));
|
|
614
686
|
|
|
615
687
|
// Only collect errors (successful data is output immediately)
|
|
616
688
|
let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
|
|
@@ -630,6 +702,9 @@ async fn download_data_streaming(
|
|
|
630
702
|
let filtered_maf_records = Arc::clone(&filtered_maf_records);
|
|
631
703
|
let filtered_cnv_records = Arc::clone(&filtered_cnv_records);
|
|
632
704
|
let filtered_records = Arc::clone(&filtered_records);
|
|
705
|
+
let included_maf_records = Arc::clone(&included_maf_records);
|
|
706
|
+
let included_cnv_records = Arc::clone(&included_cnv_records);
|
|
707
|
+
let hyper_mutator_records = Arc::clone(&hyper_mutator_records);
|
|
633
708
|
let errors = Arc::clone(&errors);
|
|
634
709
|
|
|
635
710
|
async move {
|
|
@@ -642,6 +717,7 @@ async fn download_data_streaming(
|
|
|
642
717
|
&data_type,
|
|
643
718
|
min_total_depth,
|
|
644
719
|
min_alt_allele_count,
|
|
720
|
+
hyper_mutator,
|
|
645
721
|
&consequences,
|
|
646
722
|
gain_threshold,
|
|
647
723
|
loss_threshold,
|
|
@@ -649,6 +725,9 @@ async fn download_data_streaming(
|
|
|
649
725
|
&filtered_records,
|
|
650
726
|
&filtered_maf_records,
|
|
651
727
|
&filtered_cnv_records,
|
|
728
|
+
&included_maf_records,
|
|
729
|
+
&included_cnv_records,
|
|
730
|
+
&hyper_mutator_records,
|
|
652
731
|
)
|
|
653
732
|
.await
|
|
654
733
|
{
|
|
@@ -717,6 +796,8 @@ async fn download_data_streaming(
|
|
|
717
796
|
let failed_count = failed_downloads.load(Ordering::Relaxed);
|
|
718
797
|
let filtered_maf_count = filtered_maf_records.load(Ordering::Relaxed);
|
|
719
798
|
let filtered_cnv_count = filtered_cnv_records.load(Ordering::Relaxed);
|
|
799
|
+
let included_maf_count = included_maf_records.load(Ordering::Relaxed);
|
|
800
|
+
let included_cnv_count = included_cnv_records.load(Ordering::Relaxed);
|
|
720
801
|
|
|
721
802
|
let summary = FinalSummary {
|
|
722
803
|
output_type: "summary".to_string(),
|
|
@@ -728,6 +809,9 @@ async fn download_data_streaming(
|
|
|
728
809
|
filtered_maf_records: filtered_maf_count,
|
|
729
810
|
filtered_cnv_records: filtered_cnv_count,
|
|
730
811
|
filtered_records_by_case: filtered_records.lock().await.clone(),
|
|
812
|
+
included_maf_records: included_maf_count,
|
|
813
|
+
included_cnv_records: included_cnv_count,
|
|
814
|
+
hyper_mutator_records: hyper_mutator_records.lock().await.clone(),
|
|
731
815
|
};
|
|
732
816
|
|
|
733
817
|
// Output final summary - Node.js will know processing is complete when it sees this
|
|
@@ -781,13 +865,14 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
781
865
|
let case_files = input_js.case_files;
|
|
782
866
|
|
|
783
867
|
// Set default maf_options
|
|
784
|
-
let (min_total_depth, min_alt_allele_count, consequences) = match input_js.maf_options {
|
|
868
|
+
let (min_total_depth, min_alt_allele_count, hyper_mutator, consequences) = match input_js.maf_options {
|
|
785
869
|
Some(options) => (
|
|
786
870
|
options.min_total_depth,
|
|
787
871
|
options.min_alt_allele_count,
|
|
872
|
+
options.hyper_mutator,
|
|
788
873
|
options.consequences.clone(),
|
|
789
874
|
),
|
|
790
|
-
None => (10, 2, None), // Default values
|
|
875
|
+
None => (10, 2, 8000, None), // Default values
|
|
791
876
|
};
|
|
792
877
|
|
|
793
878
|
// Set default cnv_options
|
|
@@ -802,6 +887,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
802
887
|
HOST,
|
|
803
888
|
min_total_depth,
|
|
804
889
|
min_alt_allele_count,
|
|
890
|
+
hyper_mutator,
|
|
805
891
|
&consequences,
|
|
806
892
|
gain_threshold,
|
|
807
893
|
loss_threshold,
|