@sjcrh/proteinpaint-rust 2.133.0 → 2.135.2-0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/gdcGRIN2.rs +543 -204
package/package.json
CHANGED
package/src/gdcGRIN2.rs
CHANGED
|
@@ -1,19 +1,34 @@
|
|
|
1
1
|
/*
|
|
2
|
-
This script
|
|
2
|
+
This script can either download cohort maf/cnv files from GDC or read them from local files, with default behavior being to download from GDC. It gracefully handles timeout and other possible errors related to GDC API processing or file reading for use by the client file summary div.
|
|
3
3
|
|
|
4
4
|
Key improvements:
|
|
5
5
|
1. Graceful error handling - individual file failures don't stop the entire process
|
|
6
6
|
2. Better timeout handling with retries
|
|
7
7
|
3. More detailed error reporting
|
|
8
8
|
4. Continues processing even when some files fail
|
|
9
|
+
5. Added chromosome filtering
|
|
10
|
+
6. Supports reading from local files with --from-file flag
|
|
11
|
+
|
|
12
|
+
Command-line arguments:
|
|
13
|
+
- --from-file: Read data from local files instead of downloading from GDC
|
|
9
14
|
|
|
10
15
|
Input JSON:
|
|
11
16
|
caseFiles
|
|
12
17
|
mafOptions: For SNVindel filtering
|
|
18
|
+
cnvOptions: For CNV filtering
|
|
19
|
+
chromosomes: array of chromosomes to include (an empty array disables chromosome filtering)
|
|
20
|
+
|
|
13
21
|
Output a JSON object; grin2lesion holds the mutation records serialized as a JSON array string:
|
|
22
|
+
{
|
|
23
|
+
grin2lesion:str,
|
|
24
|
+
summary:{}
|
|
25
|
+
}
|
|
14
26
|
|
|
15
27
|
Example of usage:
|
|
16
|
-
echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth":
|
|
28
|
+
echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 10,"minAltAlleleCount": 2,"hyperMutator":8000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-0.4, "gainThreshold": 0.3, "segLength":2000000, "hyperMutator":500}, "chromosomes":["chr1","chr2","chr3"], "max_record": 100000}' | ./target/release/gdcGRIN2
|
|
29
|
+
Example of usage (read from local files):
|
|
30
|
+
echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 10,"minAltAlleleCount": 2,"hyperMutator":8000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-0.4, "gainThreshold": 0.3, "segLength":2000000, "hyperMutator":500}, "chromosomes":["chr1","chr2","chr3"], "max_record": 100000}' | ./target/release/gdcGRIN2 --from-file
|
|
31
|
+
|
|
17
32
|
*/
|
|
18
33
|
|
|
19
34
|
use flate2::read::GzDecoder;
|
|
@@ -21,11 +36,12 @@ use futures::StreamExt;
|
|
|
21
36
|
use memchr::memchr;
|
|
22
37
|
use serde::{Deserialize, Serialize};
|
|
23
38
|
use serde_json;
|
|
24
|
-
use std::collections::HashMap;
|
|
39
|
+
use std::collections::{HashMap, HashSet};
|
|
40
|
+
use std::env;
|
|
41
|
+
use std::fs;
|
|
25
42
|
use std::io::{self, Read};
|
|
26
43
|
use std::sync::Arc;
|
|
27
44
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
28
|
-
use std::thread::sleep;
|
|
29
45
|
use std::time::Duration;
|
|
30
46
|
use tokio::io::{AsyncReadExt, BufReader};
|
|
31
47
|
use tokio::sync::Mutex;
|
|
@@ -73,16 +89,6 @@ struct CnvOptions {
|
|
|
73
89
|
hyper_mutator: i32,
|
|
74
90
|
}
|
|
75
91
|
|
|
76
|
-
// Individual successful file output (JSONL format)
|
|
77
|
-
#[derive(serde::Serialize)]
|
|
78
|
-
struct SuccessfulFileOutput {
|
|
79
|
-
#[serde(rename = "type")]
|
|
80
|
-
output_type: String, // Always "data"
|
|
81
|
-
case_id: String,
|
|
82
|
-
data_type: String,
|
|
83
|
-
data: Vec<Vec<String>>,
|
|
84
|
-
}
|
|
85
|
-
|
|
86
92
|
// struct for MAF filter details
|
|
87
93
|
#[derive(Clone, Serialize, Default)]
|
|
88
94
|
struct FilteredMafDetails {
|
|
@@ -96,6 +102,7 @@ struct FilteredMafDetails {
|
|
|
96
102
|
excluded_by_consequence_type: usize,
|
|
97
103
|
total_processed: usize,
|
|
98
104
|
total_included: usize,
|
|
105
|
+
skipped_chromosomes: HashMap<String, usize>,
|
|
99
106
|
}
|
|
100
107
|
|
|
101
108
|
// struct for CNV filter details
|
|
@@ -109,6 +116,7 @@ struct FilteredCnvDetails {
|
|
|
109
116
|
excluded_by_segment_length: usize,
|
|
110
117
|
total_processed: usize,
|
|
111
118
|
total_included: usize,
|
|
119
|
+
skipped_chromosomes: HashMap<String, usize>,
|
|
112
120
|
}
|
|
113
121
|
|
|
114
122
|
// struct for per-case filter details
|
|
@@ -121,8 +129,6 @@ struct FilteredCaseDetails {
|
|
|
121
129
|
// Final summary output (JSONL format)
|
|
122
130
|
#[derive(serde::Serialize)]
|
|
123
131
|
struct FinalSummary {
|
|
124
|
-
#[serde(rename = "type")]
|
|
125
|
-
output_type: String, // Always "summary"
|
|
126
132
|
total_files: usize,
|
|
127
133
|
successful_files: usize,
|
|
128
134
|
failed_files: usize,
|
|
@@ -134,6 +140,14 @@ struct FinalSummary {
|
|
|
134
140
|
included_cnv_records: usize,
|
|
135
141
|
filtered_records_by_case: HashMap<String, FilteredCaseDetails>,
|
|
136
142
|
hyper_mutator_records: HashMap<String, Vec<String>>,
|
|
143
|
+
excluded_by_max_record: HashMap<String, Vec<String>>,
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
// Top-level output struct: the serialized lesion records (grin2lesion) plus the FinalSummary
|
|
147
|
+
#[derive(Serialize)]
|
|
148
|
+
struct Output {
|
|
149
|
+
grin2lesion: String,
|
|
150
|
+
summary: FinalSummary,
|
|
137
151
|
}
|
|
138
152
|
|
|
139
153
|
// Define the top-level input structure
|
|
@@ -145,6 +159,8 @@ struct InputData {
|
|
|
145
159
|
maf_options: Option<MafOptions>,
|
|
146
160
|
#[serde(rename = "cnvOptions")]
|
|
147
161
|
cnv_options: Option<CnvOptions>,
|
|
162
|
+
chromosomes: Vec<String>,
|
|
163
|
+
max_record: usize,
|
|
148
164
|
}
|
|
149
165
|
|
|
150
166
|
// Configuration for different data types
|
|
@@ -167,6 +183,7 @@ async fn parse_content(
|
|
|
167
183
|
loss_threshold: f32,
|
|
168
184
|
seg_length: i32,
|
|
169
185
|
cnv_hyper_mutator: i32,
|
|
186
|
+
chromosomes: &HashSet<String>,
|
|
170
187
|
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
171
188
|
filtered_maf_records: &AtomicUsize,
|
|
172
189
|
filtered_cnv_records: &AtomicUsize,
|
|
@@ -219,9 +236,6 @@ async fn parse_content(
|
|
|
219
236
|
let mut parsed_data = Vec::new();
|
|
220
237
|
let mut columns_indices: Vec<usize> = Vec::new();
|
|
221
238
|
let mut variant_classification_index: Option<usize> = None;
|
|
222
|
-
//let mut header_mk: &str = "";
|
|
223
|
-
//let mut columns = Vec::new();
|
|
224
|
-
|
|
225
239
|
let mut header: Vec<String> = Vec::new();
|
|
226
240
|
|
|
227
241
|
for line in lines {
|
|
@@ -243,26 +257,49 @@ async fn parse_content(
|
|
|
243
257
|
continue;
|
|
244
258
|
};
|
|
245
259
|
|
|
246
|
-
let row =
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
260
|
+
let row = match data_type {
|
|
261
|
+
"maf" => {
|
|
262
|
+
process_mafline(
|
|
263
|
+
line,
|
|
264
|
+
case_id,
|
|
265
|
+
data_type,
|
|
266
|
+
&columns_indices,
|
|
267
|
+
variant_classification_index,
|
|
268
|
+
consequences,
|
|
269
|
+
min_total_depth,
|
|
270
|
+
min_alt_allele_count,
|
|
271
|
+
chromosomes,
|
|
272
|
+
filtered_records,
|
|
273
|
+
filtered_maf_records,
|
|
274
|
+
included_maf_records,
|
|
275
|
+
)
|
|
276
|
+
.await
|
|
277
|
+
}
|
|
278
|
+
"cnv" => {
|
|
279
|
+
process_cnvline(
|
|
280
|
+
line,
|
|
281
|
+
case_id,
|
|
282
|
+
data_type,
|
|
283
|
+
&header,
|
|
284
|
+
&columns_indices,
|
|
285
|
+
gain_threshold,
|
|
286
|
+
loss_threshold,
|
|
287
|
+
seg_length,
|
|
288
|
+
chromosomes,
|
|
289
|
+
filtered_records,
|
|
290
|
+
filtered_cnv_records,
|
|
291
|
+
included_cnv_records,
|
|
292
|
+
)
|
|
293
|
+
.await
|
|
294
|
+
}
|
|
295
|
+
_ => {
|
|
296
|
+
return Err((
|
|
297
|
+
case_id.to_string(),
|
|
298
|
+
data_type.to_string(),
|
|
299
|
+
"Invalid data type".to_string(),
|
|
300
|
+
));
|
|
301
|
+
}
|
|
302
|
+
}?;
|
|
266
303
|
|
|
267
304
|
if let Some(out_lst) = row {
|
|
268
305
|
parsed_data.push(out_lst);
|
|
@@ -316,25 +353,20 @@ fn setup_columns(
|
|
|
316
353
|
Ok(())
|
|
317
354
|
}
|
|
318
355
|
|
|
319
|
-
// Process a single row of
|
|
320
|
-
async fn
|
|
356
|
+
// Process a single row of MAF file
|
|
357
|
+
async fn process_mafline(
|
|
321
358
|
line: &str,
|
|
322
359
|
case_id: &str,
|
|
323
360
|
data_type: &str,
|
|
324
|
-
header: &[String],
|
|
325
361
|
columns_indices: &[usize],
|
|
326
362
|
variant_classification_index: Option<usize>,
|
|
327
363
|
consequences: &Option<Vec<String>>,
|
|
328
364
|
min_total_depth: i32,
|
|
329
365
|
min_alt_allele_count: i32,
|
|
330
|
-
|
|
331
|
-
loss_threshold: f32,
|
|
332
|
-
seg_length: i32,
|
|
366
|
+
chromosomes: &HashSet<String>,
|
|
333
367
|
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
334
368
|
filtered_maf_records: &AtomicUsize,
|
|
335
|
-
filtered_cnv_records: &AtomicUsize,
|
|
336
369
|
included_maf_records: &AtomicUsize,
|
|
337
|
-
included_cnv_records: &AtomicUsize,
|
|
338
370
|
) -> Result<Option<Vec<String>>, (String, String, String)> {
|
|
339
371
|
let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
340
372
|
let mut out_lst = vec![case_id.to_string()];
|
|
@@ -347,51 +379,38 @@ async fn process_row(
|
|
|
347
379
|
maf: FilteredMafDetails::default(),
|
|
348
380
|
cnv: FilteredCnvDetails::default(),
|
|
349
381
|
});
|
|
350
|
-
|
|
351
382
|
let case_details = filtered_map.get_mut(case_id).unwrap();
|
|
352
383
|
|
|
353
384
|
// Track total processed records
|
|
354
|
-
|
|
355
|
-
case_details.maf.total_processed += 1;
|
|
356
|
-
} else if data_type == "cnv" {
|
|
357
|
-
case_details.cnv.total_processed += 1;
|
|
358
|
-
}
|
|
385
|
+
case_details.maf.total_processed += 1;
|
|
359
386
|
|
|
360
387
|
// Handle consequence filtering and counting for MAF files
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
*case_details
|
|
370
|
-
.maf
|
|
371
|
-
.matched_consequences
|
|
372
|
-
.entry(variant_classification.to_string())
|
|
373
|
-
.or_insert(0) += 1;
|
|
374
|
-
} else {
|
|
375
|
-
// Unmatched consequence
|
|
376
|
-
*case_details
|
|
377
|
-
.maf
|
|
378
|
-
.rejected_consequences
|
|
379
|
-
.entry(variant_classification.to_string())
|
|
380
|
-
.or_insert(0) += 1;
|
|
381
|
-
case_details.maf.excluded_by_consequence_type += 1;
|
|
382
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
383
|
-
return Ok(None);
|
|
384
|
-
}
|
|
385
|
-
} else {
|
|
386
|
-
// Empty filter, count as matched
|
|
388
|
+
|
|
389
|
+
if let Some(var_class_idx) = variant_classification_index {
|
|
390
|
+
if var_class_idx < cont_lst.len() {
|
|
391
|
+
let variant_classification = &cont_lst[var_class_idx];
|
|
392
|
+
if let Some(consequence_filter) = consequences {
|
|
393
|
+
if !consequence_filter.is_empty() {
|
|
394
|
+
if consequence_filter.contains(variant_classification) {
|
|
395
|
+
// Matched consequence
|
|
387
396
|
*case_details
|
|
388
397
|
.maf
|
|
389
398
|
.matched_consequences
|
|
390
399
|
.entry(variant_classification.to_string())
|
|
391
400
|
.or_insert(0) += 1;
|
|
401
|
+
} else {
|
|
402
|
+
// Unmatched consequence
|
|
403
|
+
*case_details
|
|
404
|
+
.maf
|
|
405
|
+
.rejected_consequences
|
|
406
|
+
.entry(variant_classification.to_string())
|
|
407
|
+
.or_insert(0) += 1;
|
|
408
|
+
case_details.maf.excluded_by_consequence_type += 1;
|
|
409
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
410
|
+
return Ok(None);
|
|
392
411
|
}
|
|
393
412
|
} else {
|
|
394
|
-
//
|
|
413
|
+
// Empty filter, count as matched
|
|
395
414
|
*case_details
|
|
396
415
|
.maf
|
|
397
416
|
.matched_consequences
|
|
@@ -399,32 +418,142 @@ async fn process_row(
|
|
|
399
418
|
.or_insert(0) += 1;
|
|
400
419
|
}
|
|
401
420
|
} else {
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
421
|
+
// No filter, count as matched
|
|
422
|
+
*case_details
|
|
423
|
+
.maf
|
|
424
|
+
.matched_consequences
|
|
425
|
+
.entry(variant_classification.to_string())
|
|
426
|
+
.or_insert(0) += 1;
|
|
405
427
|
}
|
|
406
428
|
} else {
|
|
407
429
|
case_details.maf.invalid_rows += 1;
|
|
408
430
|
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
409
431
|
return Ok(None);
|
|
410
432
|
}
|
|
433
|
+
} else {
|
|
434
|
+
case_details.maf.invalid_rows += 1;
|
|
435
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
436
|
+
return Ok(None);
|
|
411
437
|
}
|
|
412
438
|
|
|
413
439
|
// Extract relevant columns
|
|
414
440
|
for &x in columns_indices {
|
|
415
441
|
if x >= cont_lst.len() {
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
419
|
-
} else if data_type == "cnv" {
|
|
420
|
-
case_details.cnv.invalid_rows += 1;
|
|
421
|
-
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
422
|
-
}
|
|
442
|
+
case_details.maf.invalid_rows += 1;
|
|
443
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
423
444
|
return Ok(None); // Invalid row
|
|
424
445
|
}
|
|
446
|
+
let element = cont_lst[x].to_string();
|
|
447
|
+
out_lst.push(element);
|
|
448
|
+
}
|
|
449
|
+
|
|
450
|
+
// Additional MAF-specific processing
|
|
451
|
+
if out_lst.len() < 6 {
|
|
452
|
+
case_details.maf.invalid_rows += 1;
|
|
453
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
454
|
+
return Ok(None); // Not enough columns
|
|
455
|
+
}
|
|
456
|
+
|
|
457
|
+
let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
|
|
458
|
+
case_details.maf.invalid_rows += 1;
|
|
459
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
460
|
+
(
|
|
461
|
+
case_id.to_string(),
|
|
462
|
+
data_type.to_string(),
|
|
463
|
+
"Failed to convert t_depth to integer.".to_string(),
|
|
464
|
+
)
|
|
465
|
+
})?;
|
|
466
|
+
|
|
467
|
+
let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
|
|
468
|
+
case_details.maf.invalid_rows += 1;
|
|
469
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
470
|
+
(
|
|
471
|
+
case_id.to_string(),
|
|
472
|
+
data_type.to_string(),
|
|
473
|
+
"Failed to convert t_alt_count to integer.".to_string(),
|
|
474
|
+
)
|
|
475
|
+
})?;
|
|
476
|
+
|
|
477
|
+
if alle_depth < min_total_depth {
|
|
478
|
+
case_details.maf.t_depth += 1;
|
|
479
|
+
case_details.maf.excluded_by_min_depth += 1;
|
|
480
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
481
|
+
return Ok(None);
|
|
482
|
+
}
|
|
483
|
+
if alt_count < min_alt_allele_count {
|
|
484
|
+
case_details.maf.t_alt_count += 1;
|
|
485
|
+
case_details.maf.excluded_by_min_alt_count += 1;
|
|
486
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
487
|
+
return Ok(None);
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
// Keep case_id, chr, start, end, and add "mutation"
|
|
491
|
+
out_lst = out_lst[0..4].to_vec();
|
|
492
|
+
out_lst.push("mutation".to_string());
|
|
493
|
+
|
|
494
|
+
// Add the 'chr' prefix to the chromosome name if it does not already start with 'chr'
|
|
495
|
+
if !out_lst[1].starts_with("chr") {
|
|
496
|
+
out_lst[1] = format!("chr{}", out_lst[1]);
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
// Chromosome filtering
|
|
500
|
+
if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
|
|
501
|
+
*case_details
|
|
502
|
+
.maf
|
|
503
|
+
.skipped_chromosomes
|
|
504
|
+
.entry(out_lst[1].clone())
|
|
505
|
+
.or_insert(0) += 1;
|
|
506
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
507
|
+
return Ok(None);
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
// Update counters for included MAF records
|
|
511
|
+
case_details.maf.total_included += 1;
|
|
512
|
+
included_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
425
513
|
|
|
514
|
+
Ok(Some(out_lst))
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
// Process a single row of CNV file
|
|
518
|
+
async fn process_cnvline(
|
|
519
|
+
line: &str,
|
|
520
|
+
case_id: &str,
|
|
521
|
+
data_type: &str,
|
|
522
|
+
header: &[String],
|
|
523
|
+
columns_indices: &[usize],
|
|
524
|
+
gain_threshold: f32,
|
|
525
|
+
loss_threshold: f32,
|
|
526
|
+
seg_length: i32,
|
|
527
|
+
chromosomes: &HashSet<String>,
|
|
528
|
+
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
529
|
+
filtered_cnv_records: &AtomicUsize,
|
|
530
|
+
included_cnv_records: &AtomicUsize,
|
|
531
|
+
) -> Result<Option<Vec<String>>, (String, String, String)> {
|
|
532
|
+
let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
533
|
+
let mut out_lst = vec![case_id.to_string()];
|
|
534
|
+
|
|
535
|
+
// Initialize or update case details
|
|
536
|
+
let mut filtered_map = filtered_records.lock().await;
|
|
537
|
+
filtered_map
|
|
538
|
+
.entry(case_id.to_string())
|
|
539
|
+
.or_insert_with(|| FilteredCaseDetails {
|
|
540
|
+
maf: FilteredMafDetails::default(),
|
|
541
|
+
cnv: FilteredCnvDetails::default(),
|
|
542
|
+
});
|
|
543
|
+
let case_details = filtered_map.get_mut(case_id).unwrap();
|
|
544
|
+
|
|
545
|
+
// Track total processed records
|
|
546
|
+
case_details.cnv.total_processed += 1;
|
|
547
|
+
|
|
548
|
+
// Extract relevant columns
|
|
549
|
+
for &x in columns_indices {
|
|
550
|
+
if x >= cont_lst.len() {
|
|
551
|
+
case_details.cnv.invalid_rows += 1;
|
|
552
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
553
|
+
return Ok(None); // Invalid row
|
|
554
|
+
}
|
|
426
555
|
let mut element = cont_lst[x].to_string();
|
|
427
|
-
if
|
|
556
|
+
if header[x] == "Segment_Mean" {
|
|
428
557
|
element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
|
|
429
558
|
if element.is_empty() {
|
|
430
559
|
case_details.cnv.segment_mean += 1;
|
|
@@ -444,89 +573,55 @@ async fn process_row(
|
|
|
444
573
|
out_lst.push(element);
|
|
445
574
|
}
|
|
446
575
|
|
|
447
|
-
//
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
(
|
|
459
|
-
case_id.to_string(),
|
|
460
|
-
data_type.to_string(),
|
|
461
|
-
"Failed to convert t_depth to integer.".to_string(),
|
|
462
|
-
)
|
|
463
|
-
})?;
|
|
464
|
-
|
|
465
|
-
let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
|
|
466
|
-
case_details.maf.invalid_rows += 1;
|
|
467
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
468
|
-
(
|
|
469
|
-
case_id.to_string(),
|
|
470
|
-
data_type.to_string(),
|
|
471
|
-
"Failed to convert t_alt_count to integer.".to_string(),
|
|
472
|
-
)
|
|
473
|
-
})?;
|
|
474
|
-
|
|
475
|
-
if alle_depth < min_total_depth {
|
|
476
|
-
case_details.maf.t_depth += 1;
|
|
477
|
-
case_details.maf.excluded_by_min_depth += 1;
|
|
478
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
479
|
-
return Ok(None);
|
|
480
|
-
}
|
|
481
|
-
if alt_count < min_alt_allele_count {
|
|
482
|
-
case_details.maf.t_alt_count += 1;
|
|
483
|
-
case_details.maf.excluded_by_min_alt_count += 1;
|
|
484
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
485
|
-
return Ok(None);
|
|
486
|
-
}
|
|
487
|
-
|
|
488
|
-
// Keep case_id, chr, start, end, and add "mutation"
|
|
489
|
-
out_lst = out_lst[0..4].to_vec();
|
|
490
|
-
out_lst.push("mutation".to_string());
|
|
576
|
+
// filter cnvs based on segment length. Default: 0 (no filtering)
|
|
577
|
+
// calculate segment length (End_Position - Start_Position)
|
|
578
|
+
let end_position = out_lst[3].parse::<i32>().map_err(|_| {
|
|
579
|
+
case_details.cnv.invalid_rows += 1;
|
|
580
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
581
|
+
(
|
|
582
|
+
case_id.to_string(),
|
|
583
|
+
data_type.to_string(),
|
|
584
|
+
"Failed to convert End Position of cnv to integer.".to_string(),
|
|
585
|
+
)
|
|
586
|
+
})?;
|
|
491
587
|
|
|
492
|
-
|
|
493
|
-
case_details.
|
|
494
|
-
|
|
588
|
+
let start_position = out_lst[2].parse::<i32>().map_err(|_| {
|
|
589
|
+
case_details.cnv.invalid_rows += 1;
|
|
590
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
591
|
+
(
|
|
592
|
+
case_id.to_string(),
|
|
593
|
+
data_type.to_string(),
|
|
594
|
+
"Failed to convert Start Position of cnv to integer.".to_string(),
|
|
595
|
+
)
|
|
596
|
+
})?;
|
|
597
|
+
let cnv_length = end_position - start_position;
|
|
598
|
+
if seg_length > 0 && cnv_length > seg_length {
|
|
599
|
+
case_details.cnv.seg_length += 1;
|
|
600
|
+
case_details.cnv.excluded_by_segment_length += 1;
|
|
601
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
602
|
+
return Ok(None);
|
|
495
603
|
}
|
|
496
604
|
|
|
497
|
-
//
|
|
498
|
-
if
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
case_details.cnv.invalid_rows += 1;
|
|
502
|
-
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
503
|
-
(
|
|
504
|
-
case_id.to_string(),
|
|
505
|
-
data_type.to_string(),
|
|
506
|
-
"Failed to convert End Position of cnv to integer.".to_string(),
|
|
507
|
-
)
|
|
508
|
-
})?;
|
|
605
|
+
// Add the 'chr' prefix to the chromosome name if it does not already start with 'chr'
|
|
606
|
+
if !out_lst[1].starts_with("chr") {
|
|
607
|
+
out_lst[1] = format!("chr{}", out_lst[1]);
|
|
608
|
+
}
|
|
509
609
|
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
let cnv_length = end_position - start_position;
|
|
520
|
-
if seg_length > 0 && cnv_length > seg_length {
|
|
521
|
-
case_details.cnv.seg_length += 1;
|
|
522
|
-
case_details.cnv.excluded_by_segment_length += 1;
|
|
523
|
-
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
524
|
-
return Ok(None);
|
|
525
|
-
}
|
|
526
|
-
case_details.cnv.total_included += 1;
|
|
527
|
-
included_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
610
|
+
// Chromosome filtering
|
|
611
|
+
if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
|
|
612
|
+
*case_details
|
|
613
|
+
.cnv
|
|
614
|
+
.skipped_chromosomes
|
|
615
|
+
.entry(out_lst[1].clone())
|
|
616
|
+
.or_insert(0) += 1;
|
|
617
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
618
|
+
return Ok(None);
|
|
528
619
|
}
|
|
529
620
|
|
|
621
|
+
// Update counters for included CNV records
|
|
622
|
+
case_details.cnv.total_included += 1;
|
|
623
|
+
included_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
624
|
+
|
|
530
625
|
Ok(Some(out_lst))
|
|
531
626
|
}
|
|
532
627
|
|
|
@@ -658,10 +753,9 @@ async fn download_single_file(
|
|
|
658
753
|
))
|
|
659
754
|
}
|
|
660
755
|
|
|
661
|
-
///
|
|
756
|
+
/// Downloading from GDC
|
|
662
757
|
/// Outputs JSONL format: one JSON object per line
|
|
663
|
-
|
|
664
|
-
async fn download_data_streaming(
|
|
758
|
+
async fn download_data(
|
|
665
759
|
data4dl: HashMap<String, DataType>,
|
|
666
760
|
host: &str,
|
|
667
761
|
min_total_depth: i32,
|
|
@@ -672,6 +766,8 @@ async fn download_data_streaming(
|
|
|
672
766
|
loss_threshold: f32,
|
|
673
767
|
seg_length: i32,
|
|
674
768
|
cnv_hyper_mutator: i32,
|
|
769
|
+
chromosomes: &HashSet<String>,
|
|
770
|
+
max_record: usize,
|
|
675
771
|
) {
|
|
676
772
|
let data_urls: Vec<(String, String, String)> = data4dl
|
|
677
773
|
.into_iter()
|
|
@@ -696,8 +792,11 @@ async fn download_data_streaming(
|
|
|
696
792
|
let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
|
|
697
793
|
let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
|
|
698
794
|
let hyper_mutator_records = Arc::new(Mutex::new(HashMap::<String, Vec<String>>::new()));
|
|
795
|
+
let excluded_by_max_record = Arc::new(Mutex::new(HashMap::<String, Vec<String>>::new()));
|
|
699
796
|
let included_maf_records = Arc::new(AtomicUsize::new(0));
|
|
700
797
|
let included_cnv_records = Arc::new(AtomicUsize::new(0));
|
|
798
|
+
let all_records = Arc::new(Mutex::new(Vec::<Vec<String>>::new()));
|
|
799
|
+
let data_count = Arc::new(AtomicUsize::new(0));
|
|
701
800
|
|
|
702
801
|
// Collect errors separately; successful records are accumulated in all_records and emitted with the final output
|
|
703
802
|
let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
|
|
@@ -720,9 +819,25 @@ async fn download_data_streaming(
|
|
|
720
819
|
let included_maf_records = Arc::clone(&included_maf_records);
|
|
721
820
|
let included_cnv_records = Arc::clone(&included_cnv_records);
|
|
722
821
|
let hyper_mutator_records = Arc::clone(&hyper_mutator_records);
|
|
822
|
+
let excluded_by_max_record = Arc::clone(&excluded_by_max_record);
|
|
723
823
|
let errors = Arc::clone(&errors);
|
|
824
|
+
let all_records = Arc::clone(&all_records);
|
|
825
|
+
let data_count = Arc::clone(&data_count);
|
|
724
826
|
|
|
725
827
|
async move {
|
|
828
|
+
let current_count = data_count.load(Ordering::Relaxed);
|
|
829
|
+
if current_count >= max_record {
|
|
830
|
+
// Skip processing and mark as excluded by max_record
|
|
831
|
+
if let Ok((case_id, data_type, _)) = download_result {
|
|
832
|
+
let mut exclud_max_record = excluded_by_max_record.lock().await;
|
|
833
|
+
exclud_max_record
|
|
834
|
+
.entry(data_type.to_string())
|
|
835
|
+
.or_insert_with(Vec::new)
|
|
836
|
+
.push(case_id.to_string());
|
|
837
|
+
successful_downloads.fetch_add(1, Ordering::Relaxed);
|
|
838
|
+
}
|
|
839
|
+
return;
|
|
840
|
+
}
|
|
726
841
|
match download_result {
|
|
727
842
|
Ok((case_id, data_type, content)) => {
|
|
728
843
|
// Try to parse the content
|
|
@@ -738,6 +853,7 @@ async fn download_data_streaming(
|
|
|
738
853
|
loss_threshold,
|
|
739
854
|
seg_length,
|
|
740
855
|
cnv_hyper_mutator,
|
|
856
|
+
&chromosomes,
|
|
741
857
|
&filtered_records,
|
|
742
858
|
&filtered_maf_records,
|
|
743
859
|
&filtered_cnv_records,
|
|
@@ -748,24 +864,18 @@ async fn download_data_streaming(
|
|
|
748
864
|
.await
|
|
749
865
|
{
|
|
750
866
|
Ok(parsed_data) => {
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
// Force flush to ensure Node.js sees it immediately
|
|
763
|
-
use std::io::Write;
|
|
764
|
-
let _ = std::io::stdout().flush();
|
|
765
|
-
// Optional: Add small delay to separate lines
|
|
766
|
-
sleep(Duration::from_millis(10));
|
|
867
|
+
let remaining = max_record - current_count;
|
|
868
|
+
if parsed_data.len() <= remaining {
|
|
869
|
+
data_count.fetch_add(parsed_data.len(), Ordering::Relaxed);
|
|
870
|
+
all_records.lock().await.extend(parsed_data);
|
|
871
|
+
} else {
|
|
872
|
+
// Skip file if it would exceed max_record
|
|
873
|
+
let mut exclud_max_record = excluded_by_max_record.lock().await;
|
|
874
|
+
exclud_max_record
|
|
875
|
+
.entry(data_type.to_string())
|
|
876
|
+
.or_insert_with(Vec::new)
|
|
877
|
+
.push(case_id.to_string());
|
|
767
878
|
}
|
|
768
|
-
|
|
769
879
|
successful_downloads.fetch_add(1, Ordering::Relaxed);
|
|
770
880
|
}
|
|
771
881
|
Err((cid, dtp, error)) => {
|
|
@@ -816,7 +926,6 @@ async fn download_data_streaming(
|
|
|
816
926
|
let included_cnv_count = included_cnv_records.load(Ordering::Relaxed);
|
|
817
927
|
|
|
818
928
|
let summary = FinalSummary {
|
|
819
|
-
output_type: "summary".to_string(),
|
|
820
929
|
total_files,
|
|
821
930
|
successful_files: success_count,
|
|
822
931
|
failed_files: failed_count,
|
|
@@ -828,10 +937,214 @@ async fn download_data_streaming(
|
|
|
828
937
|
included_maf_records: included_maf_count,
|
|
829
938
|
included_cnv_records: included_cnv_count,
|
|
830
939
|
hyper_mutator_records: hyper_mutator_records.lock().await.clone(),
|
|
940
|
+
excluded_by_max_record: excluded_by_max_record.lock().await.clone(),
|
|
831
941
|
};
|
|
832
942
|
|
|
943
|
+
let grin2lesion = serde_json::to_string(&all_records.lock().await.drain(..).collect::<Vec<Vec<String>>>())
|
|
944
|
+
.unwrap_or_else(|_| "[]".to_string());
|
|
945
|
+
let output = Output { grin2lesion, summary };
|
|
946
|
+
|
|
833
947
|
// Output final summary - Node.js will know processing is complete when it sees this
|
|
834
|
-
if let Ok(json) = serde_json::to_string(&summary) {
|
|
948
|
+
// if let Ok(json) = serde_json::to_string(&summary) {
|
|
949
|
+
if let Ok(json) = serde_json::to_string(&output) {
|
|
950
|
+
println!("{}", json);
|
|
951
|
+
use std::io::Write;
|
|
952
|
+
let _ = std::io::stdout().flush();
|
|
953
|
+
}
|
|
954
|
+
}
|
|
955
|
+
|
|
956
|
+
/// Read data from local file
|
|
957
|
+
async fn localread_data(
|
|
958
|
+
case_files: HashMap<String, DataType>,
|
|
959
|
+
min_total_depth: i32,
|
|
960
|
+
min_alt_allele_count: i32,
|
|
961
|
+
maf_hyper_mutator: i32,
|
|
962
|
+
consequences: &Option<Vec<String>>,
|
|
963
|
+
gain_threshold: f32,
|
|
964
|
+
loss_threshold: f32,
|
|
965
|
+
seg_length: i32,
|
|
966
|
+
cnv_hyper_mutator: i32,
|
|
967
|
+
chromosomes: &HashSet<String>,
|
|
968
|
+
max_record: usize,
|
|
969
|
+
) {
|
|
970
|
+
let data_files: Vec<(String, String, String)> = case_files
|
|
971
|
+
.into_iter()
|
|
972
|
+
.flat_map(|(case_id, data_types)| {
|
|
973
|
+
let mut files = Vec::new();
|
|
974
|
+
if let Some(cnv_file) = &data_types.cnv {
|
|
975
|
+
files.push((case_id.clone(), "cnv".to_string(), cnv_file.clone()));
|
|
976
|
+
}
|
|
977
|
+
if let Some(maf_file) = &data_types.maf {
|
|
978
|
+
files.push((case_id.clone(), "maf".to_string(), maf_file.clone()));
|
|
979
|
+
}
|
|
980
|
+
files
|
|
981
|
+
})
|
|
982
|
+
.collect();
|
|
983
|
+
let total_files = data_files.len();
|
|
984
|
+
|
|
985
|
+
// Counters for final summary
|
|
986
|
+
let successful_reads = Arc::new(AtomicUsize::new(0));
|
|
987
|
+
let failed_reads = Arc::new(AtomicUsize::new(0));
|
|
988
|
+
let filtered_maf_records = Arc::new(AtomicUsize::new(0));
|
|
989
|
+
let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
|
|
990
|
+
let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
|
|
991
|
+
let hyper_mutator_records = Arc::new(Mutex::new(HashMap::<String, Vec<String>>::new()));
|
|
992
|
+
let excluded_by_max_record = Arc::new(Mutex::new(HashMap::<String, Vec<String>>::new()));
|
|
993
|
+
let included_maf_records = Arc::new(AtomicUsize::new(0));
|
|
994
|
+
let included_cnv_records = Arc::new(AtomicUsize::new(0));
|
|
995
|
+
let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
|
|
996
|
+
let all_records = Arc::new(Mutex::new(Vec::<Vec<String>>::new()));
|
|
997
|
+
let data_count = Arc::new(AtomicUsize::new(0));
|
|
998
|
+
|
|
999
|
+
// Process files concurrently
|
|
1000
|
+
let read_futures = futures::stream::iter(data_files.into_iter().map(
|
|
1001
|
+
|(case_id, data_type, file_path)| async move {
|
|
1002
|
+
// read the local file
|
|
1003
|
+
match fs::read_to_string(&file_path) {
|
|
1004
|
+
Ok(content) => Ok((case_id, data_type, content)),
|
|
1005
|
+
Err(e) => Err((
|
|
1006
|
+
case_id,
|
|
1007
|
+
data_type,
|
|
1008
|
+
format!("file_read_error: {}", e),
|
|
1009
|
+
1, // Single attempt for local file reading
|
|
1010
|
+
)),
|
|
1011
|
+
}
|
|
1012
|
+
},
|
|
1013
|
+
));
|
|
1014
|
+
|
|
1015
|
+
// Process files and output results
|
|
1016
|
+
read_futures
|
|
1017
|
+
.buffer_unordered(3)
|
|
1018
|
+
.for_each(|read_result| {
|
|
1019
|
+
let successful_reads = Arc::clone(&successful_reads);
|
|
1020
|
+
let failed_reads = Arc::clone(&failed_reads);
|
|
1021
|
+
let filtered_maf_records = Arc::clone(&filtered_maf_records);
|
|
1022
|
+
let filtered_cnv_records = Arc::clone(&filtered_cnv_records);
|
|
1023
|
+
let filtered_records = Arc::clone(&filtered_records);
|
|
1024
|
+
let included_maf_records = Arc::clone(&included_maf_records);
|
|
1025
|
+
let included_cnv_records = Arc::clone(&included_cnv_records);
|
|
1026
|
+
let hyper_mutator_records = Arc::clone(&hyper_mutator_records);
|
|
1027
|
+
let excluded_by_max_record = Arc::clone(&excluded_by_max_record);
|
|
1028
|
+
let errors = Arc::clone(&errors);
|
|
1029
|
+
let all_records = Arc::clone(&all_records);
|
|
1030
|
+
let data_count = Arc::clone(&data_count);
|
|
1031
|
+
|
|
1032
|
+
async move {
|
|
1033
|
+
let current_count = data_count.load(Ordering::Relaxed);
|
|
1034
|
+
if current_count >= max_record {
|
|
1035
|
+
// Skip processing and mark as excluded by max_record
|
|
1036
|
+
if let Ok((case_id, data_type, _)) = read_result {
|
|
1037
|
+
let mut exclud_max_record = excluded_by_max_record.lock().await;
|
|
1038
|
+
exclud_max_record
|
|
1039
|
+
.entry(data_type.to_string())
|
|
1040
|
+
.or_insert_with(Vec::new)
|
|
1041
|
+
.push(case_id.to_string());
|
|
1042
|
+
successful_reads.fetch_add(1, Ordering::Relaxed);
|
|
1043
|
+
}
|
|
1044
|
+
return;
|
|
1045
|
+
}
|
|
1046
|
+
match read_result {
|
|
1047
|
+
Ok((case_id, data_type, content)) => {
|
|
1048
|
+
match parse_content(
|
|
1049
|
+
&content,
|
|
1050
|
+
&case_id,
|
|
1051
|
+
&data_type,
|
|
1052
|
+
min_total_depth,
|
|
1053
|
+
min_alt_allele_count,
|
|
1054
|
+
maf_hyper_mutator,
|
|
1055
|
+
consequences,
|
|
1056
|
+
gain_threshold,
|
|
1057
|
+
loss_threshold,
|
|
1058
|
+
seg_length,
|
|
1059
|
+
cnv_hyper_mutator,
|
|
1060
|
+
chromosomes,
|
|
1061
|
+
&filtered_records,
|
|
1062
|
+
&filtered_maf_records,
|
|
1063
|
+
&filtered_cnv_records,
|
|
1064
|
+
&included_maf_records,
|
|
1065
|
+
&included_cnv_records,
|
|
1066
|
+
&hyper_mutator_records,
|
|
1067
|
+
)
|
|
1068
|
+
.await
|
|
1069
|
+
{
|
|
1070
|
+
Ok(parsed_data) => {
|
|
1071
|
+
let remaining = max_record - current_count;
|
|
1072
|
+
if parsed_data.len() <= remaining {
|
|
1073
|
+
data_count.fetch_add(parsed_data.len(), Ordering::Relaxed);
|
|
1074
|
+
all_records.lock().await.extend(parsed_data);
|
|
1075
|
+
} else {
|
|
1076
|
+
// Skip file if it would exceed max_record
|
|
1077
|
+
let mut exclud_max_record = excluded_by_max_record.lock().await;
|
|
1078
|
+
exclud_max_record
|
|
1079
|
+
.entry(data_type.to_string())
|
|
1080
|
+
.or_insert_with(Vec::new)
|
|
1081
|
+
.push(case_id.to_string());
|
|
1082
|
+
}
|
|
1083
|
+
successful_reads.fetch_add(1, Ordering::Relaxed);
|
|
1084
|
+
}
|
|
1085
|
+
Err((cid, dtp, error)) => {
|
|
1086
|
+
failed_reads.fetch_add(1, Ordering::Relaxed);
|
|
1087
|
+
let error = ErrorEntry {
|
|
1088
|
+
case_id: cid,
|
|
1089
|
+
data_type: dtp,
|
|
1090
|
+
error_type: "parsing_error".to_string(),
|
|
1091
|
+
error_details: error,
|
|
1092
|
+
attempts_made: 1,
|
|
1093
|
+
};
|
|
1094
|
+
errors.lock().await.push(error);
|
|
1095
|
+
}
|
|
1096
|
+
}
|
|
1097
|
+
}
|
|
1098
|
+
Err((case_id, data_type, error_details, attempts)) => {
|
|
1099
|
+
failed_reads.fetch_add(1, Ordering::Relaxed);
|
|
1100
|
+
let (error_type, clean_details) = if error_details.contains(":") {
|
|
1101
|
+
let parts: Vec<&str> = error_details.splitn(2, ": ").collect();
|
|
1102
|
+
(parts[0].to_string(), parts[1].to_string())
|
|
1103
|
+
} else {
|
|
1104
|
+
("unknown_error".to_string(), error_details)
|
|
1105
|
+
};
|
|
1106
|
+
let error = ErrorEntry {
|
|
1107
|
+
case_id,
|
|
1108
|
+
data_type,
|
|
1109
|
+
error_type,
|
|
1110
|
+
error_details: clean_details,
|
|
1111
|
+
attempts_made: attempts,
|
|
1112
|
+
};
|
|
1113
|
+
errors.lock().await.push(error);
|
|
1114
|
+
}
|
|
1115
|
+
}
|
|
1116
|
+
}
|
|
1117
|
+
})
|
|
1118
|
+
.await;
|
|
1119
|
+
// Output final summary as the last line
|
|
1120
|
+
let success_count = successful_reads.load(Ordering::Relaxed);
|
|
1121
|
+
let failed_count = failed_reads.load(Ordering::Relaxed);
|
|
1122
|
+
let filtered_maf_count = filtered_maf_records.load(Ordering::Relaxed);
|
|
1123
|
+
let filtered_cnv_count = filtered_cnv_records.load(Ordering::Relaxed);
|
|
1124
|
+
let included_maf_count = included_maf_records.load(Ordering::Relaxed);
|
|
1125
|
+
let included_cnv_count = included_cnv_records.load(Ordering::Relaxed);
|
|
1126
|
+
|
|
1127
|
+
let summary = FinalSummary {
|
|
1128
|
+
total_files,
|
|
1129
|
+
successful_files: success_count,
|
|
1130
|
+
failed_files: failed_count,
|
|
1131
|
+
errors: errors.lock().await.clone(),
|
|
1132
|
+
filtered_records: filtered_maf_count + filtered_cnv_count,
|
|
1133
|
+
filtered_maf_records: filtered_maf_count,
|
|
1134
|
+
filtered_cnv_records: filtered_cnv_count,
|
|
1135
|
+
filtered_records_by_case: filtered_records.lock().await.clone(),
|
|
1136
|
+
included_maf_records: included_maf_count,
|
|
1137
|
+
included_cnv_records: included_cnv_count,
|
|
1138
|
+
hyper_mutator_records: hyper_mutator_records.lock().await.clone(),
|
|
1139
|
+
excluded_by_max_record: excluded_by_max_record.lock().await.clone(),
|
|
1140
|
+
};
|
|
1141
|
+
|
|
1142
|
+
let grin2lesion = serde_json::to_string(&all_records.lock().await.drain(..).collect::<Vec<Vec<String>>>())
|
|
1143
|
+
.unwrap_or_else(|_| "[]".to_string());
|
|
1144
|
+
let output = Output { grin2lesion, summary };
|
|
1145
|
+
|
|
1146
|
+
// Output final JSON object (grin2lesion + summary)
|
|
1147
|
+
if let Ok(json) = serde_json::to_string(&output) {
|
|
835
1148
|
println!("{}", json);
|
|
836
1149
|
use std::io::Write;
|
|
837
1150
|
let _ = std::io::stdout().flush();
|
|
@@ -840,6 +1153,9 @@ async fn download_data_streaming(
|
|
|
840
1153
|
|
|
841
1154
|
#[tokio::main]
|
|
842
1155
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
1156
|
+
let args: Vec<String> = env::args().collect();
|
|
1157
|
+
let from_file = args.contains(&"--from-file".to_string());
|
|
1158
|
+
|
|
843
1159
|
const HOST: &str = "https://api.gdc.cancer.gov/data/";
|
|
844
1160
|
|
|
845
1161
|
// Read input with timeout
|
|
@@ -879,6 +1195,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
879
1195
|
}
|
|
880
1196
|
|
|
881
1197
|
let case_files = input_js.case_files;
|
|
1198
|
+
let max_record: usize = input_js.max_record;
|
|
882
1199
|
|
|
883
1200
|
// Set default maf_options
|
|
884
1201
|
let (min_total_depth, min_alt_allele_count, maf_hyper_mutator, consequences) = match input_js.maf_options {
|
|
@@ -902,20 +1219,42 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
902
1219
|
None => (0.3, -0.4, 0, 500), // Default values
|
|
903
1220
|
};
|
|
904
1221
|
|
|
905
|
-
//
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
1222
|
+
// Convert Vec<String> to HashSet<String> for faster lookup
|
|
1223
|
+
let chromosomes = input_js.chromosomes.into_iter().collect::<HashSet<String>>();
|
|
1224
|
+
|
|
1225
|
+
if from_file {
|
|
1226
|
+
localread_data(
|
|
1227
|
+
case_files,
|
|
1228
|
+
min_total_depth,
|
|
1229
|
+
min_alt_allele_count,
|
|
1230
|
+
maf_hyper_mutator,
|
|
1231
|
+
&consequences,
|
|
1232
|
+
gain_threshold,
|
|
1233
|
+
loss_threshold,
|
|
1234
|
+
seg_length,
|
|
1235
|
+
cnv_hyper_mutator,
|
|
1236
|
+
&chromosomes,
|
|
1237
|
+
max_record,
|
|
1238
|
+
)
|
|
1239
|
+
.await;
|
|
1240
|
+
} else {
|
|
1241
|
+
// Download data from GDC - this will now handle errors gracefully
|
|
1242
|
+
download_data(
|
|
1243
|
+
case_files,
|
|
1244
|
+
HOST,
|
|
1245
|
+
min_total_depth,
|
|
1246
|
+
min_alt_allele_count,
|
|
1247
|
+
maf_hyper_mutator,
|
|
1248
|
+
&consequences,
|
|
1249
|
+
gain_threshold,
|
|
1250
|
+
loss_threshold,
|
|
1251
|
+
seg_length,
|
|
1252
|
+
cnv_hyper_mutator,
|
|
1253
|
+
&chromosomes,
|
|
1254
|
+
max_record,
|
|
1255
|
+
)
|
|
1256
|
+
.await;
|
|
1257
|
+
}
|
|
919
1258
|
|
|
920
1259
|
// Always exit successfully - individual file failures are logged but don't stop the process
|
|
921
1260
|
Ok(())
|