@sjcrh/proteinpaint-rust 2.133.0 → 2.135.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/gdcGRIN2.rs +246 -154
package/package.json
CHANGED
package/src/gdcGRIN2.rs
CHANGED
|
@@ -6,14 +6,18 @@
|
|
|
6
6
|
2. Better timeout handling with retries
|
|
7
7
|
3. More detailed error reporting
|
|
8
8
|
4. Continues processing even when some files fail
|
|
9
|
+
5. Added chromosome filtering
|
|
9
10
|
|
|
10
11
|
Input JSON:
|
|
11
12
|
caseFiles
|
|
12
13
|
mafOptions: For SNVindel filtering
|
|
14
|
+
cnvOptions: For CNV filtering
|
|
15
|
+
chromosomes: List of chromosomes to include (an empty list [] disables chromosome filtering)
|
|
16
|
+
|
|
13
17
|
Output mutations as JSON array.
|
|
14
18
|
|
|
15
19
|
Example of usage:
|
|
16
|
-
echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":8000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000, "hyperMutator":8000}}' | ./target/release/gdcGRIN2
|
|
20
|
+
echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":8000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000, "hyperMutator":8000}, "chromosomes":["chr1","chr2","chr3"]}' | ./target/release/gdcGRIN2
|
|
17
21
|
*/
|
|
18
22
|
|
|
19
23
|
use flate2::read::GzDecoder;
|
|
@@ -21,7 +25,7 @@ use futures::StreamExt;
|
|
|
21
25
|
use memchr::memchr;
|
|
22
26
|
use serde::{Deserialize, Serialize};
|
|
23
27
|
use serde_json;
|
|
24
|
-
use std::collections::HashMap;
|
|
28
|
+
use std::collections::{HashMap, HashSet};
|
|
25
29
|
use std::io::{self, Read};
|
|
26
30
|
use std::sync::Arc;
|
|
27
31
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
@@ -96,6 +100,7 @@ struct FilteredMafDetails {
|
|
|
96
100
|
excluded_by_consequence_type: usize,
|
|
97
101
|
total_processed: usize,
|
|
98
102
|
total_included: usize,
|
|
103
|
+
skipped_chromosomes: HashMap<String, usize>,
|
|
99
104
|
}
|
|
100
105
|
|
|
101
106
|
// struct for CNV filter details
|
|
@@ -109,6 +114,7 @@ struct FilteredCnvDetails {
|
|
|
109
114
|
excluded_by_segment_length: usize,
|
|
110
115
|
total_processed: usize,
|
|
111
116
|
total_included: usize,
|
|
117
|
+
skipped_chromosomes: HashMap<String, usize>,
|
|
112
118
|
}
|
|
113
119
|
|
|
114
120
|
// struct for per-case filter details
|
|
@@ -145,6 +151,7 @@ struct InputData {
|
|
|
145
151
|
maf_options: Option<MafOptions>,
|
|
146
152
|
#[serde(rename = "cnvOptions")]
|
|
147
153
|
cnv_options: Option<CnvOptions>,
|
|
154
|
+
chromosomes: Vec<String>,
|
|
148
155
|
}
|
|
149
156
|
|
|
150
157
|
// Configuration for different data types
|
|
@@ -167,6 +174,7 @@ async fn parse_content(
|
|
|
167
174
|
loss_threshold: f32,
|
|
168
175
|
seg_length: i32,
|
|
169
176
|
cnv_hyper_mutator: i32,
|
|
177
|
+
chromosomes: &HashSet<String>,
|
|
170
178
|
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
171
179
|
filtered_maf_records: &AtomicUsize,
|
|
172
180
|
filtered_cnv_records: &AtomicUsize,
|
|
@@ -219,9 +227,6 @@ async fn parse_content(
|
|
|
219
227
|
let mut parsed_data = Vec::new();
|
|
220
228
|
let mut columns_indices: Vec<usize> = Vec::new();
|
|
221
229
|
let mut variant_classification_index: Option<usize> = None;
|
|
222
|
-
//let mut header_mk: &str = "";
|
|
223
|
-
//let mut columns = Vec::new();
|
|
224
|
-
|
|
225
230
|
let mut header: Vec<String> = Vec::new();
|
|
226
231
|
|
|
227
232
|
for line in lines {
|
|
@@ -243,26 +248,49 @@ async fn parse_content(
|
|
|
243
248
|
continue;
|
|
244
249
|
};
|
|
245
250
|
|
|
246
|
-
let row =
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
251
|
+
let row = match data_type {
|
|
252
|
+
"maf" => {
|
|
253
|
+
process_mafline(
|
|
254
|
+
line,
|
|
255
|
+
case_id,
|
|
256
|
+
data_type,
|
|
257
|
+
&columns_indices,
|
|
258
|
+
variant_classification_index,
|
|
259
|
+
consequences,
|
|
260
|
+
min_total_depth,
|
|
261
|
+
min_alt_allele_count,
|
|
262
|
+
chromosomes,
|
|
263
|
+
filtered_records,
|
|
264
|
+
filtered_maf_records,
|
|
265
|
+
included_maf_records,
|
|
266
|
+
)
|
|
267
|
+
.await
|
|
268
|
+
}
|
|
269
|
+
"cnv" => {
|
|
270
|
+
process_cnvline(
|
|
271
|
+
line,
|
|
272
|
+
case_id,
|
|
273
|
+
data_type,
|
|
274
|
+
&header,
|
|
275
|
+
&columns_indices,
|
|
276
|
+
gain_threshold,
|
|
277
|
+
loss_threshold,
|
|
278
|
+
seg_length,
|
|
279
|
+
chromosomes,
|
|
280
|
+
filtered_records,
|
|
281
|
+
filtered_cnv_records,
|
|
282
|
+
included_cnv_records,
|
|
283
|
+
)
|
|
284
|
+
.await
|
|
285
|
+
}
|
|
286
|
+
_ => {
|
|
287
|
+
return Err((
|
|
288
|
+
case_id.to_string(),
|
|
289
|
+
data_type.to_string(),
|
|
290
|
+
"Invalid data type".to_string(),
|
|
291
|
+
));
|
|
292
|
+
}
|
|
293
|
+
}?;
|
|
266
294
|
|
|
267
295
|
if let Some(out_lst) = row {
|
|
268
296
|
parsed_data.push(out_lst);
|
|
@@ -316,25 +344,20 @@ fn setup_columns(
|
|
|
316
344
|
Ok(())
|
|
317
345
|
}
|
|
318
346
|
|
|
319
|
-
// Process a single row of
|
|
320
|
-
async fn
|
|
347
|
+
// Process a single row of MAF file
|
|
348
|
+
async fn process_mafline(
|
|
321
349
|
line: &str,
|
|
322
350
|
case_id: &str,
|
|
323
351
|
data_type: &str,
|
|
324
|
-
header: &[String],
|
|
325
352
|
columns_indices: &[usize],
|
|
326
353
|
variant_classification_index: Option<usize>,
|
|
327
354
|
consequences: &Option<Vec<String>>,
|
|
328
355
|
min_total_depth: i32,
|
|
329
356
|
min_alt_allele_count: i32,
|
|
330
|
-
|
|
331
|
-
loss_threshold: f32,
|
|
332
|
-
seg_length: i32,
|
|
357
|
+
chromosomes: &HashSet<String>,
|
|
333
358
|
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
334
359
|
filtered_maf_records: &AtomicUsize,
|
|
335
|
-
filtered_cnv_records: &AtomicUsize,
|
|
336
360
|
included_maf_records: &AtomicUsize,
|
|
337
|
-
included_cnv_records: &AtomicUsize,
|
|
338
361
|
) -> Result<Option<Vec<String>>, (String, String, String)> {
|
|
339
362
|
let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
340
363
|
let mut out_lst = vec![case_id.to_string()];
|
|
@@ -347,51 +370,38 @@ async fn process_row(
|
|
|
347
370
|
maf: FilteredMafDetails::default(),
|
|
348
371
|
cnv: FilteredCnvDetails::default(),
|
|
349
372
|
});
|
|
350
|
-
|
|
351
373
|
let case_details = filtered_map.get_mut(case_id).unwrap();
|
|
352
374
|
|
|
353
375
|
// Track total processed records
|
|
354
|
-
|
|
355
|
-
case_details.maf.total_processed += 1;
|
|
356
|
-
} else if data_type == "cnv" {
|
|
357
|
-
case_details.cnv.total_processed += 1;
|
|
358
|
-
}
|
|
376
|
+
case_details.maf.total_processed += 1;
|
|
359
377
|
|
|
360
378
|
// Handle consequence filtering and counting for MAF files
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
*case_details
|
|
370
|
-
.maf
|
|
371
|
-
.matched_consequences
|
|
372
|
-
.entry(variant_classification.to_string())
|
|
373
|
-
.or_insert(0) += 1;
|
|
374
|
-
} else {
|
|
375
|
-
// Unmatched consequence
|
|
376
|
-
*case_details
|
|
377
|
-
.maf
|
|
378
|
-
.rejected_consequences
|
|
379
|
-
.entry(variant_classification.to_string())
|
|
380
|
-
.or_insert(0) += 1;
|
|
381
|
-
case_details.maf.excluded_by_consequence_type += 1;
|
|
382
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
383
|
-
return Ok(None);
|
|
384
|
-
}
|
|
385
|
-
} else {
|
|
386
|
-
// Empty filter, count as matched
|
|
379
|
+
|
|
380
|
+
if let Some(var_class_idx) = variant_classification_index {
|
|
381
|
+
if var_class_idx < cont_lst.len() {
|
|
382
|
+
let variant_classification = &cont_lst[var_class_idx];
|
|
383
|
+
if let Some(consequence_filter) = consequences {
|
|
384
|
+
if !consequence_filter.is_empty() {
|
|
385
|
+
if consequence_filter.contains(variant_classification) {
|
|
386
|
+
// Matched consequence
|
|
387
387
|
*case_details
|
|
388
388
|
.maf
|
|
389
389
|
.matched_consequences
|
|
390
390
|
.entry(variant_classification.to_string())
|
|
391
391
|
.or_insert(0) += 1;
|
|
392
|
+
} else {
|
|
393
|
+
// Unmatched consequence
|
|
394
|
+
*case_details
|
|
395
|
+
.maf
|
|
396
|
+
.rejected_consequences
|
|
397
|
+
.entry(variant_classification.to_string())
|
|
398
|
+
.or_insert(0) += 1;
|
|
399
|
+
case_details.maf.excluded_by_consequence_type += 1;
|
|
400
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
401
|
+
return Ok(None);
|
|
392
402
|
}
|
|
393
403
|
} else {
|
|
394
|
-
//
|
|
404
|
+
// Empty filter, count as matched
|
|
395
405
|
*case_details
|
|
396
406
|
.maf
|
|
397
407
|
.matched_consequences
|
|
@@ -399,32 +409,142 @@ async fn process_row(
|
|
|
399
409
|
.or_insert(0) += 1;
|
|
400
410
|
}
|
|
401
411
|
} else {
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
412
|
+
// No filter, count as matched
|
|
413
|
+
*case_details
|
|
414
|
+
.maf
|
|
415
|
+
.matched_consequences
|
|
416
|
+
.entry(variant_classification.to_string())
|
|
417
|
+
.or_insert(0) += 1;
|
|
405
418
|
}
|
|
406
419
|
} else {
|
|
407
420
|
case_details.maf.invalid_rows += 1;
|
|
408
421
|
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
409
422
|
return Ok(None);
|
|
410
423
|
}
|
|
424
|
+
} else {
|
|
425
|
+
case_details.maf.invalid_rows += 1;
|
|
426
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
427
|
+
return Ok(None);
|
|
411
428
|
}
|
|
412
429
|
|
|
413
430
|
// Extract relevant columns
|
|
414
431
|
for &x in columns_indices {
|
|
415
432
|
if x >= cont_lst.len() {
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
419
|
-
} else if data_type == "cnv" {
|
|
420
|
-
case_details.cnv.invalid_rows += 1;
|
|
421
|
-
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
422
|
-
}
|
|
433
|
+
case_details.maf.invalid_rows += 1;
|
|
434
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
423
435
|
return Ok(None); // Invalid row
|
|
424
436
|
}
|
|
437
|
+
let element = cont_lst[x].to_string();
|
|
438
|
+
out_lst.push(element);
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
// Additional MAF-specific processing
|
|
442
|
+
if out_lst.len() < 6 {
|
|
443
|
+
case_details.maf.invalid_rows += 1;
|
|
444
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
445
|
+
return Ok(None); // Not enough columns
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
|
|
449
|
+
case_details.maf.invalid_rows += 1;
|
|
450
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
451
|
+
(
|
|
452
|
+
case_id.to_string(),
|
|
453
|
+
data_type.to_string(),
|
|
454
|
+
"Failed to convert t_depth to integer.".to_string(),
|
|
455
|
+
)
|
|
456
|
+
})?;
|
|
457
|
+
|
|
458
|
+
let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
|
|
459
|
+
case_details.maf.invalid_rows += 1;
|
|
460
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
461
|
+
(
|
|
462
|
+
case_id.to_string(),
|
|
463
|
+
data_type.to_string(),
|
|
464
|
+
"Failed to convert t_alt_count to integer.".to_string(),
|
|
465
|
+
)
|
|
466
|
+
})?;
|
|
467
|
+
|
|
468
|
+
if alle_depth < min_total_depth {
|
|
469
|
+
case_details.maf.t_depth += 1;
|
|
470
|
+
case_details.maf.excluded_by_min_depth += 1;
|
|
471
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
472
|
+
return Ok(None);
|
|
473
|
+
}
|
|
474
|
+
if alt_count < min_alt_allele_count {
|
|
475
|
+
case_details.maf.t_alt_count += 1;
|
|
476
|
+
case_details.maf.excluded_by_min_alt_count += 1;
|
|
477
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
478
|
+
return Ok(None);
|
|
479
|
+
}
|
|
425
480
|
|
|
481
|
+
// Keep case_id, chr, start, end, and add "mutation"
|
|
482
|
+
out_lst = out_lst[0..4].to_vec();
|
|
483
|
+
out_lst.push("mutation".to_string());
|
|
484
|
+
|
|
485
|
+
// Add the 'chr' prefix to the chromosome name if it does not already start with 'chr'
|
|
486
|
+
if !out_lst[1].starts_with("chr") {
|
|
487
|
+
out_lst[1] = format!("chr{}", out_lst[1]);
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
// Chromosome filtering
|
|
491
|
+
if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
|
|
492
|
+
*case_details
|
|
493
|
+
.maf
|
|
494
|
+
.skipped_chromosomes
|
|
495
|
+
.entry(out_lst[1].clone())
|
|
496
|
+
.or_insert(0) += 1;
|
|
497
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
498
|
+
return Ok(None);
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
// Update counters for included MAF records
|
|
502
|
+
case_details.maf.total_included += 1;
|
|
503
|
+
included_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
504
|
+
|
|
505
|
+
Ok(Some(out_lst))
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
// Process a single row of CNV file
|
|
509
|
+
async fn process_cnvline(
|
|
510
|
+
line: &str,
|
|
511
|
+
case_id: &str,
|
|
512
|
+
data_type: &str,
|
|
513
|
+
header: &[String],
|
|
514
|
+
columns_indices: &[usize],
|
|
515
|
+
gain_threshold: f32,
|
|
516
|
+
loss_threshold: f32,
|
|
517
|
+
seg_length: i32,
|
|
518
|
+
chromosomes: &HashSet<String>,
|
|
519
|
+
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
520
|
+
filtered_cnv_records: &AtomicUsize,
|
|
521
|
+
included_cnv_records: &AtomicUsize,
|
|
522
|
+
) -> Result<Option<Vec<String>>, (String, String, String)> {
|
|
523
|
+
let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
524
|
+
let mut out_lst = vec![case_id.to_string()];
|
|
525
|
+
|
|
526
|
+
// Initialize or update case details
|
|
527
|
+
let mut filtered_map = filtered_records.lock().await;
|
|
528
|
+
filtered_map
|
|
529
|
+
.entry(case_id.to_string())
|
|
530
|
+
.or_insert_with(|| FilteredCaseDetails {
|
|
531
|
+
maf: FilteredMafDetails::default(),
|
|
532
|
+
cnv: FilteredCnvDetails::default(),
|
|
533
|
+
});
|
|
534
|
+
let case_details = filtered_map.get_mut(case_id).unwrap();
|
|
535
|
+
|
|
536
|
+
// Track total processed records
|
|
537
|
+
case_details.cnv.total_processed += 1;
|
|
538
|
+
|
|
539
|
+
// Extract relevant columns
|
|
540
|
+
for &x in columns_indices {
|
|
541
|
+
if x >= cont_lst.len() {
|
|
542
|
+
case_details.cnv.invalid_rows += 1;
|
|
543
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
544
|
+
return Ok(None); // Invalid row
|
|
545
|
+
}
|
|
426
546
|
let mut element = cont_lst[x].to_string();
|
|
427
|
-
if
|
|
547
|
+
if header[x] == "Segment_Mean" {
|
|
428
548
|
element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
|
|
429
549
|
if element.is_empty() {
|
|
430
550
|
case_details.cnv.segment_mean += 1;
|
|
@@ -444,89 +564,55 @@ async fn process_row(
|
|
|
444
564
|
out_lst.push(element);
|
|
445
565
|
}
|
|
446
566
|
|
|
447
|
-
//
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
(
|
|
459
|
-
case_id.to_string(),
|
|
460
|
-
data_type.to_string(),
|
|
461
|
-
"Failed to convert t_depth to integer.".to_string(),
|
|
462
|
-
)
|
|
463
|
-
})?;
|
|
464
|
-
|
|
465
|
-
let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
|
|
466
|
-
case_details.maf.invalid_rows += 1;
|
|
467
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
468
|
-
(
|
|
469
|
-
case_id.to_string(),
|
|
470
|
-
data_type.to_string(),
|
|
471
|
-
"Failed to convert t_alt_count to integer.".to_string(),
|
|
472
|
-
)
|
|
473
|
-
})?;
|
|
474
|
-
|
|
475
|
-
if alle_depth < min_total_depth {
|
|
476
|
-
case_details.maf.t_depth += 1;
|
|
477
|
-
case_details.maf.excluded_by_min_depth += 1;
|
|
478
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
479
|
-
return Ok(None);
|
|
480
|
-
}
|
|
481
|
-
if alt_count < min_alt_allele_count {
|
|
482
|
-
case_details.maf.t_alt_count += 1;
|
|
483
|
-
case_details.maf.excluded_by_min_alt_count += 1;
|
|
484
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
485
|
-
return Ok(None);
|
|
486
|
-
}
|
|
487
|
-
|
|
488
|
-
// Keep case_id, chr, start, end, and add "mutation"
|
|
489
|
-
out_lst = out_lst[0..4].to_vec();
|
|
490
|
-
out_lst.push("mutation".to_string());
|
|
567
|
+
// filter cnvs based on segment length. Default: 0 (no filtering)
|
|
568
|
+
// calculate segment length (End_Position - Start_Position)
|
|
569
|
+
let end_position = out_lst[3].parse::<i32>().map_err(|_| {
|
|
570
|
+
case_details.cnv.invalid_rows += 1;
|
|
571
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
572
|
+
(
|
|
573
|
+
case_id.to_string(),
|
|
574
|
+
data_type.to_string(),
|
|
575
|
+
"Failed to convert End Position of cnv to integer.".to_string(),
|
|
576
|
+
)
|
|
577
|
+
})?;
|
|
491
578
|
|
|
492
|
-
|
|
493
|
-
case_details.
|
|
494
|
-
|
|
579
|
+
let start_position = out_lst[2].parse::<i32>().map_err(|_| {
|
|
580
|
+
case_details.cnv.invalid_rows += 1;
|
|
581
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
582
|
+
(
|
|
583
|
+
case_id.to_string(),
|
|
584
|
+
data_type.to_string(),
|
|
585
|
+
"Failed to convert Start Position of cnv to integer.".to_string(),
|
|
586
|
+
)
|
|
587
|
+
})?;
|
|
588
|
+
let cnv_length = end_position - start_position;
|
|
589
|
+
if seg_length > 0 && cnv_length > seg_length {
|
|
590
|
+
case_details.cnv.seg_length += 1;
|
|
591
|
+
case_details.cnv.excluded_by_segment_length += 1;
|
|
592
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
593
|
+
return Ok(None);
|
|
495
594
|
}
|
|
496
595
|
|
|
497
|
-
//
|
|
498
|
-
if
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
case_details.cnv.invalid_rows += 1;
|
|
502
|
-
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
503
|
-
(
|
|
504
|
-
case_id.to_string(),
|
|
505
|
-
data_type.to_string(),
|
|
506
|
-
"Failed to convert End Position of cnv to integer.".to_string(),
|
|
507
|
-
)
|
|
508
|
-
})?;
|
|
596
|
+
// Add the 'chr' prefix to the chromosome name if it does not already start with 'chr'
|
|
597
|
+
if !out_lst[1].starts_with("chr") {
|
|
598
|
+
out_lst[1] = format!("chr{}", out_lst[1]);
|
|
599
|
+
}
|
|
509
600
|
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
let cnv_length = end_position - start_position;
|
|
520
|
-
if seg_length > 0 && cnv_length > seg_length {
|
|
521
|
-
case_details.cnv.seg_length += 1;
|
|
522
|
-
case_details.cnv.excluded_by_segment_length += 1;
|
|
523
|
-
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
524
|
-
return Ok(None);
|
|
525
|
-
}
|
|
526
|
-
case_details.cnv.total_included += 1;
|
|
527
|
-
included_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
601
|
+
// Chromosome filtering
|
|
602
|
+
if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
|
|
603
|
+
*case_details
|
|
604
|
+
.cnv
|
|
605
|
+
.skipped_chromosomes
|
|
606
|
+
.entry(out_lst[1].clone())
|
|
607
|
+
.or_insert(0) += 1;
|
|
608
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
609
|
+
return Ok(None);
|
|
528
610
|
}
|
|
529
611
|
|
|
612
|
+
// Update counters for included CNV records
|
|
613
|
+
case_details.cnv.total_included += 1;
|
|
614
|
+
included_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
615
|
+
|
|
530
616
|
Ok(Some(out_lst))
|
|
531
617
|
}
|
|
532
618
|
|
|
@@ -672,6 +758,7 @@ async fn download_data_streaming(
|
|
|
672
758
|
loss_threshold: f32,
|
|
673
759
|
seg_length: i32,
|
|
674
760
|
cnv_hyper_mutator: i32,
|
|
761
|
+
chromosomes: &HashSet<String>,
|
|
675
762
|
) {
|
|
676
763
|
let data_urls: Vec<(String, String, String)> = data4dl
|
|
677
764
|
.into_iter()
|
|
@@ -738,6 +825,7 @@ async fn download_data_streaming(
|
|
|
738
825
|
loss_threshold,
|
|
739
826
|
seg_length,
|
|
740
827
|
cnv_hyper_mutator,
|
|
828
|
+
&chromosomes,
|
|
741
829
|
&filtered_records,
|
|
742
830
|
&filtered_maf_records,
|
|
743
831
|
&filtered_cnv_records,
|
|
@@ -902,6 +990,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
902
990
|
None => (0.3, -0.4, 0, 500), // Default values
|
|
903
991
|
};
|
|
904
992
|
|
|
993
|
+
// Convert Vec<String> to HashSet<String> for faster lookup
|
|
994
|
+
let chromosomes = input_js.chromosomes.into_iter().collect::<HashSet<String>>();
|
|
995
|
+
|
|
905
996
|
// Download data - this will now handle errors gracefully
|
|
906
997
|
download_data_streaming(
|
|
907
998
|
case_files,
|
|
@@ -914,6 +1005,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
914
1005
|
loss_threshold,
|
|
915
1006
|
seg_length,
|
|
916
1007
|
cnv_hyper_mutator,
|
|
1008
|
+
&chromosomes,
|
|
917
1009
|
)
|
|
918
1010
|
.await;
|
|
919
1011
|
|