@sjcrh/proteinpaint-rust 2.133.0 → 2.135.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/package.json +1 -1
  2. package/src/gdcGRIN2.rs +246 -154
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.133.0",
2
+ "version": "2.135.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "type": "module",
5
5
  "description": "Rust-based utilities for proteinpaint",
package/src/gdcGRIN2.rs CHANGED
@@ -6,14 +6,18 @@
6
6
  2. Better timeout handling with retries
7
7
  3. More detailed error reporting
8
8
  4. Continues processing even when some files fail
9
+ 5. Added chromosome filtering
9
10
 
10
11
  Input JSON:
11
12
  caseFiles
12
13
  mafOptions: For SNVindel filtering
14
+ cnvOptions: For CNV filtering
15
+ chromosomes: list of chromosomes to include, e.g. ["chr1","chr2"]; an empty list ([]) disables chromosome filtering
16
+
13
17
  Output mutations as JSON array.
14
18
 
15
19
  Example of usage:
16
- echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":8000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000, "hyperMutator":8000}}' | ./target/release/gdcGRIN2
20
+ echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":8000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000, "hyperMutator":8000}, "chromosomes":["chr1","chr2","chr3"]}' | ./target/release/gdcGRIN2
17
21
  */
18
22
 
19
23
  use flate2::read::GzDecoder;
@@ -21,7 +25,7 @@ use futures::StreamExt;
21
25
  use memchr::memchr;
22
26
  use serde::{Deserialize, Serialize};
23
27
  use serde_json;
24
- use std::collections::HashMap;
28
+ use std::collections::{HashMap, HashSet};
25
29
  use std::io::{self, Read};
26
30
  use std::sync::Arc;
27
31
  use std::sync::atomic::{AtomicUsize, Ordering};
@@ -96,6 +100,7 @@ struct FilteredMafDetails {
96
100
  excluded_by_consequence_type: usize,
97
101
  total_processed: usize,
98
102
  total_included: usize,
103
+ skipped_chromosomes: HashMap<String, usize>,
99
104
  }
100
105
 
101
106
  // struct for CNV filter details
@@ -109,6 +114,7 @@ struct FilteredCnvDetails {
109
114
  excluded_by_segment_length: usize,
110
115
  total_processed: usize,
111
116
  total_included: usize,
117
+ skipped_chromosomes: HashMap<String, usize>,
112
118
  }
113
119
 
114
120
  // struct for per-case filter details
@@ -145,6 +151,7 @@ struct InputData {
145
151
  maf_options: Option<MafOptions>,
146
152
  #[serde(rename = "cnvOptions")]
147
153
  cnv_options: Option<CnvOptions>,
154
+ chromosomes: Vec<String>,
148
155
  }
149
156
 
150
157
  // Configuration for different data types
@@ -167,6 +174,7 @@ async fn parse_content(
167
174
  loss_threshold: f32,
168
175
  seg_length: i32,
169
176
  cnv_hyper_mutator: i32,
177
+ chromosomes: &HashSet<String>,
170
178
  filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
171
179
  filtered_maf_records: &AtomicUsize,
172
180
  filtered_cnv_records: &AtomicUsize,
@@ -219,9 +227,6 @@ async fn parse_content(
219
227
  let mut parsed_data = Vec::new();
220
228
  let mut columns_indices: Vec<usize> = Vec::new();
221
229
  let mut variant_classification_index: Option<usize> = None;
222
- //let mut header_mk: &str = "";
223
- //let mut columns = Vec::new();
224
-
225
230
  let mut header: Vec<String> = Vec::new();
226
231
 
227
232
  for line in lines {
@@ -243,26 +248,49 @@ async fn parse_content(
243
248
  continue;
244
249
  };
245
250
 
246
- let row = process_row(
247
- line,
248
- case_id,
249
- data_type,
250
- &header,
251
- &columns_indices,
252
- variant_classification_index,
253
- consequences,
254
- min_total_depth,
255
- min_alt_allele_count,
256
- gain_threshold,
257
- loss_threshold,
258
- seg_length,
259
- filtered_records,
260
- filtered_maf_records,
261
- filtered_cnv_records,
262
- included_maf_records,
263
- included_cnv_records,
264
- )
265
- .await?;
251
+ let row = match data_type {
252
+ "maf" => {
253
+ process_mafline(
254
+ line,
255
+ case_id,
256
+ data_type,
257
+ &columns_indices,
258
+ variant_classification_index,
259
+ consequences,
260
+ min_total_depth,
261
+ min_alt_allele_count,
262
+ chromosomes,
263
+ filtered_records,
264
+ filtered_maf_records,
265
+ included_maf_records,
266
+ )
267
+ .await
268
+ }
269
+ "cnv" => {
270
+ process_cnvline(
271
+ line,
272
+ case_id,
273
+ data_type,
274
+ &header,
275
+ &columns_indices,
276
+ gain_threshold,
277
+ loss_threshold,
278
+ seg_length,
279
+ chromosomes,
280
+ filtered_records,
281
+ filtered_cnv_records,
282
+ included_cnv_records,
283
+ )
284
+ .await
285
+ }
286
+ _ => {
287
+ return Err((
288
+ case_id.to_string(),
289
+ data_type.to_string(),
290
+ "Invalid data type".to_string(),
291
+ ));
292
+ }
293
+ }?;
266
294
 
267
295
  if let Some(out_lst) = row {
268
296
  parsed_data.push(out_lst);
@@ -316,25 +344,20 @@ fn setup_columns(
316
344
  Ok(())
317
345
  }
318
346
 
319
- // Process a single row of data
320
- async fn process_row(
347
+ // Process a single row of MAF file
348
+ async fn process_mafline(
321
349
  line: &str,
322
350
  case_id: &str,
323
351
  data_type: &str,
324
- header: &[String],
325
352
  columns_indices: &[usize],
326
353
  variant_classification_index: Option<usize>,
327
354
  consequences: &Option<Vec<String>>,
328
355
  min_total_depth: i32,
329
356
  min_alt_allele_count: i32,
330
- gain_threshold: f32,
331
- loss_threshold: f32,
332
- seg_length: i32,
357
+ chromosomes: &HashSet<String>,
333
358
  filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
334
359
  filtered_maf_records: &AtomicUsize,
335
- filtered_cnv_records: &AtomicUsize,
336
360
  included_maf_records: &AtomicUsize,
337
- included_cnv_records: &AtomicUsize,
338
361
  ) -> Result<Option<Vec<String>>, (String, String, String)> {
339
362
  let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
340
363
  let mut out_lst = vec![case_id.to_string()];
@@ -347,51 +370,38 @@ async fn process_row(
347
370
  maf: FilteredMafDetails::default(),
348
371
  cnv: FilteredCnvDetails::default(),
349
372
  });
350
-
351
373
  let case_details = filtered_map.get_mut(case_id).unwrap();
352
374
 
353
375
  // Track total processed records
354
- if data_type == "maf" {
355
- case_details.maf.total_processed += 1;
356
- } else if data_type == "cnv" {
357
- case_details.cnv.total_processed += 1;
358
- }
376
+ case_details.maf.total_processed += 1;
359
377
 
360
378
  // Handle consequence filtering and counting for MAF files
361
- if data_type == "maf" {
362
- if let Some(var_class_idx) = variant_classification_index {
363
- if var_class_idx < cont_lst.len() {
364
- let variant_classification = &cont_lst[var_class_idx];
365
- if let Some(consequence_filter) = consequences {
366
- if !consequence_filter.is_empty() {
367
- if consequence_filter.contains(variant_classification) {
368
- // Matched consequence
369
- *case_details
370
- .maf
371
- .matched_consequences
372
- .entry(variant_classification.to_string())
373
- .or_insert(0) += 1;
374
- } else {
375
- // Unmatched consequence
376
- *case_details
377
- .maf
378
- .rejected_consequences
379
- .entry(variant_classification.to_string())
380
- .or_insert(0) += 1;
381
- case_details.maf.excluded_by_consequence_type += 1;
382
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
383
- return Ok(None);
384
- }
385
- } else {
386
- // Empty filter, count as matched
379
+
380
+ if let Some(var_class_idx) = variant_classification_index {
381
+ if var_class_idx < cont_lst.len() {
382
+ let variant_classification = &cont_lst[var_class_idx];
383
+ if let Some(consequence_filter) = consequences {
384
+ if !consequence_filter.is_empty() {
385
+ if consequence_filter.contains(variant_classification) {
386
+ // Matched consequence
387
387
  *case_details
388
388
  .maf
389
389
  .matched_consequences
390
390
  .entry(variant_classification.to_string())
391
391
  .or_insert(0) += 1;
392
+ } else {
393
+ // Unmatched consequence
394
+ *case_details
395
+ .maf
396
+ .rejected_consequences
397
+ .entry(variant_classification.to_string())
398
+ .or_insert(0) += 1;
399
+ case_details.maf.excluded_by_consequence_type += 1;
400
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
401
+ return Ok(None);
392
402
  }
393
403
  } else {
394
- // No filter, count as matched
404
+ // Empty filter, count as matched
395
405
  *case_details
396
406
  .maf
397
407
  .matched_consequences
@@ -399,32 +409,142 @@ async fn process_row(
399
409
  .or_insert(0) += 1;
400
410
  }
401
411
  } else {
402
- case_details.maf.invalid_rows += 1;
403
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
404
- return Ok(None);
412
+ // No filter, count as matched
413
+ *case_details
414
+ .maf
415
+ .matched_consequences
416
+ .entry(variant_classification.to_string())
417
+ .or_insert(0) += 1;
405
418
  }
406
419
  } else {
407
420
  case_details.maf.invalid_rows += 1;
408
421
  filtered_maf_records.fetch_add(1, Ordering::Relaxed);
409
422
  return Ok(None);
410
423
  }
424
+ } else {
425
+ case_details.maf.invalid_rows += 1;
426
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
427
+ return Ok(None);
411
428
  }
412
429
 
413
430
  // Extract relevant columns
414
431
  for &x in columns_indices {
415
432
  if x >= cont_lst.len() {
416
- if data_type == "maf" {
417
- case_details.maf.invalid_rows += 1;
418
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
419
- } else if data_type == "cnv" {
420
- case_details.cnv.invalid_rows += 1;
421
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
422
- }
433
+ case_details.maf.invalid_rows += 1;
434
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
423
435
  return Ok(None); // Invalid row
424
436
  }
437
+ let element = cont_lst[x].to_string();
438
+ out_lst.push(element);
439
+ }
440
+
441
+ // Additional MAF-specific processing
442
+ if out_lst.len() < 6 {
443
+ case_details.maf.invalid_rows += 1;
444
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
445
+ return Ok(None); // Not enough columns
446
+ }
447
+
448
+ let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
449
+ case_details.maf.invalid_rows += 1;
450
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
451
+ (
452
+ case_id.to_string(),
453
+ data_type.to_string(),
454
+ "Failed to convert t_depth to integer.".to_string(),
455
+ )
456
+ })?;
457
+
458
+ let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
459
+ case_details.maf.invalid_rows += 1;
460
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
461
+ (
462
+ case_id.to_string(),
463
+ data_type.to_string(),
464
+ "Failed to convert t_alt_count to integer.".to_string(),
465
+ )
466
+ })?;
467
+
468
+ if alle_depth < min_total_depth {
469
+ case_details.maf.t_depth += 1;
470
+ case_details.maf.excluded_by_min_depth += 1;
471
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
472
+ return Ok(None);
473
+ }
474
+ if alt_count < min_alt_allele_count {
475
+ case_details.maf.t_alt_count += 1;
476
+ case_details.maf.excluded_by_min_alt_count += 1;
477
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
478
+ return Ok(None);
479
+ }
425
480
 
481
+ // Keep case_id, chr, start, end, and add "mutation"
482
+ out_lst = out_lst[0..4].to_vec();
483
+ out_lst.push("mutation".to_string());
484
+
485
+ // Prepend 'chr' to the chromosome name if it does not already start with 'chr'
486
+ if !out_lst[1].starts_with("chr") {
487
+ out_lst[1] = format!("chr{}", out_lst[1]);
488
+ }
489
+
490
+ // Chromosome filtering
491
+ if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
492
+ *case_details
493
+ .maf
494
+ .skipped_chromosomes
495
+ .entry(out_lst[1].clone())
496
+ .or_insert(0) += 1;
497
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
498
+ return Ok(None);
499
+ }
500
+
501
+ // Update counters for included MAF records
502
+ case_details.maf.total_included += 1;
503
+ included_maf_records.fetch_add(1, Ordering::Relaxed);
504
+
505
+ Ok(Some(out_lst))
506
+ }
507
+
508
+ // Process a single row of CNV file
509
+ async fn process_cnvline(
510
+ line: &str,
511
+ case_id: &str,
512
+ data_type: &str,
513
+ header: &[String],
514
+ columns_indices: &[usize],
515
+ gain_threshold: f32,
516
+ loss_threshold: f32,
517
+ seg_length: i32,
518
+ chromosomes: &HashSet<String>,
519
+ filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
520
+ filtered_cnv_records: &AtomicUsize,
521
+ included_cnv_records: &AtomicUsize,
522
+ ) -> Result<Option<Vec<String>>, (String, String, String)> {
523
+ let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
524
+ let mut out_lst = vec![case_id.to_string()];
525
+
526
+ // Initialize or update case details
527
+ let mut filtered_map = filtered_records.lock().await;
528
+ filtered_map
529
+ .entry(case_id.to_string())
530
+ .or_insert_with(|| FilteredCaseDetails {
531
+ maf: FilteredMafDetails::default(),
532
+ cnv: FilteredCnvDetails::default(),
533
+ });
534
+ let case_details = filtered_map.get_mut(case_id).unwrap();
535
+
536
+ // Track total processed records
537
+ case_details.cnv.total_processed += 1;
538
+
539
+ // Extract relevant columns
540
+ for &x in columns_indices {
541
+ if x >= cont_lst.len() {
542
+ case_details.cnv.invalid_rows += 1;
543
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
544
+ return Ok(None); // Invalid row
545
+ }
426
546
  let mut element = cont_lst[x].to_string();
427
- if data_type == "cnv" && header[x] == "Segment_Mean" {
547
+ if header[x] == "Segment_Mean" {
428
548
  element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
429
549
  if element.is_empty() {
430
550
  case_details.cnv.segment_mean += 1;
@@ -444,89 +564,55 @@ async fn process_row(
444
564
  out_lst.push(element);
445
565
  }
446
566
 
447
- // Additional MAF-specific processing
448
- if data_type == "maf" {
449
- if out_lst.len() < 6 {
450
- case_details.maf.invalid_rows += 1;
451
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
452
- return Ok(None); // Not enough columns
453
- }
454
-
455
- let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
456
- case_details.maf.invalid_rows += 1;
457
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
458
- (
459
- case_id.to_string(),
460
- data_type.to_string(),
461
- "Failed to convert t_depth to integer.".to_string(),
462
- )
463
- })?;
464
-
465
- let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
466
- case_details.maf.invalid_rows += 1;
467
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
468
- (
469
- case_id.to_string(),
470
- data_type.to_string(),
471
- "Failed to convert t_alt_count to integer.".to_string(),
472
- )
473
- })?;
474
-
475
- if alle_depth < min_total_depth {
476
- case_details.maf.t_depth += 1;
477
- case_details.maf.excluded_by_min_depth += 1;
478
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
479
- return Ok(None);
480
- }
481
- if alt_count < min_alt_allele_count {
482
- case_details.maf.t_alt_count += 1;
483
- case_details.maf.excluded_by_min_alt_count += 1;
484
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
485
- return Ok(None);
486
- }
487
-
488
- // Keep case_id, chr, start, end, and add "mutation"
489
- out_lst = out_lst[0..4].to_vec();
490
- out_lst.push("mutation".to_string());
567
+ // filter cnvs based on segment length. Default: 0 (no filtering)
568
+ // calculate segment length (End_Position - Start_Position)
569
+ let end_position = out_lst[3].parse::<i32>().map_err(|_| {
570
+ case_details.cnv.invalid_rows += 1;
571
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
572
+ (
573
+ case_id.to_string(),
574
+ data_type.to_string(),
575
+ "Failed to convert End Position of cnv to integer.".to_string(),
576
+ )
577
+ })?;
491
578
 
492
- // Update counters for included MAF records
493
- case_details.maf.total_included += 1;
494
- included_maf_records.fetch_add(1, Ordering::Relaxed);
579
+ let start_position = out_lst[2].parse::<i32>().map_err(|_| {
580
+ case_details.cnv.invalid_rows += 1;
581
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
582
+ (
583
+ case_id.to_string(),
584
+ data_type.to_string(),
585
+ "Failed to convert Start Position of cnv to integer.".to_string(),
586
+ )
587
+ })?;
588
+ let cnv_length = end_position - start_position;
589
+ if seg_length > 0 && cnv_length > seg_length {
590
+ case_details.cnv.seg_length += 1;
591
+ case_details.cnv.excluded_by_segment_length += 1;
592
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
593
+ return Ok(None);
495
594
  }
496
595
 
497
- // filter cnvs based on segment length. Default: 0 (no filtering)
498
- if data_type == "cnv" {
499
- // calculate segment length (End_Position - Start_Position)
500
- let end_position = out_lst[3].parse::<i32>().map_err(|_| {
501
- case_details.cnv.invalid_rows += 1;
502
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
503
- (
504
- case_id.to_string(),
505
- data_type.to_string(),
506
- "Failed to convert End Position of cnv to integer.".to_string(),
507
- )
508
- })?;
596
+ // Prepend 'chr' to the chromosome name if it does not already start with 'chr'
597
+ if !out_lst[1].starts_with("chr") {
598
+ out_lst[1] = format!("chr{}", out_lst[1]);
599
+ }
509
600
 
510
- let start_position = out_lst[2].parse::<i32>().map_err(|_| {
511
- case_details.cnv.invalid_rows += 1;
512
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
513
- (
514
- case_id.to_string(),
515
- data_type.to_string(),
516
- "Failed to convert Start Position of cnv to integer.".to_string(),
517
- )
518
- })?;
519
- let cnv_length = end_position - start_position;
520
- if seg_length > 0 && cnv_length > seg_length {
521
- case_details.cnv.seg_length += 1;
522
- case_details.cnv.excluded_by_segment_length += 1;
523
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
524
- return Ok(None);
525
- }
526
- case_details.cnv.total_included += 1;
527
- included_cnv_records.fetch_add(1, Ordering::Relaxed);
601
+ // Chromosome filtering
602
+ if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
603
+ *case_details
604
+ .cnv
605
+ .skipped_chromosomes
606
+ .entry(out_lst[1].clone())
607
+ .or_insert(0) += 1;
608
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
609
+ return Ok(None);
528
610
  }
529
611
 
612
+ // Update counters for included CNV records
613
+ case_details.cnv.total_included += 1;
614
+ included_cnv_records.fetch_add(1, Ordering::Relaxed);
615
+
530
616
  Ok(Some(out_lst))
531
617
  }
532
618
 
@@ -672,6 +758,7 @@ async fn download_data_streaming(
672
758
  loss_threshold: f32,
673
759
  seg_length: i32,
674
760
  cnv_hyper_mutator: i32,
761
+ chromosomes: &HashSet<String>,
675
762
  ) {
676
763
  let data_urls: Vec<(String, String, String)> = data4dl
677
764
  .into_iter()
@@ -738,6 +825,7 @@ async fn download_data_streaming(
738
825
  loss_threshold,
739
826
  seg_length,
740
827
  cnv_hyper_mutator,
828
+ &chromosomes,
741
829
  &filtered_records,
742
830
  &filtered_maf_records,
743
831
  &filtered_cnv_records,
@@ -902,6 +990,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
902
990
  None => (0.3, -0.4, 0, 500), // Default values
903
991
  };
904
992
 
993
+ // Convert Vec<String> to HashSet<String> for faster lookup
994
+ let chromosomes = input_js.chromosomes.into_iter().collect::<HashSet<String>>();
995
+
905
996
  // Download data - this will now handle errors gracefully
906
997
  download_data_streaming(
907
998
  case_files,
@@ -914,6 +1005,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
914
1005
  loss_threshold,
915
1006
  seg_length,
916
1007
  cnv_hyper_mutator,
1008
+ &chromosomes,
917
1009
  )
918
1010
  .await;
919
1011