@sjcrh/proteinpaint-rust 2.132.1-0 → 2.135.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/package.json +1 -1
  2. package/src/gdcGRIN2.rs +283 -169
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.132.1-0",
2
+ "version": "2.135.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "type": "module",
5
5
  "description": "Rust-based utilities for proteinpaint",
package/src/gdcGRIN2.rs CHANGED
@@ -6,14 +6,18 @@
6
6
  2. Better timeout handling with retries
7
7
  3. More detailed error reporting
8
8
  4. Continues processing even when some files fail
9
+ 5. Added chromosome filtering
9
10
 
10
11
  Input JSON:
11
12
  caseFiles
12
13
  mafOptions: For SNVindel filtering
14
+ cnvOptions: For CNV filtering
15
+ chromosomes: List of chromosomes to include, e.g. ["chr1","chr2"]; an empty list disables chromosome filtering
16
+
13
17
  Output mutations as JSON array.
14
18
 
15
19
  Example of usage:
16
- echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":1000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000}}' | ./target/release/gdcGRIN2
20
+ echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":8000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000, "hyperMutator":8000}, "chromosomes":["chr1","chr2","chr3"]}' | ./target/release/gdcGRIN2
17
21
  */
18
22
 
19
23
  use flate2::read::GzDecoder;
@@ -21,7 +25,7 @@ use futures::StreamExt;
21
25
  use memchr::memchr;
22
26
  use serde::{Deserialize, Serialize};
23
27
  use serde_json;
24
- use std::collections::HashMap;
28
+ use std::collections::{HashMap, HashSet};
25
29
  use std::io::{self, Read};
26
30
  use std::sync::Arc;
27
31
  use std::sync::atomic::{AtomicUsize, Ordering};
@@ -69,6 +73,8 @@ struct CnvOptions {
69
73
  gain_threshold: f32,
70
74
  #[serde(rename = "segLength")]
71
75
  seg_length: i32,
76
+ #[serde(rename = "hyperMutator")]
77
+ hyper_mutator: i32,
72
78
  }
73
79
 
74
80
  // Individual successful file output (JSONL format)
@@ -94,6 +100,7 @@ struct FilteredMafDetails {
94
100
  excluded_by_consequence_type: usize,
95
101
  total_processed: usize,
96
102
  total_included: usize,
103
+ skipped_chromosomes: HashMap<String, usize>,
97
104
  }
98
105
 
99
106
  // struct for CNV filter details
@@ -107,6 +114,7 @@ struct FilteredCnvDetails {
107
114
  excluded_by_segment_length: usize,
108
115
  total_processed: usize,
109
116
  total_included: usize,
117
+ skipped_chromosomes: HashMap<String, usize>,
110
118
  }
111
119
 
112
120
  // struct for per-case filter details
@@ -131,7 +139,7 @@ struct FinalSummary {
131
139
  included_maf_records: usize,
132
140
  included_cnv_records: usize,
133
141
  filtered_records_by_case: HashMap<String, FilteredCaseDetails>,
134
- hyper_mutator_records: Vec<String>,
142
+ hyper_mutator_records: HashMap<String, Vec<String>>,
135
143
  }
136
144
 
137
145
  // Define the top-level input structure
@@ -143,6 +151,7 @@ struct InputData {
143
151
  maf_options: Option<MafOptions>,
144
152
  #[serde(rename = "cnvOptions")]
145
153
  cnv_options: Option<CnvOptions>,
154
+ chromosomes: Vec<String>,
146
155
  }
147
156
 
148
157
  // Configuration for different data types
@@ -159,17 +168,19 @@ async fn parse_content(
159
168
  data_type: &str,
160
169
  min_total_depth: i32,
161
170
  min_alt_allele_count: i32,
162
- hyper_mutator: i32,
171
+ maf_hyper_mutator: i32,
163
172
  consequences: &Option<Vec<String>>,
164
173
  gain_threshold: f32,
165
174
  loss_threshold: f32,
166
175
  seg_length: i32,
176
+ cnv_hyper_mutator: i32,
177
+ chromosomes: &HashSet<String>,
167
178
  filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
168
179
  filtered_maf_records: &AtomicUsize,
169
180
  filtered_cnv_records: &AtomicUsize,
170
181
  included_maf_records: &AtomicUsize,
171
182
  included_cnv_records: &AtomicUsize,
172
- hyper_mutator_records: &Arc<Mutex<Vec<String>>>,
183
+ hyper_mutator_records: &Arc<Mutex<HashMap<String, Vec<String>>>>,
173
184
  ) -> Result<Vec<Vec<String>>, (String, String, String)> {
174
185
  let config = match data_type {
175
186
  "cnv" => DataTypeConfig {
@@ -189,13 +200,24 @@ async fn parse_content(
189
200
  }
190
201
  };
191
202
 
192
- // check hyperMutator for MAF files
193
- if data_type == "maf" && hyper_mutator > 0 {
203
+ // check hyperMutator for MAF and CNV files
204
+ let hyper_mutator = if data_type == "maf" {
205
+ maf_hyper_mutator
206
+ } else {
207
+ cnv_hyper_mutator
208
+ };
209
+ if hyper_mutator > 0 {
194
210
  let line_count = content.lines().count();
195
211
  if line_count as i32 > hyper_mutator {
196
212
  let mut hyper_records = hyper_mutator_records.lock().await;
197
- if !hyper_records.contains(&case_id.to_string()) {
198
- hyper_records.push(case_id.to_string());
213
+ hyper_records
214
+ .entry(data_type.to_string())
215
+ .or_insert_with(Vec::new)
216
+ .push(case_id.to_string());
217
+ if data_type == "maf" {
218
+ filtered_maf_records.fetch_add(line_count, Ordering::Relaxed);
219
+ } else if data_type == "cnv" {
220
+ filtered_cnv_records.fetch_add(line_count, Ordering::Relaxed);
199
221
  }
200
222
  return Ok(Vec::new());
201
223
  }
@@ -205,9 +227,6 @@ async fn parse_content(
205
227
  let mut parsed_data = Vec::new();
206
228
  let mut columns_indices: Vec<usize> = Vec::new();
207
229
  let mut variant_classification_index: Option<usize> = None;
208
- //let mut header_mk: &str = "";
209
- //let mut columns = Vec::new();
210
-
211
230
  let mut header: Vec<String> = Vec::new();
212
231
 
213
232
  for line in lines {
@@ -229,26 +248,49 @@ async fn parse_content(
229
248
  continue;
230
249
  };
231
250
 
232
- let row = process_row(
233
- line,
234
- case_id,
235
- data_type,
236
- &header,
237
- &columns_indices,
238
- variant_classification_index,
239
- consequences,
240
- min_total_depth,
241
- min_alt_allele_count,
242
- gain_threshold,
243
- loss_threshold,
244
- seg_length,
245
- filtered_records,
246
- filtered_maf_records,
247
- filtered_cnv_records,
248
- included_maf_records,
249
- included_cnv_records,
250
- )
251
- .await?;
251
+ let row = match data_type {
252
+ "maf" => {
253
+ process_mafline(
254
+ line,
255
+ case_id,
256
+ data_type,
257
+ &columns_indices,
258
+ variant_classification_index,
259
+ consequences,
260
+ min_total_depth,
261
+ min_alt_allele_count,
262
+ chromosomes,
263
+ filtered_records,
264
+ filtered_maf_records,
265
+ included_maf_records,
266
+ )
267
+ .await
268
+ }
269
+ "cnv" => {
270
+ process_cnvline(
271
+ line,
272
+ case_id,
273
+ data_type,
274
+ &header,
275
+ &columns_indices,
276
+ gain_threshold,
277
+ loss_threshold,
278
+ seg_length,
279
+ chromosomes,
280
+ filtered_records,
281
+ filtered_cnv_records,
282
+ included_cnv_records,
283
+ )
284
+ .await
285
+ }
286
+ _ => {
287
+ return Err((
288
+ case_id.to_string(),
289
+ data_type.to_string(),
290
+ "Invalid data type".to_string(),
291
+ ));
292
+ }
293
+ }?;
252
294
 
253
295
  if let Some(out_lst) = row {
254
296
  parsed_data.push(out_lst);
@@ -302,25 +344,20 @@ fn setup_columns(
302
344
  Ok(())
303
345
  }
304
346
 
305
- // Process a single row of data
306
- async fn process_row(
347
+ // Process a single row of MAF file
348
+ async fn process_mafline(
307
349
  line: &str,
308
350
  case_id: &str,
309
351
  data_type: &str,
310
- header: &[String],
311
352
  columns_indices: &[usize],
312
353
  variant_classification_index: Option<usize>,
313
354
  consequences: &Option<Vec<String>>,
314
355
  min_total_depth: i32,
315
356
  min_alt_allele_count: i32,
316
- gain_threshold: f32,
317
- loss_threshold: f32,
318
- seg_length: i32,
357
+ chromosomes: &HashSet<String>,
319
358
  filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
320
359
  filtered_maf_records: &AtomicUsize,
321
- filtered_cnv_records: &AtomicUsize,
322
360
  included_maf_records: &AtomicUsize,
323
- included_cnv_records: &AtomicUsize,
324
361
  ) -> Result<Option<Vec<String>>, (String, String, String)> {
325
362
  let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
326
363
  let mut out_lst = vec![case_id.to_string()];
@@ -333,51 +370,38 @@ async fn process_row(
333
370
  maf: FilteredMafDetails::default(),
334
371
  cnv: FilteredCnvDetails::default(),
335
372
  });
336
-
337
373
  let case_details = filtered_map.get_mut(case_id).unwrap();
338
374
 
339
375
  // Track total processed records
340
- if data_type == "maf" {
341
- case_details.maf.total_processed += 1;
342
- } else if data_type == "cnv" {
343
- case_details.cnv.total_processed += 1;
344
- }
376
+ case_details.maf.total_processed += 1;
345
377
 
346
378
  // Handle consequence filtering and counting for MAF files
347
- if data_type == "maf" {
348
- if let Some(var_class_idx) = variant_classification_index {
349
- if var_class_idx < cont_lst.len() {
350
- let variant_classification = &cont_lst[var_class_idx];
351
- if let Some(consequence_filter) = consequences {
352
- if !consequence_filter.is_empty() {
353
- if consequence_filter.contains(variant_classification) {
354
- // Matched consequence
355
- *case_details
356
- .maf
357
- .matched_consequences
358
- .entry(variant_classification.to_string())
359
- .or_insert(0) += 1;
360
- } else {
361
- // Unmatched consequence
362
- *case_details
363
- .maf
364
- .rejected_consequences
365
- .entry(variant_classification.to_string())
366
- .or_insert(0) += 1;
367
- case_details.maf.excluded_by_consequence_type += 1;
368
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
369
- return Ok(None);
370
- }
371
- } else {
372
- // Empty filter, count as matched
379
+
380
+ if let Some(var_class_idx) = variant_classification_index {
381
+ if var_class_idx < cont_lst.len() {
382
+ let variant_classification = &cont_lst[var_class_idx];
383
+ if let Some(consequence_filter) = consequences {
384
+ if !consequence_filter.is_empty() {
385
+ if consequence_filter.contains(variant_classification) {
386
+ // Matched consequence
373
387
  *case_details
374
388
  .maf
375
389
  .matched_consequences
376
390
  .entry(variant_classification.to_string())
377
391
  .or_insert(0) += 1;
392
+ } else {
393
+ // Unmatched consequence
394
+ *case_details
395
+ .maf
396
+ .rejected_consequences
397
+ .entry(variant_classification.to_string())
398
+ .or_insert(0) += 1;
399
+ case_details.maf.excluded_by_consequence_type += 1;
400
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
401
+ return Ok(None);
378
402
  }
379
403
  } else {
380
- // No filter, count as matched
404
+ // Empty filter, count as matched
381
405
  *case_details
382
406
  .maf
383
407
  .matched_consequences
@@ -385,32 +409,142 @@ async fn process_row(
385
409
  .or_insert(0) += 1;
386
410
  }
387
411
  } else {
388
- case_details.maf.invalid_rows += 1;
389
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
390
- return Ok(None);
412
+ // No filter, count as matched
413
+ *case_details
414
+ .maf
415
+ .matched_consequences
416
+ .entry(variant_classification.to_string())
417
+ .or_insert(0) += 1;
391
418
  }
392
419
  } else {
393
420
  case_details.maf.invalid_rows += 1;
394
421
  filtered_maf_records.fetch_add(1, Ordering::Relaxed);
395
422
  return Ok(None);
396
423
  }
424
+ } else {
425
+ case_details.maf.invalid_rows += 1;
426
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
427
+ return Ok(None);
397
428
  }
398
429
 
399
430
  // Extract relevant columns
400
431
  for &x in columns_indices {
401
432
  if x >= cont_lst.len() {
402
- if data_type == "maf" {
403
- case_details.maf.invalid_rows += 1;
404
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
405
- } else if data_type == "cnv" {
406
- case_details.cnv.invalid_rows += 1;
407
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
408
- }
433
+ case_details.maf.invalid_rows += 1;
434
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
409
435
  return Ok(None); // Invalid row
410
436
  }
437
+ let element = cont_lst[x].to_string();
438
+ out_lst.push(element);
439
+ }
440
+
441
+ // Additional MAF-specific processing
442
+ if out_lst.len() < 6 {
443
+ case_details.maf.invalid_rows += 1;
444
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
445
+ return Ok(None); // Not enough columns
446
+ }
447
+
448
+ let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
449
+ case_details.maf.invalid_rows += 1;
450
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
451
+ (
452
+ case_id.to_string(),
453
+ data_type.to_string(),
454
+ "Failed to convert t_depth to integer.".to_string(),
455
+ )
456
+ })?;
457
+
458
+ let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
459
+ case_details.maf.invalid_rows += 1;
460
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
461
+ (
462
+ case_id.to_string(),
463
+ data_type.to_string(),
464
+ "Failed to convert t_alt_count to integer.".to_string(),
465
+ )
466
+ })?;
467
+
468
+ if alle_depth < min_total_depth {
469
+ case_details.maf.t_depth += 1;
470
+ case_details.maf.excluded_by_min_depth += 1;
471
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
472
+ return Ok(None);
473
+ }
474
+ if alt_count < min_alt_allele_count {
475
+ case_details.maf.t_alt_count += 1;
476
+ case_details.maf.excluded_by_min_alt_count += 1;
477
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
478
+ return Ok(None);
479
+ }
480
+
481
+ // Keep case_id, chr, start, end, and add "mutation"
482
+ out_lst = out_lst[0..4].to_vec();
483
+ out_lst.push("mutation".to_string());
484
+
485
+ // Prefix the chromosome name with 'chr' if it does not already start with 'chr'
486
+ if !out_lst[1].starts_with("chr") {
487
+ out_lst[1] = format!("chr{}", out_lst[1]);
488
+ }
489
+
490
+ // Chromosome filtering
491
+ if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
492
+ *case_details
493
+ .maf
494
+ .skipped_chromosomes
495
+ .entry(out_lst[1].clone())
496
+ .or_insert(0) += 1;
497
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
498
+ return Ok(None);
499
+ }
500
+
501
+ // Update counters for included MAF records
502
+ case_details.maf.total_included += 1;
503
+ included_maf_records.fetch_add(1, Ordering::Relaxed);
504
+
505
+ Ok(Some(out_lst))
506
+ }
507
+
508
+ // Process a single row of CNV file
509
+ async fn process_cnvline(
510
+ line: &str,
511
+ case_id: &str,
512
+ data_type: &str,
513
+ header: &[String],
514
+ columns_indices: &[usize],
515
+ gain_threshold: f32,
516
+ loss_threshold: f32,
517
+ seg_length: i32,
518
+ chromosomes: &HashSet<String>,
519
+ filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
520
+ filtered_cnv_records: &AtomicUsize,
521
+ included_cnv_records: &AtomicUsize,
522
+ ) -> Result<Option<Vec<String>>, (String, String, String)> {
523
+ let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
524
+ let mut out_lst = vec![case_id.to_string()];
525
+
526
+ // Initialize or update case details
527
+ let mut filtered_map = filtered_records.lock().await;
528
+ filtered_map
529
+ .entry(case_id.to_string())
530
+ .or_insert_with(|| FilteredCaseDetails {
531
+ maf: FilteredMafDetails::default(),
532
+ cnv: FilteredCnvDetails::default(),
533
+ });
534
+ let case_details = filtered_map.get_mut(case_id).unwrap();
535
+
536
+ // Track total processed records
537
+ case_details.cnv.total_processed += 1;
411
538
 
539
+ // Extract relevant columns
540
+ for &x in columns_indices {
541
+ if x >= cont_lst.len() {
542
+ case_details.cnv.invalid_rows += 1;
543
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
544
+ return Ok(None); // Invalid row
545
+ }
412
546
  let mut element = cont_lst[x].to_string();
413
- if data_type == "cnv" && header[x] == "Segment_Mean" {
547
+ if header[x] == "Segment_Mean" {
414
548
  element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
415
549
  if element.is_empty() {
416
550
  case_details.cnv.segment_mean += 1;
@@ -430,89 +564,55 @@ async fn process_row(
430
564
  out_lst.push(element);
431
565
  }
432
566
 
433
- // Additional MAF-specific processing
434
- if data_type == "maf" {
435
- if out_lst.len() < 6 {
436
- case_details.maf.invalid_rows += 1;
437
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
438
- return Ok(None); // Not enough columns
439
- }
440
-
441
- let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
442
- case_details.maf.invalid_rows += 1;
443
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
444
- (
445
- case_id.to_string(),
446
- data_type.to_string(),
447
- "Failed to convert t_depth to integer.".to_string(),
448
- )
449
- })?;
450
-
451
- let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
452
- case_details.maf.invalid_rows += 1;
453
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
454
- (
455
- case_id.to_string(),
456
- data_type.to_string(),
457
- "Failed to convert t_alt_count to integer.".to_string(),
458
- )
459
- })?;
460
-
461
- if alle_depth < min_total_depth {
462
- case_details.maf.t_depth += 1;
463
- case_details.maf.excluded_by_min_depth += 1;
464
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
465
- return Ok(None);
466
- }
467
- if alt_count < min_alt_allele_count {
468
- case_details.maf.t_alt_count += 1;
469
- case_details.maf.excluded_by_min_alt_count += 1;
470
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
471
- return Ok(None);
472
- }
473
-
474
- // Keep case_id, chr, start, end, and add "mutation"
475
- out_lst = out_lst[0..4].to_vec();
476
- out_lst.push("mutation".to_string());
567
+ // filter cnvs based on segment length. Default: 0 (no filtering)
568
+ // calculate segment length (End_Position - Start_Position)
569
+ let end_position = out_lst[3].parse::<i32>().map_err(|_| {
570
+ case_details.cnv.invalid_rows += 1;
571
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
572
+ (
573
+ case_id.to_string(),
574
+ data_type.to_string(),
575
+ "Failed to convert End Position of cnv to integer.".to_string(),
576
+ )
577
+ })?;
477
578
 
478
- // Update counters for included MAF records
479
- case_details.maf.total_included += 1;
480
- included_maf_records.fetch_add(1, Ordering::Relaxed);
579
+ let start_position = out_lst[2].parse::<i32>().map_err(|_| {
580
+ case_details.cnv.invalid_rows += 1;
581
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
582
+ (
583
+ case_id.to_string(),
584
+ data_type.to_string(),
585
+ "Failed to convert Start Position of cnv to integer.".to_string(),
586
+ )
587
+ })?;
588
+ let cnv_length = end_position - start_position;
589
+ if seg_length > 0 && cnv_length > seg_length {
590
+ case_details.cnv.seg_length += 1;
591
+ case_details.cnv.excluded_by_segment_length += 1;
592
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
593
+ return Ok(None);
481
594
  }
482
595
 
483
- // filter cnvs based on segment length. Default: 0 (no filtering)
484
- if data_type == "cnv" {
485
- // calculate segment length (End_Position - Start_Position)
486
- let end_position = out_lst[3].parse::<i32>().map_err(|_| {
487
- case_details.cnv.invalid_rows += 1;
488
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
489
- (
490
- case_id.to_string(),
491
- data_type.to_string(),
492
- "Failed to convert End Position of cnv to integer.".to_string(),
493
- )
494
- })?;
596
+ // Prefix the chromosome name with 'chr' if it does not already start with 'chr'
597
+ if !out_lst[1].starts_with("chr") {
598
+ out_lst[1] = format!("chr{}", out_lst[1]);
599
+ }
495
600
 
496
- let start_position = out_lst[2].parse::<i32>().map_err(|_| {
497
- case_details.cnv.invalid_rows += 1;
498
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
499
- (
500
- case_id.to_string(),
501
- data_type.to_string(),
502
- "Failed to convert Start Position of cnv to integer.".to_string(),
503
- )
504
- })?;
505
- let cnv_length = end_position - start_position;
506
- if seg_length > 0 && cnv_length > seg_length {
507
- case_details.cnv.seg_length += 1;
508
- case_details.cnv.excluded_by_segment_length += 1;
509
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
510
- return Ok(None);
511
- }
512
- case_details.cnv.total_included += 1;
513
- included_cnv_records.fetch_add(1, Ordering::Relaxed);
601
+ // Chromosome filtering
602
+ if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
603
+ *case_details
604
+ .cnv
605
+ .skipped_chromosomes
606
+ .entry(out_lst[1].clone())
607
+ .or_insert(0) += 1;
608
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
609
+ return Ok(None);
514
610
  }
515
611
 
612
+ // Update counters for included CNV records
613
+ case_details.cnv.total_included += 1;
614
+ included_cnv_records.fetch_add(1, Ordering::Relaxed);
615
+
516
616
  Ok(Some(out_lst))
517
617
  }
518
618
 
@@ -652,11 +752,13 @@ async fn download_data_streaming(
652
752
  host: &str,
653
753
  min_total_depth: i32,
654
754
  min_alt_allele_count: i32,
655
- hyper_mutator: i32,
755
+ maf_hyper_mutator: i32,
656
756
  consequences: &Option<Vec<String>>,
657
757
  gain_threshold: f32,
658
758
  loss_threshold: f32,
659
759
  seg_length: i32,
760
+ cnv_hyper_mutator: i32,
761
+ chromosomes: &HashSet<String>,
660
762
  ) {
661
763
  let data_urls: Vec<(String, String, String)> = data4dl
662
764
  .into_iter()
@@ -680,7 +782,7 @@ async fn download_data_streaming(
680
782
  let filtered_maf_records = Arc::new(AtomicUsize::new(0));
681
783
  let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
682
784
  let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
683
- let hyper_mutator_records = Arc::new(Mutex::new(Vec::<String>::new()));
785
+ let hyper_mutator_records = Arc::new(Mutex::new(HashMap::<String, Vec<String>>::new()));
684
786
  let included_maf_records = Arc::new(AtomicUsize::new(0));
685
787
  let included_cnv_records = Arc::new(AtomicUsize::new(0));
686
788
 
@@ -717,11 +819,13 @@ async fn download_data_streaming(
717
819
  &data_type,
718
820
  min_total_depth,
719
821
  min_alt_allele_count,
720
- hyper_mutator,
822
+ maf_hyper_mutator,
721
823
  &consequences,
722
824
  gain_threshold,
723
825
  loss_threshold,
724
826
  seg_length,
827
+ cnv_hyper_mutator,
828
+ &chromosomes,
725
829
  &filtered_records,
726
830
  &filtered_maf_records,
727
831
  &filtered_cnv_records,
@@ -865,7 +969,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
865
969
  let case_files = input_js.case_files;
866
970
 
867
971
  // Set default maf_options
868
- let (min_total_depth, min_alt_allele_count, hyper_mutator, consequences) = match input_js.maf_options {
972
+ let (min_total_depth, min_alt_allele_count, maf_hyper_mutator, consequences) = match input_js.maf_options {
869
973
  Some(options) => (
870
974
  options.min_total_depth,
871
975
  options.min_alt_allele_count,
@@ -876,22 +980,32 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
876
980
  };
877
981
 
878
982
  // Set default cnv_options
879
- let (gain_threshold, loss_threshold, seg_length) = match input_js.cnv_options {
880
- Some(options) => (options.gain_threshold, options.loss_threshold, options.seg_length),
881
- None => (0.3, -0.4, 0), // Default values
983
+ let (gain_threshold, loss_threshold, seg_length, cnv_hyper_mutator) = match input_js.cnv_options {
984
+ Some(options) => (
985
+ options.gain_threshold,
986
+ options.loss_threshold,
987
+ options.seg_length,
988
+ options.hyper_mutator,
989
+ ),
990
+ None => (0.3, -0.4, 0, 500), // Default values
882
991
  };
883
992
 
993
+ // Convert Vec<String> to HashSet<String> for faster lookup
994
+ let chromosomes = input_js.chromosomes.into_iter().collect::<HashSet<String>>();
995
+
884
996
  // Download data - this will now handle errors gracefully
885
997
  download_data_streaming(
886
998
  case_files,
887
999
  HOST,
888
1000
  min_total_depth,
889
1001
  min_alt_allele_count,
890
- hyper_mutator,
1002
+ maf_hyper_mutator,
891
1003
  &consequences,
892
1004
  gain_threshold,
893
1005
  loss_threshold,
894
1006
  seg_length,
1007
+ cnv_hyper_mutator,
1008
+ &chromosomes,
895
1009
  )
896
1010
  .await;
897
1011