@sjcrh/proteinpaint-rust 2.133.0 → 2.135.2-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/gdcGRIN2.rs +543 -204
package/package.json CHANGED
@@ -1,5 +1,5 @@
  {
- "version": "2.133.0",
+ "version": "2.135.2-0",
  "name": "@sjcrh/proteinpaint-rust",
  "type": "module",
  "description": "Rust-based utilities for proteinpaint",
package/src/gdcGRIN2.rs CHANGED
@@ -1,19 +1,34 @@
  /*
- This script downloads cohort maf files from GDC and gracefully handles timeout and other possible errors related to GDC api processing for use by the client file summary div
+ This script can either download cohort maf/cnv files from GDC or read them from local files; the default behavior is to download from GDC. It gracefully handles timeouts and other errors related to GDC API processing or file reading, for use by the client file summary div.

  Key improvements:
  1. Graceful error handling - individual file failures don't stop the entire process
  2. Better timeout handling with retries
  3. More detailed error reporting
  4. Continues processing even when some files fail
+ 5. Added chromosome filtering
+ 6. Supports reading from local files with the --from-file flag
+
+ Command-line arguments:
+ - --from-file: Read data from local files instead of downloading from GDC

  Input JSON:
  caseFiles
  mafOptions: For SNVindel filtering
+ cnvOptions: For CNV filtering
+ chromosomes: list of chromosomes to include: []
+
  Output mutations as JSON array.
+ {
+ grin2lesion:str,
+ summary:{}
+ }

  Example of usage:
- echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":8000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000, "hyperMutator":8000}}' | ./target/release/gdcGRIN2
+ echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 10,"minAltAlleleCount": 2,"hyperMutator":8000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-0.4, "gainThreshold": 0.3, "segLength":2000000, "hyperMutator":500}, "chromosomes":["chr1","chr2","chr3"], "max_record": 100000}' | ./target/release/gdcGRIN2
+ Example of usage (read from local files):
+ echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 10,"minAltAlleleCount": 2,"hyperMutator":8000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-0.4, "gainThreshold": 0.3, "segLength":2000000, "hyperMutator":500}, "chromosomes":["chr1","chr2","chr3"], "max_record": 100000}' | ./target/release/gdcGRIN2 --from-file
+
  */

  use flate2::read::GzDecoder;
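
Editorial note on --from-file mode: localread_data (added later in this diff) passes each caseFiles value straight to std::fs::read_to_string, so in this mode the values are interpreted as local file paths rather than GDC UUIDs. Also, as InputData is declared in this diff, chromosomes and max_record carry no serde defaults, so both fields appear to be required in the input JSON. A minimal local-file invocation might look like this (the path is hypothetical):

    echo '{"caseFiles": {"CASE-1": {"maf": "/data/case1.maf"}}, "chromosomes": [], "max_record": 100000}' | ./target/release/gdcGRIN2 --from-file
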
@@ -21,11 +36,12 @@ use futures::StreamExt;
  use memchr::memchr;
  use serde::{Deserialize, Serialize};
  use serde_json;
- use std::collections::HashMap;
+ use std::collections::{HashMap, HashSet};
+ use std::env;
+ use std::fs;
  use std::io::{self, Read};
  use std::sync::Arc;
  use std::sync::atomic::{AtomicUsize, Ordering};
- use std::thread::sleep;
  use std::time::Duration;
  use tokio::io::{AsyncReadExt, BufReader};
  use tokio::sync::Mutex;
@@ -73,16 +89,6 @@ struct CnvOptions {
  hyper_mutator: i32,
  }

- // Individual successful file output (JSONL format)
- #[derive(serde::Serialize)]
- struct SuccessfulFileOutput {
- #[serde(rename = "type")]
- output_type: String, // Always "data"
- case_id: String,
- data_type: String,
- data: Vec<Vec<String>>,
- }
-
  // struct for MAF filter details
  #[derive(Clone, Serialize, Default)]
  struct FilteredMafDetails {
@@ -96,6 +102,7 @@ struct FilteredMafDetails {
  excluded_by_consequence_type: usize,
  total_processed: usize,
  total_included: usize,
+ skipped_chromosomes: HashMap<String, usize>,
  }

  // struct for CNV filter details
@@ -109,6 +116,7 @@ struct FilteredCnvDetails {
  excluded_by_segment_length: usize,
  total_processed: usize,
  total_included: usize,
+ skipped_chromosomes: HashMap<String, usize>,
  }

  // struct for per-case filter details
@@ -121,8 +129,6 @@ struct FilteredCaseDetails {
  // Final summary output (JSONL format)
  #[derive(serde::Serialize)]
  struct FinalSummary {
- #[serde(rename = "type")]
- output_type: String, // Always "summary"
  total_files: usize,
  successful_files: usize,
  failed_files: usize,
@@ -134,6 +140,14 @@ struct FinalSummary {
  included_cnv_records: usize,
  filtered_records_by_case: HashMap<String, FilteredCaseDetails>,
  hyper_mutator_records: HashMap<String, Vec<String>>,
+ excluded_by_max_record: HashMap<String, Vec<String>>,
+ }
+
+ // Struct to hold both the serialized lesion records and the FinalSummary
+ #[derive(Serialize)]
+ struct Output {
+ grin2lesion: String,
+ summary: FinalSummary,
  }

  // Define the top-level input structure
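
Because grin2lesion is declared as a String, the lesion rows end up JSON-encoded twice: once into the grin2lesion string, then again when the whole Output is serialized. A minimal consumer sketch that undoes both layers (parse_output is a hypothetical helper, not part of this package):

    use serde_json::Value;

    fn parse_output(line: &str) -> Result<(Vec<Vec<String>>, Value), serde_json::Error> {
        let v: Value = serde_json::from_str(line)?;
        // grin2lesion holds a JSON array serialized as a string; parse it a second time
        let lesions: Vec<Vec<String>> =
            serde_json::from_str(v["grin2lesion"].as_str().unwrap_or("[]"))?;
        Ok((lesions, v["summary"].clone()))
    }
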
@@ -145,6 +159,8 @@ struct InputData {
  maf_options: Option<MafOptions>,
  #[serde(rename = "cnvOptions")]
  cnv_options: Option<CnvOptions>,
+ chromosomes: Vec<String>,
+ max_record: usize,
  }

  // Configuration for different data types
@@ -167,6 +183,7 @@ async fn parse_content(
  loss_threshold: f32,
  seg_length: i32,
  cnv_hyper_mutator: i32,
+ chromosomes: &HashSet<String>,
  filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
  filtered_maf_records: &AtomicUsize,
  filtered_cnv_records: &AtomicUsize,
@@ -219,9 +236,6 @@ async fn parse_content(
  let mut parsed_data = Vec::new();
  let mut columns_indices: Vec<usize> = Vec::new();
  let mut variant_classification_index: Option<usize> = None;
- //let mut header_mk: &str = "";
- //let mut columns = Vec::new();
-
  let mut header: Vec<String> = Vec::new();

  for line in lines {
@@ -243,26 +257,49 @@
  continue;
  };

- let row = process_row(
- line,
- case_id,
- data_type,
- &header,
- &columns_indices,
- variant_classification_index,
- consequences,
- min_total_depth,
- min_alt_allele_count,
- gain_threshold,
- loss_threshold,
- seg_length,
- filtered_records,
- filtered_maf_records,
- filtered_cnv_records,
- included_maf_records,
- included_cnv_records,
- )
- .await?;
+ let row = match data_type {
+ "maf" => {
+ process_mafline(
+ line,
+ case_id,
+ data_type,
+ &columns_indices,
+ variant_classification_index,
+ consequences,
+ min_total_depth,
+ min_alt_allele_count,
+ chromosomes,
+ filtered_records,
+ filtered_maf_records,
+ included_maf_records,
+ )
+ .await
+ }
+ "cnv" => {
+ process_cnvline(
+ line,
+ case_id,
+ data_type,
+ &header,
+ &columns_indices,
+ gain_threshold,
+ loss_threshold,
+ seg_length,
+ chromosomes,
+ filtered_records,
+ filtered_cnv_records,
+ included_cnv_records,
+ )
+ .await
+ }
+ _ => {
+ return Err((
+ case_id.to_string(),
+ data_type.to_string(),
+ "Invalid data type".to_string(),
+ ));
+ }
+ }?;

  if let Some(out_lst) = row {
  parsed_data.push(out_lst);
@@ -316,25 +353,20 @@ fn setup_columns(
  Ok(())
  }

- // Process a single row of data
- async fn process_row(
+ // Process a single row of a MAF file
+ async fn process_mafline(
  line: &str,
  case_id: &str,
  data_type: &str,
- header: &[String],
  columns_indices: &[usize],
  variant_classification_index: Option<usize>,
  consequences: &Option<Vec<String>>,
  min_total_depth: i32,
  min_alt_allele_count: i32,
- gain_threshold: f32,
- loss_threshold: f32,
- seg_length: i32,
+ chromosomes: &HashSet<String>,
  filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
  filtered_maf_records: &AtomicUsize,
- filtered_cnv_records: &AtomicUsize,
  included_maf_records: &AtomicUsize,
- included_cnv_records: &AtomicUsize,
  ) -> Result<Option<Vec<String>>, (String, String, String)> {
  let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
  let mut out_lst = vec![case_id.to_string()];
@@ -347,51 +379,38 @@ async fn process_row(
  maf: FilteredMafDetails::default(),
  cnv: FilteredCnvDetails::default(),
  });
-
  let case_details = filtered_map.get_mut(case_id).unwrap();

  // Track total processed records
- if data_type == "maf" {
- case_details.maf.total_processed += 1;
- } else if data_type == "cnv" {
- case_details.cnv.total_processed += 1;
- }
+ case_details.maf.total_processed += 1;

  // Handle consequence filtering and counting for MAF files
- if data_type == "maf" {
- if let Some(var_class_idx) = variant_classification_index {
- if var_class_idx < cont_lst.len() {
- let variant_classification = &cont_lst[var_class_idx];
- if let Some(consequence_filter) = consequences {
- if !consequence_filter.is_empty() {
- if consequence_filter.contains(variant_classification) {
- // Matched consequence
- *case_details
- .maf
- .matched_consequences
- .entry(variant_classification.to_string())
- .or_insert(0) += 1;
- } else {
- // Unmatched consequence
- *case_details
- .maf
- .rejected_consequences
- .entry(variant_classification.to_string())
- .or_insert(0) += 1;
- case_details.maf.excluded_by_consequence_type += 1;
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
- return Ok(None);
- }
- } else {
- // Empty filter, count as matched
+
+ if let Some(var_class_idx) = variant_classification_index {
+ if var_class_idx < cont_lst.len() {
+ let variant_classification = &cont_lst[var_class_idx];
+ if let Some(consequence_filter) = consequences {
+ if !consequence_filter.is_empty() {
+ if consequence_filter.contains(variant_classification) {
+ // Matched consequence
  *case_details
  .maf
  .matched_consequences
  .entry(variant_classification.to_string())
  .or_insert(0) += 1;
+ } else {
+ // Unmatched consequence
+ *case_details
+ .maf
+ .rejected_consequences
+ .entry(variant_classification.to_string())
+ .or_insert(0) += 1;
+ case_details.maf.excluded_by_consequence_type += 1;
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+ return Ok(None);
  }
  } else {
- // No filter, count as matched
+ // Empty filter, count as matched
  *case_details
  .maf
  .matched_consequences
@@ -399,32 +418,142 @@ async fn process_row(
  .or_insert(0) += 1;
  }
  } else {
- case_details.maf.invalid_rows += 1;
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
- return Ok(None);
+ // No filter, count as matched
+ *case_details
+ .maf
+ .matched_consequences
+ .entry(variant_classification.to_string())
+ .or_insert(0) += 1;
  }
  } else {
  case_details.maf.invalid_rows += 1;
  filtered_maf_records.fetch_add(1, Ordering::Relaxed);
  return Ok(None);
  }
+ } else {
+ case_details.maf.invalid_rows += 1;
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+ return Ok(None);
  }

  // Extract relevant columns
  for &x in columns_indices {
  if x >= cont_lst.len() {
- if data_type == "maf" {
- case_details.maf.invalid_rows += 1;
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
- } else if data_type == "cnv" {
- case_details.cnv.invalid_rows += 1;
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
- }
+ case_details.maf.invalid_rows += 1;
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
  return Ok(None); // Invalid row
  }
+ let element = cont_lst[x].to_string();
+ out_lst.push(element);
+ }
+
+ // Additional MAF-specific processing
+ if out_lst.len() < 6 {
+ case_details.maf.invalid_rows += 1;
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+ return Ok(None); // Not enough columns
+ }
+
+ let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
+ case_details.maf.invalid_rows += 1;
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+ (
+ case_id.to_string(),
+ data_type.to_string(),
+ "Failed to convert t_depth to integer.".to_string(),
+ )
+ })?;
+
+ let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
+ case_details.maf.invalid_rows += 1;
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+ (
+ case_id.to_string(),
+ data_type.to_string(),
+ "Failed to convert t_alt_count to integer.".to_string(),
+ )
+ })?;
+
+ if alle_depth < min_total_depth {
+ case_details.maf.t_depth += 1;
+ case_details.maf.excluded_by_min_depth += 1;
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+ return Ok(None);
+ }
+ if alt_count < min_alt_allele_count {
+ case_details.maf.t_alt_count += 1;
+ case_details.maf.excluded_by_min_alt_count += 1;
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+ return Ok(None);
+ }
+
+ // Keep case_id, chr, start, end, and add "mutation"
+ out_lst = out_lst[0..4].to_vec();
+ out_lst.push("mutation".to_string());
+
+ // Add a 'chr' prefix if the chromosome does not start with 'chr'
+ if !out_lst[1].starts_with("chr") {
+ out_lst[1] = format!("chr{}", out_lst[1]);
+ }
+
+ // Chromosome filtering
+ if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
+ *case_details
+ .maf
+ .skipped_chromosomes
+ .entry(out_lst[1].clone())
+ .or_insert(0) += 1;
+ filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+ return Ok(None);
+ }
+
+ // Update counters for included MAF records
+ case_details.maf.total_included += 1;
+ included_maf_records.fetch_add(1, Ordering::Relaxed);

+ Ok(Some(out_lst))
+ }
+
+ // Process a single row of a CNV file
+ async fn process_cnvline(
+ line: &str,
+ case_id: &str,
+ data_type: &str,
+ header: &[String],
+ columns_indices: &[usize],
+ gain_threshold: f32,
+ loss_threshold: f32,
+ seg_length: i32,
+ chromosomes: &HashSet<String>,
+ filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
+ filtered_cnv_records: &AtomicUsize,
+ included_cnv_records: &AtomicUsize,
+ ) -> Result<Option<Vec<String>>, (String, String, String)> {
+ let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
+ let mut out_lst = vec![case_id.to_string()];
+
+ // Initialize or update case details
+ let mut filtered_map = filtered_records.lock().await;
+ filtered_map
+ .entry(case_id.to_string())
+ .or_insert_with(|| FilteredCaseDetails {
+ maf: FilteredMafDetails::default(),
+ cnv: FilteredCnvDetails::default(),
+ });
+ let case_details = filtered_map.get_mut(case_id).unwrap();
+
+ // Track total processed records
+ case_details.cnv.total_processed += 1;
+
+ // Extract relevant columns
+ for &x in columns_indices {
+ if x >= cont_lst.len() {
+ case_details.cnv.invalid_rows += 1;
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+ return Ok(None); // Invalid row
+ }
  let mut element = cont_lst[x].to_string();
- if data_type == "cnv" && header[x] == "Segment_Mean" {
+ if header[x] == "Segment_Mean" {
  element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
  if element.is_empty() {
  case_details.cnv.segment_mean += 1;
@@ -444,89 +573,55 @@ async fn process_row(
  out_lst.push(element);
  }

- // Additional MAF-specific processing
- if data_type == "maf" {
- if out_lst.len() < 6 {
- case_details.maf.invalid_rows += 1;
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
- return Ok(None); // Not enough columns
- }
-
- let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
- case_details.maf.invalid_rows += 1;
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
- (
- case_id.to_string(),
- data_type.to_string(),
- "Failed to convert t_depth to integer.".to_string(),
- )
- })?;
-
- let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
- case_details.maf.invalid_rows += 1;
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
- (
- case_id.to_string(),
- data_type.to_string(),
- "Failed to convert t_alt_count to integer.".to_string(),
- )
- })?;
-
- if alle_depth < min_total_depth {
- case_details.maf.t_depth += 1;
- case_details.maf.excluded_by_min_depth += 1;
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
- return Ok(None);
- }
- if alt_count < min_alt_allele_count {
- case_details.maf.t_alt_count += 1;
- case_details.maf.excluded_by_min_alt_count += 1;
- filtered_maf_records.fetch_add(1, Ordering::Relaxed);
- return Ok(None);
- }
-
- // Keep case_id, chr, start, end, and add "mutation"
- out_lst = out_lst[0..4].to_vec();
- out_lst.push("mutation".to_string());
+ // filter cnvs based on segment length. Default: 0 (no filtering)
+ // calculate segment length (End_Position - Start_Position)
+ let end_position = out_lst[3].parse::<i32>().map_err(|_| {
+ case_details.cnv.invalid_rows += 1;
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+ (
+ case_id.to_string(),
+ data_type.to_string(),
+ "Failed to convert End Position of cnv to integer.".to_string(),
+ )
+ })?;

- // Update counters for included MAF records
- case_details.maf.total_included += 1;
- included_maf_records.fetch_add(1, Ordering::Relaxed);
+ let start_position = out_lst[2].parse::<i32>().map_err(|_| {
+ case_details.cnv.invalid_rows += 1;
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+ (
+ case_id.to_string(),
+ data_type.to_string(),
+ "Failed to convert Start Position of cnv to integer.".to_string(),
+ )
+ })?;
+ let cnv_length = end_position - start_position;
+ if seg_length > 0 && cnv_length > seg_length {
+ case_details.cnv.seg_length += 1;
+ case_details.cnv.excluded_by_segment_length += 1;
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+ return Ok(None);
  }

- // filter cnvs based on segment length. Default: 0 (no filtering)
- if data_type == "cnv" {
- // calculate segment length (End_Position - Start_Position)
- let end_position = out_lst[3].parse::<i32>().map_err(|_| {
- case_details.cnv.invalid_rows += 1;
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
- (
- case_id.to_string(),
- data_type.to_string(),
- "Failed to convert End Position of cnv to integer.".to_string(),
- )
- })?;
+ // Add a 'chr' prefix if the chromosome does not start with 'chr'
+ if !out_lst[1].starts_with("chr") {
+ out_lst[1] = format!("chr{}", out_lst[1]);
+ }

- let start_position = out_lst[2].parse::<i32>().map_err(|_| {
- case_details.cnv.invalid_rows += 1;
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
- (
- case_id.to_string(),
- data_type.to_string(),
- "Failed to convert Start Position of cnv to integer.".to_string(),
- )
- })?;
- let cnv_length = end_position - start_position;
- if seg_length > 0 && cnv_length > seg_length {
- case_details.cnv.seg_length += 1;
- case_details.cnv.excluded_by_segment_length += 1;
- filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
- return Ok(None);
- }
- case_details.cnv.total_included += 1;
- included_cnv_records.fetch_add(1, Ordering::Relaxed);
+ // Chromosome filtering
+ if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
+ *case_details
+ .cnv
+ .skipped_chromosomes
+ .entry(out_lst[1].clone())
+ .or_insert(0) += 1;
+ filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+ return Ok(None);
  }

+ // Update counters for included CNV records
+ case_details.cnv.total_included += 1;
+ included_cnv_records.fetch_add(1, Ordering::Relaxed);
+
  Ok(Some(out_lst))
  }

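The chromosome handling added to both process_mafline and process_cnvline follows the same two steps: normalize the name to a "chr" prefix, then drop the row unless the allow-list is empty or contains it. A condensed sketch of that shared logic (keep_chromosome is a hypothetical helper, not in the diff):

    use std::collections::HashSet;

    fn keep_chromosome(raw: &str, allowed: &HashSet<String>) -> Option<String> {
        // Normalize: prepend "chr" when missing, matching the code above
        let chrom = if raw.starts_with("chr") { raw.to_string() } else { format!("chr{}", raw) };
        // An empty set means "include everything" (no filtering)
        if allowed.is_empty() || allowed.contains(&chrom) {
            Some(chrom)
        } else {
            None
        }
    }
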
@@ -658,10 +753,9 @@ async fn download_single_file(
  ))
  }

- /// Streaming download function
+ /// Downloading from GDC
  /// Outputs JSONL format: one JSON object per line
- /// Node.js will read this line-by-line but still wait for completion
- async fn download_data_streaming(
+ async fn download_data(
  data4dl: HashMap<String, DataType>,
  host: &str,
  min_total_depth: i32,
@@ -672,6 +766,8 @@ async fn download_data_streaming(
  loss_threshold: f32,
  seg_length: i32,
  cnv_hyper_mutator: i32,
+ chromosomes: &HashSet<String>,
+ max_record: usize,
  ) {
  let data_urls: Vec<(String, String, String)> = data4dl
  .into_iter()
@@ -696,8 +792,11 @@
  let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
  let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
  let hyper_mutator_records = Arc::new(Mutex::new(HashMap::<String, Vec<String>>::new()));
+ let excluded_by_max_record = Arc::new(Mutex::new(HashMap::<String, Vec<String>>::new()));
  let included_maf_records = Arc::new(AtomicUsize::new(0));
  let included_cnv_records = Arc::new(AtomicUsize::new(0));
+ let all_records = Arc::new(Mutex::new(Vec::<Vec<String>>::new()));
+ let data_count = Arc::new(AtomicUsize::new(0));

  // Only collect errors (successful data is output immediately)
  let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
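
The shared state introduced here follows the pattern the file already uses: plain counters are lock-free AtomicUsize values, while the new all_records buffer and the excluded_by_max_record map are Arc<Mutex<...>> values cloned into each concurrent task. A minimal sketch of that pattern under the same tokio runtime (the names are illustrative, not the package's code):

    use std::sync::Arc;
    use std::sync::atomic::{AtomicUsize, Ordering};
    use tokio::sync::Mutex;

    #[tokio::main]
    async fn main() {
        let data_count = Arc::new(AtomicUsize::new(0));
        let all_records = Arc::new(Mutex::new(Vec::<Vec<String>>::new()));
        let (count, records) = (Arc::clone(&data_count), Arc::clone(&all_records));
        tokio::spawn(async move {
            count.fetch_add(1, Ordering::Relaxed);         // lock-free counter
            records.lock().await.push(vec!["row".into()]); // async mutex for the buffer
        })
        .await
        .unwrap();
        assert_eq!(data_count.load(Ordering::Relaxed), 1);
    }
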
@@ -720,9 +819,25 @@
  let included_maf_records = Arc::clone(&included_maf_records);
  let included_cnv_records = Arc::clone(&included_cnv_records);
  let hyper_mutator_records = Arc::clone(&hyper_mutator_records);
+ let excluded_by_max_record = Arc::clone(&excluded_by_max_record);
  let errors = Arc::clone(&errors);
+ let all_records = Arc::clone(&all_records);
+ let data_count = Arc::clone(&data_count);

  async move {
+ let current_count = data_count.load(Ordering::Relaxed);
+ if current_count >= max_record {
+ // Skip processing and mark as excluded by max_record
+ if let Ok((case_id, data_type, _)) = download_result {
+ let mut exclud_max_record = excluded_by_max_record.lock().await;
+ exclud_max_record
+ .entry(data_type.to_string())
+ .or_insert_with(Vec::new)
+ .push(case_id.to_string());
+ successful_downloads.fetch_add(1, Ordering::Relaxed);
+ }
+ return;
+ }
  match download_result {
  Ok((case_id, data_type, content)) => {
  // Try to parse the content
@@ -738,6 +853,7 @@
  loss_threshold,
  seg_length,
  cnv_hyper_mutator,
+ &chromosomes,
  &filtered_records,
  &filtered_maf_records,
  &filtered_cnv_records,
@@ -748,24 +864,18 @@
  .await
  {
  Ok(parsed_data) => {
- // SUCCESS: Output immediately as JSONL
- let success_output = SuccessfulFileOutput {
- output_type: "data".to_string(),
- case_id: case_id.clone(),
- data_type: data_type.clone(),
- data: parsed_data,
- };
-
- // Output this successful result immediately - Node.js will see this in real-time
- if let Ok(json) = serde_json::to_string(&success_output) {
- println!("{}", json); // IMMEDIATE output to stdout
- // Force flush to ensure Node.js sees it immediately
- use std::io::Write;
- let _ = std::io::stdout().flush();
- // Optional: Add small delay to separate lines
- sleep(Duration::from_millis(10));
+ let remaining = max_record - current_count;
+ if parsed_data.len() <= remaining {
+ data_count.fetch_add(parsed_data.len(), Ordering::Relaxed);
+ all_records.lock().await.extend(parsed_data);
+ } else {
+ // Skip file if it would exceed max_record
+ let mut exclud_max_record = excluded_by_max_record.lock().await;
+ exclud_max_record
+ .entry(data_type.to_string())
+ .or_insert_with(Vec::new)
+ .push(case_id.to_string());
  }
-
  successful_downloads.fetch_add(1, Ordering::Relaxed);
  }
  Err((cid, dtp, error)) => {
@@ -816,7 +926,6 @@
  let included_cnv_count = included_cnv_records.load(Ordering::Relaxed);

  let summary = FinalSummary {
- output_type: "summary".to_string(),
  total_files,
  successful_files: success_count,
  failed_files: failed_count,
@@ -828,10 +937,214 @@
  included_maf_records: included_maf_count,
  included_cnv_records: included_cnv_count,
  hyper_mutator_records: hyper_mutator_records.lock().await.clone(),
+ excluded_by_max_record: excluded_by_max_record.lock().await.clone(),
  };

+ let grin2lesion = serde_json::to_string(&all_records.lock().await.drain(..).collect::<Vec<Vec<String>>>())
+ .unwrap_or_else(|_| "[]".to_string());
+ let output = Output { grin2lesion, summary };
+
  // Output final summary - Node.js will know processing is complete when it sees this
- if let Ok(json) = serde_json::to_string(&summary) {
+ if let Ok(json) = serde_json::to_string(&output) {
+ println!("{}", json);
+ use std::io::Write;
+ let _ = std::io::stdout().flush();
+ }
+ }
+
+ /// Read data from local files
+ async fn localread_data(
+ case_files: HashMap<String, DataType>,
+ min_total_depth: i32,
+ min_alt_allele_count: i32,
+ maf_hyper_mutator: i32,
+ consequences: &Option<Vec<String>>,
+ gain_threshold: f32,
+ loss_threshold: f32,
+ seg_length: i32,
+ cnv_hyper_mutator: i32,
+ chromosomes: &HashSet<String>,
+ max_record: usize,
+ ) {
+ let data_files: Vec<(String, String, String)> = case_files
+ .into_iter()
+ .flat_map(|(case_id, data_types)| {
+ let mut files = Vec::new();
+ if let Some(cnv_file) = &data_types.cnv {
+ files.push((case_id.clone(), "cnv".to_string(), cnv_file.clone()));
+ }
+ if let Some(maf_file) = &data_types.maf {
+ files.push((case_id.clone(), "maf".to_string(), maf_file.clone()));
+ }
+ files
+ })
+ .collect();
+ let total_files = data_files.len();
+
+ // Counters for final summary
+ let successful_reads = Arc::new(AtomicUsize::new(0));
+ let failed_reads = Arc::new(AtomicUsize::new(0));
+ let filtered_maf_records = Arc::new(AtomicUsize::new(0));
+ let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
+ let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
+ let hyper_mutator_records = Arc::new(Mutex::new(HashMap::<String, Vec<String>>::new()));
+ let excluded_by_max_record = Arc::new(Mutex::new(HashMap::<String, Vec<String>>::new()));
+ let included_maf_records = Arc::new(AtomicUsize::new(0));
+ let included_cnv_records = Arc::new(AtomicUsize::new(0));
+ let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
+ let all_records = Arc::new(Mutex::new(Vec::<Vec<String>>::new()));
+ let data_count = Arc::new(AtomicUsize::new(0));
+
+ // Process files concurrently
+ let read_futures = futures::stream::iter(data_files.into_iter().map(
+ |(case_id, data_type, file_path)| async move {
+ // read the local file
+ match fs::read_to_string(&file_path) {
+ Ok(content) => Ok((case_id, data_type, content)),
+ Err(e) => Err((
+ case_id,
+ data_type,
+ format!("file_read_error: {}", e),
+ 1, // Single attempt for local file reading
+ )),
+ }
+ },
+ ));
+
+ // Process files and output results
+ read_futures
+ .buffer_unordered(3)
+ .for_each(|read_result| {
+ let successful_reads = Arc::clone(&successful_reads);
+ let failed_reads = Arc::clone(&failed_reads);
+ let filtered_maf_records = Arc::clone(&filtered_maf_records);
+ let filtered_cnv_records = Arc::clone(&filtered_cnv_records);
+ let filtered_records = Arc::clone(&filtered_records);
+ let included_maf_records = Arc::clone(&included_maf_records);
+ let included_cnv_records = Arc::clone(&included_cnv_records);
+ let hyper_mutator_records = Arc::clone(&hyper_mutator_records);
+ let excluded_by_max_record = Arc::clone(&excluded_by_max_record);
+ let errors = Arc::clone(&errors);
+ let all_records = Arc::clone(&all_records);
+ let data_count = Arc::clone(&data_count);
+
+ async move {
+ let current_count = data_count.load(Ordering::Relaxed);
+ if current_count >= max_record {
+ // Skip processing and mark as excluded by max_record
+ if let Ok((case_id, data_type, _)) = read_result {
+ let mut exclud_max_record = excluded_by_max_record.lock().await;
+ exclud_max_record
+ .entry(data_type.to_string())
+ .or_insert_with(Vec::new)
+ .push(case_id.to_string());
+ successful_reads.fetch_add(1, Ordering::Relaxed);
+ }
+ return;
+ }
+ match read_result {
+ Ok((case_id, data_type, content)) => {
+ match parse_content(
+ &content,
+ &case_id,
+ &data_type,
+ min_total_depth,
+ min_alt_allele_count,
+ maf_hyper_mutator,
+ consequences,
+ gain_threshold,
+ loss_threshold,
+ seg_length,
+ cnv_hyper_mutator,
+ chromosomes,
+ &filtered_records,
+ &filtered_maf_records,
+ &filtered_cnv_records,
+ &included_maf_records,
+ &included_cnv_records,
+ &hyper_mutator_records,
+ )
+ .await
+ {
+ Ok(parsed_data) => {
+ let remaining = max_record - current_count;
+ if parsed_data.len() <= remaining {
+ data_count.fetch_add(parsed_data.len(), Ordering::Relaxed);
+ all_records.lock().await.extend(parsed_data);
+ } else {
+ // Skip file if it would exceed max_record
+ let mut exclud_max_record = excluded_by_max_record.lock().await;
+ exclud_max_record
+ .entry(data_type.to_string())
+ .or_insert_with(Vec::new)
+ .push(case_id.to_string());
+ }
+ successful_reads.fetch_add(1, Ordering::Relaxed);
+ }
+ Err((cid, dtp, error)) => {
+ failed_reads.fetch_add(1, Ordering::Relaxed);
+ let error = ErrorEntry {
+ case_id: cid,
+ data_type: dtp,
+ error_type: "parsing_error".to_string(),
+ error_details: error,
+ attempts_made: 1,
+ };
+ errors.lock().await.push(error);
+ }
+ }
+ }
+ Err((case_id, data_type, error_details, attempts)) => {
+ failed_reads.fetch_add(1, Ordering::Relaxed);
+ let (error_type, clean_details) = if error_details.contains(":") {
+ let parts: Vec<&str> = error_details.splitn(2, ": ").collect();
+ (parts[0].to_string(), parts[1].to_string())
+ } else {
+ ("unknown_error".to_string(), error_details)
+ };
+ let error = ErrorEntry {
+ case_id,
+ data_type,
+ error_type,
+ error_details: clean_details,
+ attempts_made: attempts,
+ };
+ errors.lock().await.push(error);
+ }
+ }
+ }
+ })
+ .await;
+ // Output final summary as the last line
+ let success_count = successful_reads.load(Ordering::Relaxed);
+ let failed_count = failed_reads.load(Ordering::Relaxed);
+ let filtered_maf_count = filtered_maf_records.load(Ordering::Relaxed);
+ let filtered_cnv_count = filtered_cnv_records.load(Ordering::Relaxed);
+ let included_maf_count = included_maf_records.load(Ordering::Relaxed);
+ let included_cnv_count = included_cnv_records.load(Ordering::Relaxed);
+
+ let summary = FinalSummary {
+ total_files,
+ successful_files: success_count,
+ failed_files: failed_count,
+ errors: errors.lock().await.clone(),
+ filtered_records: filtered_maf_count + filtered_cnv_count,
+ filtered_maf_records: filtered_maf_count,
+ filtered_cnv_records: filtered_cnv_count,
+ filtered_records_by_case: filtered_records.lock().await.clone(),
+ included_maf_records: included_maf_count,
+ included_cnv_records: included_cnv_count,
+ hyper_mutator_records: hyper_mutator_records.lock().await.clone(),
+ excluded_by_max_record: excluded_by_max_record.lock().await.clone(),
+ };
+
+ let grin2lesion = serde_json::to_string(&all_records.lock().await.drain(..).collect::<Vec<Vec<String>>>())
+ .unwrap_or_else(|_| "[]".to_string());
+ let output = Output { grin2lesion, summary };
+
+ // Output final JSON object
+ if let Ok(json) = serde_json::to_string(&output) {
  println!("{}", json);
  use std::io::Write;
  let _ = std::io::stdout().flush();
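
Both download_data and localread_data apply the max_record cap the same way: read data_count, skip the whole file when the cap is already reached or when the file's rows would overflow it, and otherwise add the rows and bump the counter. Because the load and the fetch_add are separate operations and up to three files are processed concurrently (buffer_unordered(3)), the cap is best-effort rather than exact. A condensed sketch of that check (try_reserve is a hypothetical helper):

    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Returns true when `batch` rows fit under `max_record`; callers that get
    /// false record the file under excluded_by_max_record instead.
    fn try_reserve(data_count: &AtomicUsize, batch: usize, max_record: usize) -> bool {
        let current = data_count.load(Ordering::Relaxed);
        if current >= max_record || batch > max_record - current {
            return false;
        }
        data_count.fetch_add(batch, Ordering::Relaxed);
        true
    }
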
@@ -840,6 +1153,9 @@ async fn download_data_streaming(

  #[tokio::main]
  async fn main() -> Result<(), Box<dyn std::error::Error>> {
+ let args: Vec<String> = env::args().collect();
+ let from_file = args.contains(&"--from-file".to_string());
+
  const HOST: &str = "https://api.gdc.cancer.gov/data/";

  // Read input with timeout
@@ -879,6 +1195,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
  }

  let case_files = input_js.case_files;
+ let max_record: usize = input_js.max_record;

  // Set default maf_options
  let (min_total_depth, min_alt_allele_count, maf_hyper_mutator, consequences) = match input_js.maf_options {
@@ -902,20 +1219,42 @@
  None => (0.3, -0.4, 0, 500), // Default values
  };

- // Download data - this will now handle errors gracefully
- download_data_streaming(
- case_files,
- HOST,
- min_total_depth,
- min_alt_allele_count,
- maf_hyper_mutator,
- &consequences,
- gain_threshold,
- loss_threshold,
- seg_length,
- cnv_hyper_mutator,
- )
- .await;
+ // Convert Vec<String> to HashSet<String> for faster lookup
+ let chromosomes = input_js.chromosomes.into_iter().collect::<HashSet<String>>();
+
+ if from_file {
+ localread_data(
+ case_files,
+ min_total_depth,
+ min_alt_allele_count,
+ maf_hyper_mutator,
+ &consequences,
+ gain_threshold,
+ loss_threshold,
+ seg_length,
+ cnv_hyper_mutator,
+ &chromosomes,
+ max_record,
+ )
+ .await;
+ } else {
+ // Download data from GDC - this will now handle errors gracefully
+ download_data(
+ case_files,
+ HOST,
+ min_total_depth,
+ min_alt_allele_count,
+ maf_hyper_mutator,
+ &consequences,
+ gain_threshold,
+ loss_threshold,
+ seg_length,
+ cnv_hyper_mutator,
+ &chromosomes,
+ max_record,
+ )
+ .await;
+ }

  // Always exit successfully - individual file failures are logged but don't stop the process
  Ok(())
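
For completeness, one way to exercise both modes end to end from a Rust harness; this helper is hypothetical and assumes the release binary has already been built with cargo build --release:

    use std::io::Write;
    use std::process::{Command, Stdio};

    fn run_gdc_grin2(input_json: &str, from_file: bool) -> std::io::Result<String> {
        let mut cmd = Command::new("./target/release/gdcGRIN2");
        if from_file {
            cmd.arg("--from-file"); // flag position does not matter; main() scans all args
        }
        let mut child = cmd.stdin(Stdio::piped()).stdout(Stdio::piped()).spawn()?;
        child.stdin.as_mut().unwrap().write_all(input_json.as_bytes())?;
        let output = child.wait_with_output()?;
        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
    }
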