@sjcrh/proteinpaint-rust 2.129.6-2b2fdc7ee.0 → 2.130.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/gdcGRIN2.rs CHANGED
@@ -13,18 +13,19 @@
 Output mutations as JSON array.
 
 Example of usage:
-echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}},"mafOptions": {"minTotalDepth": 10,"minAltAlleleCount": 2}}' | ./target/release/gdcGRIN2
+echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000}}' | ./target/release/gdcGRIN2
 */
 
 use flate2::read::GzDecoder;
 use futures::StreamExt;
 use memchr::memchr;
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};
 use serde_json;
 use std::collections::HashMap;
 use std::io::{self, Read};
 use std::sync::Arc;
 use std::sync::atomic::{AtomicUsize, Ordering};
+use std::thread::sleep;
 use std::time::Duration;
 use tokio::io::{AsyncReadExt, BufReader};
 use tokio::sync::Mutex;
@@ -40,21 +41,6 @@ struct ErrorEntry {
     attempts_made: u32,
 }
 
-// Struct for the final output that includes both successful data and errors
-#[derive(serde::Serialize)]
-struct GdcOutput {
-    successful_data: Vec<Vec<Vec<String>>>, // Array of successful file data arrays
-    failed_files: Vec<ErrorEntry>,
-    summary: OutputSummary,
-}
-
-#[derive(serde::Serialize)]
-struct OutputSummary {
-    total_files: usize,
-    successful_files: usize,
-    failed_files: usize,
-}
-
 // Define the structure for datadd
 #[derive(Deserialize, Debug)]
 struct DataType {
@@ -69,6 +55,67 @@ struct MafOptions {
     min_total_depth: i32,
     #[serde(rename = "minAltAlleleCount")]
     min_alt_allele_count: i32,
+    consequences: Option<Vec<String>>, // Optional list of consequences to filter MAF files
+}
+
+// Define the structure for cnvOptions
+#[derive(Deserialize, Debug)]
+struct CnvOptions {
+    #[serde(rename = "lossThreshold")]
+    loss_threshold: f32,
+    #[serde(rename = "gainThreshold")]
+    gain_threshold: f32,
+    #[serde(rename = "segLength")]
+    seg_length: i32,
+}
+
+// Individual successful file output (JSONL format)
+#[derive(serde::Serialize)]
+struct SuccessfulFileOutput {
+    #[serde(rename = "type")]
+    output_type: String, // Always "data"
+    case_id: String,
+    data_type: String,
+    data: Vec<Vec<String>>,
+}
+
+// struct for MAF filter details
+#[derive(Clone, Serialize, Default)]
+struct FilteredMafDetails {
+    invalid_consequences: usize,
+    t_alt_count: usize,
+    t_depth: usize,
+    invalid_rows: usize,
+}
+
+// struct for CNV filter details
+#[derive(Clone, Serialize, Default)]
+struct FilteredCnvDetails {
+    segment_mean: usize,
+    seg_length: usize,
+    invalid_rows: usize,
+}
+
+// struct for per-case filter details
+#[derive(Clone, Serialize)]
+struct FilteredCaseDetails {
+    maf: FilteredMafDetails,
+    cnv: FilteredCnvDetails,
+}
+
+// Final summary output (JSONL format)
+#[derive(serde::Serialize)]
+struct FinalSummary {
+    #[serde(rename = "type")]
+    output_type: String, // Always "summary"
+    total_files: usize,
+    successful_files: usize,
+    failed_files: usize,
+    errors: Vec<ErrorEntry>,
+    filtered_records: usize,
+    filtered_maf_records: usize,
+    filtered_cnv_records: usize,
+    filtered_records_by_case: HashMap<String, FilteredCaseDetails>,
 }
 
 // Define the top-level input structure
@@ -78,104 +125,100 @@ struct InputData {
     case_files: HashMap<String, DataType>,
     #[serde(rename = "mafOptions")]
     maf_options: Option<MafOptions>,
+    #[serde(rename = "cnvOptions")]
+    cnv_options: Option<CnvOptions>,
+}
+
+// Configuration for different data types
+#[derive(Deserialize, Debug)]
+struct DataTypeConfig {
+    header_marker: &'static str,
+    output_columns: Vec<&'static str>,
 }
 
 // Function to parse TSV content
-fn parse_content(
+async fn parse_content(
     content: &str,
     case_id: &str,
     data_type: &str,
     min_total_depth: i32,
     min_alt_allele_count: i32,
+    consequences: &Option<Vec<String>>,
+    gain_threshold: f32,
+    loss_threshold: f32,
+    seg_length: i32,
+    filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
+    filtered_maf_records: &AtomicUsize,
+    filtered_cnv_records: &AtomicUsize,
 ) -> Result<Vec<Vec<String>>, (String, String, String)> {
+    let config = match data_type {
+        "cnv" => DataTypeConfig {
+            header_marker: "Segment_Mean",
+            output_columns: vec!["Chromosome", "Start", "End", "Segment_Mean"],
+        },
+        "maf" => DataTypeConfig {
+            header_marker: "Hugo_Symbol",
+            output_columns: vec!["Chromosome", "Start_Position", "End_Position", "t_depth", "t_alt_count"],
+        },
+        _ => {
+            return Err((
+                case_id.to_string(),
+                data_type.to_string(),
+                "Invalid data type".to_string(),
+            ));
+        }
+    };
+
     let lines = content.lines();
     let mut parsed_data = Vec::new();
     let mut columns_indices: Vec<usize> = Vec::new();
-    let mut header_mk: &str = "";
-    let mut columns = Vec::new();
-
-    if data_type == "cnv" {
-        header_mk = "GDC_Aliquot_ID";
-        columns = vec!["Chromosome", "Start", "End", "Segment_Mean"]
-    } else if data_type == "maf" {
-        header_mk = "Hugo_Symbol";
-        columns = vec!["Chromosome", "Start_Position", "End_Position", "t_depth", "t_alt_count"]
-    };
+    let mut variant_classification_index: Option<usize> = None;
+    //let mut header_mk: &str = "";
+    //let mut columns = Vec::new();
 
     let mut header: Vec<String> = Vec::new();
 
     for line in lines {
         if line.starts_with("#") {
             continue;
-        } else if line.contains(&header_mk) {
+        };
+        if line.contains(config.header_marker) {
             header = line.split("\t").map(|s| s.to_string()).collect();
-            for col in &columns {
-                match header.iter().position(|x| x == col) {
-                    Some(index) => {
-                        columns_indices.push(index);
-                    }
-                    None => {
-                        let error_msg = format!("Column {} was not found", col);
-                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
-                    }
-                }
-            }
-        } else {
-            let mut keep_ck: bool = true;
-            let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
-            let mut out_lst: Vec<String> = Vec::new();
-            out_lst.push(case_id.to_string());
-
-            for x in columns_indices.iter() {
-                let mut element = cont_lst[*x].to_string();
-
-                if data_type == "cnv" && &header[*x] == "Segment_Mean" {
-                    let seg_mean = match element.parse::<f32>() {
-                        Ok(val) => val,
-                        Err(_e) => {
-                            let error_msg = "Segment_Mean in cnv file is not float".to_string();
-                            return Err((case_id.to_string(), data_type.to_string(), error_msg));
-                        }
-                    };
-                    if seg_mean >= 0.3 {
-                        element = "gain".to_string();
-                    } else if seg_mean <= -0.4 {
-                        element = "loss".to_string();
-                    } else {
-                        keep_ck = false;
-                    }
-                }
-                out_lst.push(element);
-            }
-
-            if data_type == "maf" {
-                let alle_depth = match out_lst[4].parse::<i32>() {
-                    Ok(value) => value,
-                    Err(_) => {
-                        let error_msg = "Failed to convert t_depth to i32.".to_string();
-                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
-                    }
-                };
-                let alt_count = match out_lst[5].parse::<i32>() {
-                    Ok(value) => value,
-                    Err(_) => {
-                        let error_msg = "Failed to convert t_alt_count to i32.".to_string();
-                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
-                    }
-                };
-
-                if alle_depth >= min_total_depth && alt_count >= min_alt_allele_count {
-                    out_lst = out_lst[0..4].to_vec();
-                    out_lst.push("mutation".to_string());
-                } else {
-                    keep_ck = false;
-                }
+            if let Err(err) = setup_columns(
+                &header,
+                &config,
+                &mut columns_indices,
+                &mut variant_classification_index,
+                case_id,
+                data_type,
+            ) {
+                return Err(err);
             }
+            continue;
+        };
 
-            if keep_ck {
-                parsed_data.push(out_lst);
-            }
-        }
+        let row = process_row(
+            line,
+            case_id,
+            data_type,
+            &header,
+            &columns_indices,
+            variant_classification_index,
+            consequences,
+            min_total_depth,
+            min_alt_allele_count,
+            gain_threshold,
+            loss_threshold,
+            seg_length,
+            filtered_records,
+            filtered_maf_records,
+            filtered_cnv_records,
+        )
+        .await?;
+
+        if let Some(out_lst) = row {
+            parsed_data.push(out_lst);
+        };
     }
 
     if columns_indices.is_empty() {
@@ -189,6 +232,248 @@ fn parse_content(
     Ok(parsed_data)
 }
 
+// Set up column indices for processing
+fn setup_columns(
+    header: &[String],
+    config: &DataTypeConfig,
+    columns_indices: &mut Vec<usize>,
+    variant_classification_index: &mut Option<usize>,
+    case_id: &str,
+    data_type: &str,
+) -> Result<(), (String, String, String)> {
+    for col in &config.output_columns {
+        match header.iter().position(|x| x == col) {
+            Some(index) => columns_indices.push(index),
+            None => {
+                return Err((
+                    case_id.to_string(),
+                    data_type.to_string(),
+                    format!("Column {} was not found", col),
+                ));
+            }
+        }
+    }
+
+    if data_type == "maf" {
+        *variant_classification_index = header.iter().position(|x| x == "Variant_Classification");
+        if variant_classification_index.is_none() {
+            return Err((
+                case_id.to_string(),
+                data_type.to_string(),
+                "Column Variant_Classification was not found".to_string(),
+            ));
+        }
+    }
+
+    Ok(())
+}
+
+// Process a single row of data
+async fn process_row(
+    line: &str,
+    case_id: &str,
+    data_type: &str,
+    header: &[String],
+    columns_indices: &[usize],
+    variant_classification_index: Option<usize>,
+    consequences: &Option<Vec<String>>,
+    min_total_depth: i32,
+    min_alt_allele_count: i32,
+    gain_threshold: f32,
+    loss_threshold: f32,
+    seg_length: i32,
+    filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
+    filtered_maf_records: &AtomicUsize,
+    filtered_cnv_records: &AtomicUsize,
+) -> Result<Option<Vec<String>>, (String, String, String)> {
+    let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
+    let mut out_lst = vec![case_id.to_string()];
+
+    // Initialize or update case details
+    let mut filtered_map = filtered_records.lock().await;
+    filtered_map
+        .entry(case_id.to_string())
+        .or_insert_with(|| FilteredCaseDetails {
+            maf: FilteredMafDetails::default(),
+            cnv: FilteredCnvDetails::default(),
+        });
+
+    let case_details = filtered_map.get_mut(case_id).unwrap();
+
+    // Check consequence filtering for MAF files
+    if data_type == "maf" && !is_valid_consequence(&cont_lst, variant_classification_index, consequences) {
+        case_details.maf.invalid_consequences += 1;
+        filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+        return Ok(None);
+    }
+
+    // Extract relevant columns
+    for &x in columns_indices {
+        if x >= cont_lst.len() {
+            if data_type == "maf" {
+                case_details.maf.invalid_rows += 1;
+                filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            } else if data_type == "cnv" {
+                case_details.cnv.invalid_rows += 1;
+                filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+            }
+            return Ok(None); // Invalid row
+        }
+
+        let mut element = cont_lst[x].to_string();
+        if data_type == "cnv" && header[x] == "Segment_Mean" {
+            element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
+            if element.is_empty() {
+                case_details.cnv.segment_mean += 1;
+                filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+                return Ok(None);
+            }
+        }
+        out_lst.push(element);
+    }
+
+    // Additional MAF-specific processing
+    if data_type == "maf" {
+        if out_lst.len() < 6 {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None); // Not enough columns
+        }
+
+        let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert t_depth to integer.".to_string(),
+            )
+        })?;
+
+        let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert t_alt_count to integer.".to_string(),
+            )
+        })?;
+
+        if alle_depth < min_total_depth {
+            case_details.maf.t_depth += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None);
+        }
+        if alt_count < min_alt_allele_count {
+            case_details.maf.t_alt_count += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None);
+        }
+
+        // Keep case_id, chr, start, end, and add "mutation"
+        out_lst = out_lst[0..4].to_vec();
+        out_lst.push("mutation".to_string());
+    }
+
+    // filter cnvs based on segment length. Default: 0 (no filtering)
+    if data_type == "cnv" {
+        // calculate segment length (End_Position - Start_Position)
+        let end_position = out_lst[3].parse::<i32>().map_err(|_| {
+            case_details.cnv.invalid_rows += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert End Position of cnv to integer.".to_string(),
+            )
+        })?;
+
+        let start_position = out_lst[2].parse::<i32>().map_err(|_| {
+            case_details.cnv.invalid_rows += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+            (
+                case_id.to_string(),
+                data_type.to_string(),
+                "Failed to convert Start Position of cnv to integer.".to_string(),
+            )
+        })?;
+        let cnv_length = end_position - start_position;
+        if seg_length > 0 && cnv_length > seg_length {
+            case_details.cnv.seg_length += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None);
+        }
+    }
+
+    Ok(Some(out_lst))
+}
+
+// Check if the row meets consequence filtering criteria
+fn is_valid_consequence(
+    cont_lst: &[String],
+    variant_classification_index: Option<usize>,
+    consequences: &Option<Vec<String>>,
+) -> bool {
+    if let Some(consequence_filter) = consequences {
+        if !consequence_filter.is_empty() {
+            if let Some(var_class_idx) = variant_classification_index {
+                if var_class_idx < cont_lst.len() {
+                    let variant_classification = &cont_lst[var_class_idx];
+                    if let Some(normalized_consequence) = normalize_consequence(variant_classification) {
+                        return consequence_filter.contains(&normalized_consequence);
+                    }
+                }
+                return false; // Invalid row or unknown consequence
+            }
+        }
+    }
+    true // No filtering or empty filter
+}
+
+// Process Segment_Mean for CNV files
+fn process_segment_mean(
+    element: &str,
+    case_id: &str,
+    data_type: &str,
+    gain_threshold: f32,
+    loss_threshold: f32,
+) -> Result<String, (String, String, String)> {
+    let seg_mean = element.parse::<f32>().map_err(|_| {
+        (
+            case_id.to_string(),
+            data_type.to_string(),
+            "Segment_Mean in cnv file is not float".to_string(),
+        )
+    })?;
+
+    if seg_mean >= gain_threshold {
+        Ok("gain".to_string())
+    } else if seg_mean <= loss_threshold {
+        Ok("loss".to_string())
+    } else {
+        Ok(String::new())
+    }
+}
+
+/// Updated helper function to normalize MAF consequence types to frontend format
+/// Returns None for unknown consequence types (which will be filtered out)
+fn normalize_consequence(maf_consequence: &str) -> Option<String> {
+    match maf_consequence.to_lowercase().as_str() {
+        // Only map the consequence types we actually support
+        "missense_mutation" => Some("missense".to_string()),
+        "nonsense_mutation" | "stop_gained" | "stop_lost" => Some("nonsense".to_string()),
+        "frame_shift_del" | "frame_shift_ins" | "frameshift_variant" => Some("frameshift".to_string()),
+        "silent" | "synonymous_variant" => Some("silent".to_string()),
+        "in_frame_del" => Some("deletion".to_string()),
+        "in_frame_ins" => Some("insertion".to_string()),
+        "splice_site" | "splice_acceptor_variant" | "splice_donor_variant" => Some("splice_site".to_string()),
+        "tandem_duplication" | "duplication" => Some("duplication".to_string()),
+        "inversion" => Some("inversion".to_string()),
+        // Return None for all unknown consequence types - they will be filtered out
+        _ => None,
+    }
+}
 /// Downloads a single file with minimal retry logic for transient failures
 async fn download_single_file(
     case_id: String,
@@ -291,14 +576,19 @@ async fn download_single_file(
         ))
     }
 
-/// Main download function with structured JSON output including errors
-async fn download_data(
+/// Streaming download function
+/// Outputs JSONL format: one JSON object per line
+/// Node.js will read this line-by-line but still wait for completion
+async fn download_data_streaming(
     data4dl: HashMap<String, DataType>,
     host: &str,
     min_total_depth: i32,
     min_alt_allele_count: i32,
+    consequences: &Option<Vec<String>>,
+    gain_threshold: f32,
+    loss_threshold: f32,
+    seg_length: i32,
 ) {
-    // Generate URLs from data4dl, handling optional cnv and maf
     let data_urls: Vec<(String, String, String)> = data4dl
         .into_iter()
         .flat_map(|(case_id, data_types)| {
@@ -315,42 +605,76 @@ async fn download_data(
 
     let total_files = data_urls.len();
 
-    // Use atomic counters that can be safely shared across async closures
+    // Counters for final summary
     let successful_downloads = Arc::new(AtomicUsize::new(0));
     let failed_downloads = Arc::new(AtomicUsize::new(0));
+    let filtered_maf_records = Arc::new(AtomicUsize::new(0));
+    let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
+    let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
 
-    // Create shared vectors to collect successful data and errors
-    let successful_data = Arc::new(Mutex::new(Vec::<Vec<Vec<String>>>::new()));
+    // Only collect errors (successful data is output immediately)
    let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
 
-    // Create download futures with smart retry logic
-    let download_futures = futures::stream::iter(data_urls.into_iter().map(|(case_id, data_type, url)| {
-        async move {
-            // Try each file up to 2 times for transient failures
-            download_single_file(case_id, data_type, url, 2).await
-        }
-    }));
+    let download_futures = futures::stream::iter(
+        data_urls
+            .into_iter()
+            .map(|(case_id, data_type, url)| async move { download_single_file(case_id, data_type, url, 2).await }),
+    );
 
-    // Execute downloads concurrently with high concurrency for speed
+    // Process downloads and output results immediately as JSONL
     download_futures
-        .buffer_unordered(15) // Increased to 15 concurrent downloads for speed
+        .buffer_unordered(20) // Increased concurrency for better performance
         .for_each(|download_result| {
             let successful_downloads = Arc::clone(&successful_downloads);
             let failed_downloads = Arc::clone(&failed_downloads);
-            let successful_data = Arc::clone(&successful_data);
+            let filtered_maf_records = Arc::clone(&filtered_maf_records);
+            let filtered_cnv_records = Arc::clone(&filtered_cnv_records);
+            let filtered_records = Arc::clone(&filtered_records);
             let errors = Arc::clone(&errors);
 
             async move {
                 match download_result {
                     Ok((case_id, data_type, content)) => {
-                        // Successfully downloaded, now try to parse
-                        match parse_content(&content, &case_id, &data_type, min_total_depth, min_alt_allele_count) {
+                        // Try to parse the content
+                        match parse_content(
+                            &content,
+                            &case_id,
+                            &data_type,
+                            min_total_depth,
+                            min_alt_allele_count,
+                            &consequences,
+                            gain_threshold,
+                            loss_threshold,
+                            seg_length,
+                            &filtered_records,
+                            &filtered_maf_records,
+                            &filtered_cnv_records,
+                        )
+                        .await
+                        {
                             Ok(parsed_data) => {
-                                // Store successful data
-                                successful_data.lock().await.push(parsed_data);
+                                // SUCCESS: Output immediately as JSONL
+                                let success_output = SuccessfulFileOutput {
+                                    output_type: "data".to_string(),
+                                    case_id: case_id.clone(),
+                                    data_type: data_type.clone(),
+                                    data: parsed_data,
+                                };
+
+                                // Output this successful result immediately - Node.js will see this in real-time
+                                if let Ok(json) = serde_json::to_string(&success_output) {
+                                    println!("{}", json); // IMMEDIATE output to stdout
+                                    // Force flush to ensure Node.js sees it immediately
+                                    use std::io::Write;
+                                    let _ = std::io::stdout().flush();
+                                    // Optional: Add small delay to separate lines
+                                    sleep(Duration::from_millis(10));
+                                }
+
                                 successful_downloads.fetch_add(1, Ordering::Relaxed);
                             }
                             Err((cid, dtp, error)) => {
+                                // Parsing failed - add to errors
                                 failed_downloads.fetch_add(1, Ordering::Relaxed);
                                 let error = ErrorEntry {
                                     case_id: cid,
@@ -364,9 +688,9 @@ async fn download_data(
                         }
                     }
                     Err((case_id, data_type, error_details, attempts)) => {
+                        // Download failed - add to errors
                         failed_downloads.fetch_add(1, Ordering::Relaxed);
 
-                        // Parse error type from error details
                         let (error_type, clean_details) = if error_details.contains(":") {
                             let parts: Vec<&str> = error_details.splitn(2, ": ").collect();
                             (parts[0].to_string(), parts[1].to_string())
@@ -388,27 +712,29 @@ async fn download_data(
         })
         .await;
 
-    // Create final output structure
+    // Output final summary as the last line
     let success_count = successful_downloads.load(Ordering::Relaxed);
     let failed_count = failed_downloads.load(Ordering::Relaxed);
-
-    let output = GdcOutput {
-        successful_data: successful_data.lock().await.clone(),
-        failed_files: errors.lock().await.clone(),
-        summary: OutputSummary {
-            total_files,
-            successful_files: success_count,
-            failed_files: failed_count,
-        },
+    let filtered_maf_count = filtered_maf_records.load(Ordering::Relaxed);
+    let filtered_cnv_count = filtered_cnv_records.load(Ordering::Relaxed);
+
+    let summary = FinalSummary {
+        output_type: "summary".to_string(),
+        total_files,
+        successful_files: success_count,
+        failed_files: failed_count,
+        errors: errors.lock().await.clone(),
+        filtered_records: filtered_maf_count + filtered_cnv_count,
+        filtered_maf_records: filtered_maf_count,
+        filtered_cnv_records: filtered_cnv_count,
+        filtered_records_by_case: filtered_records.lock().await.clone(),
     };
 
-    // Output the complete structure as JSON
-    match serde_json::to_string(&output) {
-        Ok(json) => println!("{}", json),
-        Err(_) => {
-            // Silent failure - exit without stderr
-            std::process::exit(1);
-        }
+    // Output final summary - Node.js will know processing is complete when it sees this
+    if let Ok(json) = serde_json::to_string(&summary) {
+        println!("{}", json);
+        use std::io::Write;
+        let _ = std::io::stdout().flush();
     }
 }
 
@@ -455,13 +781,33 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let case_files = input_js.case_files;
 
     // Set default maf_options
-    let (min_total_depth, min_alt_allele_count) = match input_js.maf_options {
-        Some(options) => (options.min_total_depth, options.min_alt_allele_count),
-        None => (10, 2), // Default values
+    let (min_total_depth, min_alt_allele_count, consequences) = match input_js.maf_options {
+        Some(options) => (
+            options.min_total_depth,
+            options.min_alt_allele_count,
+            options.consequences.clone(),
+        ),
+        None => (10, 2, None), // Default values
+    };
+
+    // Set default cnv_options
+    let (gain_threshold, loss_threshold, seg_length) = match input_js.cnv_options {
+        Some(options) => (options.gain_threshold, options.loss_threshold, options.seg_length),
+        None => (0.3, -0.4, 0), // Default values
     };
 
     // Download data - this will now handle errors gracefully
-    download_data(case_files, HOST, min_total_depth, min_alt_allele_count).await;
+    download_data_streaming(
+        case_files,
+        HOST,
+        min_total_depth,
+        min_alt_allele_count,
+        &consequences,
+        gain_threshold,
+        loss_threshold,
+        seg_length,
+    )
+    .await;
 
     // Always exit successfully - individual file failures are logged but don't stop the process
     Ok(())
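
Note on the new output protocol: this release replaces the single aggregated JSON object (the removed GdcOutput struct) with line-delimited JSON (JSONL). Each successfully parsed file is printed immediately as a {"type": "data", ...} line, and one final {"type": "summary", ...} line carries the error list and filter counts and signals completion, so the consuming process can render progress incrementally instead of buffering everything. The sketch below is a hypothetical consumer, not part of this package; it assumes only serde_json and reads the JSONL stream from stdin, keying on the "type" field and the struct fields defined in the diff above:

use serde_json::Value;
use std::io::{self, BufRead};

// Minimal sketch of a JSONL consumer for gdcGRIN2's streamed output.
// Each stdin line is a standalone JSON object tagged by its "type" field:
// "data" lines arrive as files finish; a single "summary" line ends the stream.
fn main() -> io::Result<()> {
    for line in io::stdin().lock().lines() {
        let line = line?;
        let obj: Value = match serde_json::from_str(&line) {
            Ok(v) => v,
            Err(_) => continue, // skip any malformed line
        };
        match obj["type"].as_str() {
            Some("data") => {
                // Per-file payload: case_id, data_type, and the parsed rows
                println!(
                    "case {} ({}): {} rows",
                    obj["case_id"],
                    obj["data_type"],
                    obj["data"].as_array().map_or(0, |a| a.len())
                );
            }
            Some("summary") => {
                // Final line: totals, errors, and filter counts
                println!(
                    "done: {}/{} files ok, {} records filtered",
                    obj["successful_files"], obj["total_files"], obj["filtered_records"]
                );
                break;
            }
            _ => {}
        }
    }
    Ok(())
}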