@sjcrh/proteinpaint-rust 2.129.1-80343740e.0 → 2.129.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -13,7 +13,7 @@ kodama = "0.3"
  rayon = "1.7.0"
  bgzip = "0.3.1"
  petgraph = "0.6.3"
- rusqlite="0.35"
+ rusqlite="0.31.0"
  ndarray = "0.16.1"
  hdf5 = { package = "hdf5-metno", version = "0.9.0" }
  nalgebra = {version = "0.32.2", features = ["serde-serialize"]}
@@ -35,8 +35,6 @@ flate2 = "1"
  futures = "0.3"
  num_cpus = "1.16.0"
  memchr = "2"
- r2d2_sqlite = "0.28.0"
- r2d2 = "0.8.10"

  [profile.release]
  lto = "fat"
package/package.json CHANGED
@@ -1,5 +1,5 @@
  {
- "version": "2.129.1-80343740e.0",
+ "version": "2.129.2",
  "name": "@sjcrh/proteinpaint-rust",
  "type": "module",
  "description": "Rust-based utilities for proteinpaint",
package/src/cerno.rs CHANGED
@@ -2,20 +2,16 @@
  #![allow(non_snake_case)]
  use json::JsonValue;
  use r_mathlib::chi_squared_cdf;
- use r2d2;
- use r2d2_sqlite::SqliteConnectionManager;
  use rusqlite::{Connection, Result};
  use serde::{Deserialize, Serialize};
  use serde_json;
  use std::cmp::Ordering;
  use std::collections::HashSet;
  use std::io;
- use std::sync::{Arc, Mutex}; // Multithreading library
- use std::thread;

  #[allow(non_camel_case_types)]
  #[allow(non_snake_case)]
- #[derive(Debug, Clone)]
+ #[derive(Debug)]
  struct GO_pathway {
  GO_id: String,
  }
@@ -25,7 +21,7 @@ struct GO_pathway {
  #[derive(Debug, Clone, PartialEq, PartialOrd)]
  struct gene_order {
  gene_name: String,
- fold_change: f32,
+ fold_change: f64,
  rank: Option<usize>,
  }

@@ -35,11 +31,11 @@ struct gene_order {
  //#[allow(dead_code)]
  struct pathway_p_value {
  pathway_name: String,
- p_value_original: f32,
- p_value_adjusted: Option<f32>,
+ p_value_original: f64,
+ p_value_adjusted: Option<f64>,
  gene_set_hits: String,
- auc: f32,
- es: f32,
+ auc: f64,
+ es: f64,
  gene_set_size: usize,
  }

@@ -48,16 +44,13 @@ struct pathway_p_value {
  #[derive(Debug, Serialize, Deserialize)]
  //#[allow(dead_code)]
  struct output_struct {
- pval: f32,
- fdr: f32,
+ pval: f64,
+ fdr: f64,
  leading_edge: String,
- auc: f32,
- es: f32,
+ auc: f64,
+ es: f64,
  geneset_size: usize,
  }
- const PAR_CUTOFF: usize = 1000; // Cutoff for triggering multithreading processing of data
- #[allow(non_upper_case_globals)]
- const max_threads: usize = 3; // Max number of threads in case the parallel processing of reads is invoked

  fn main() -> Result<()> {
  let mut input = String::new();
@@ -88,17 +81,17 @@ fn main() -> Result<()> {
  //println!("sample_genes:{:?}", sample_genes);

  let fold_change_input: &JsonValue = &json_string["fold_change"];
- let mut fold_change_f32 = Vec::<f32>::new();
+ let mut fold_change_f64 = Vec::<f64>::new();
  for iter in 0..fold_change_input.len() {
- let item = fold_change_input[iter].as_f32().unwrap();
- fold_change_f32.push(item);
+ let item = fold_change_input[iter].as_f64().unwrap();
+ fold_change_f64.push(item);
  }

  if sample_genes.len() == 0 {
  panic!("No sample genes provided");
  }

- if sample_genes.len() != fold_change_f32.len() {
+ if sample_genes.len() != fold_change_f64.len() {
  panic!("Length of genes array and fold change array are not equal");
  }

@@ -106,7 +99,7 @@ fn main() -> Result<()> {
  for i in 0..sample_genes.len() {
  let item: gene_order = gene_order {
  gene_name: sample_genes[i].to_string(),
- fold_change: fold_change_f32[i],
+ fold_change: fold_change_f64[i],
  rank: None, // Will be calculated later
  };
  genes_vector.push(item)
@@ -161,7 +154,7 @@ fn main() -> Result<()> {
  //println!("sample_genes:{:?}", sample_genes);
  //println!("background_genes:{:?}", background_genes);

- let msigdbconn = Connection::open(&msigdb)?;
+ let msigdbconn = Connection::open(msigdb)?;
  let stmt_result = msigdbconn
  .prepare(&("select id from terms where parent_id='".to_owned() + &genesetgroup + "'"));
  match stmt_result {
@@ -169,148 +162,58 @@ fn main() -> Result<()> {
  #[allow(non_snake_case)]
  let GO_iter = stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
  #[allow(non_snake_case)]
- let mut genesets = Vec::<String>::new();
  for GO_term in GO_iter {
  match GO_term {
  Ok(n) => {
- genesets.push(n.GO_id);
- }
- Err(_) => {
- println!("GO term not found!")
- }
- }
- }
-
- if genesets.len() < PAR_CUTOFF {
- for gs in genesets {
- let sql_statement =
- "select genes from term2genes where id='".to_owned() + &gs + &"'";
- //println!("sql_statement:{}", sql_statement);
- let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
- //println!("gene_stmt:{:?}", gene_stmt);
-
- let mut rows = gene_stmt.query([])?;
- let mut names = HashSet::<String>::new();
- while let Some(row) = rows.next()? {
- let a: String = row.get(0)?;
- let input_gene_json = json::parse(&a);
- match input_gene_json {
- Ok(json_genes) => {
- for json_iter in 0..json_genes.len() {
- names.insert(json_genes[json_iter]["symbol"].to_string());
- }
- }
- Err(_) => {
- panic!("Symbol, ensg, enstCanonical structure is missing!")
- }
- }
- }
- let gene_set_size = names.len();
- let (p_value, auc, es, matches, gene_set_hits) = cerno(&sample_coding_genes, names);
-
- if matches >= 1.0
- && p_value.is_nan() == false
- && es.is_nan() == false
- && es != f32::INFINITY
- && auc != f32::INFINITY
- && auc.is_nan() == false
- {
- pathway_p_values.push(pathway_p_value {
- pathway_name: gs,
- p_value_original: p_value,
- p_value_adjusted: None,
- auc: auc,
- es: es,
- gene_set_hits: gene_set_hits,
- gene_set_size: gene_set_size,
- })
- }
- }
- } else {
- // Multithreaded implementation
- let manager = SqliteConnectionManager::file(&msigdb); // This enables sqlite query from multiple threads simultaneously
- let pool = r2d2::Pool::new(manager).unwrap(); // This enables sqlite query from multiple threads simultaneously
- let genesets = Arc::new(genesets);
- let pool_arc = Arc::new(pool);
- let sample_coding_genes = Arc::new(sample_coding_genes);
- let pathway_p_values_temp =
- Arc::new(Mutex::new(Vec::<pathway_p_value>::with_capacity(genesets.len())));
- let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
- for thread_num in 0..max_threads {
- let genesets = Arc::clone(&genesets);
- let pool_arc = Arc::clone(&pool_arc);
- let sample_coding_genes = Arc::clone(&sample_coding_genes);
- let pathway_p_values_temp = Arc::clone(&pathway_p_values_temp);
- let handle = thread::spawn(move || {
- let mut pathway_p_values_thread: Vec<pathway_p_value> =
- Vec::with_capacity(10000);
- for iter in 0..genesets.len() {
- let remainder: usize = iter % max_threads;
- if remainder == thread_num {
- let sql_statement = "select genes from term2genes where id='"
- .to_owned()
- + &genesets[iter]
- + &"'";
- //println!("sql_statement:{}", sql_statement);
- let conn = pool_arc.get().unwrap();
- let mut gene_stmt = conn.prepare(&sql_statement).unwrap();
- //println!("gene_stmt:{:?}", gene_stmt);
-
- let mut rows = gene_stmt.query([]).unwrap();
- let mut names = HashSet::<String>::new();
- while let Some(row) = rows.next().unwrap() {
- let a: String = row.get(0).unwrap();
- let input_gene_json = json::parse(&a);
- match input_gene_json {
- Ok(json_genes) => {
- for json_iter in 0..json_genes.len() {
- names.insert(
- json_genes[json_iter]["symbol"].to_string(),
- );
- }
- }
- Err(_) => {
- panic!("Symbol, ensg, enstCanonical structure is missing!")
- }
+ //println!("GO term {:?}", n);
+ let sql_statement =
+ "select genes from term2genes where id='".to_owned() + &n.GO_id + &"'";
+ //println!("sql_statement:{}", sql_statement);
+ let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
+ //println!("gene_stmt:{:?}", gene_stmt);
+
+ let mut rows = gene_stmt.query([])?;
+ let mut names = HashSet::<String>::new();
+ while let Some(row) = rows.next()? {
+ let a: String = row.get(0)?;
+ let input_gene_json = json::parse(&a);
+ match input_gene_json {
+ Ok(json_genes) => {
+ for json_iter in 0..json_genes.len() {
+ names.insert(json_genes[json_iter]["symbol"].to_string());
  }
  }
- let gene_set_size = names.len();
- let (p_value, auc, es, matches, gene_set_hits) =
- cerno(&sample_coding_genes, names);
-
- if matches >= 1.0
- && p_value.is_nan() == false
- && es.is_nan() == false
- && es != f32::INFINITY
- && auc != f32::INFINITY
- && auc.is_nan() == false
- {
- pathway_p_values_thread.push(pathway_p_value {
- pathway_name: genesets[iter].clone(),
- p_value_original: p_value,
- p_value_adjusted: None,
- auc: auc,
- es: es,
- gene_set_hits: gene_set_hits,
- gene_set_size: gene_set_size,
- })
+ Err(_) => {
+ panic!("Symbol, ensg, enstCanonical structure is missing!")
  }
  }
  }
- pathway_p_values_temp
- .lock()
- .unwrap()
- .append(&mut pathway_p_values_thread);
- drop(pathway_p_values_temp);
- });
- handles.push(handle); // The handle (which contains the thread) is stored in the handles vector
- }
- for handle in handles {
- // Wait for all threads to finish before proceeding further
- handle.join().unwrap();
+ let gene_set_size = names.len();
+ let (p_value, auc, es, matches, gene_set_hits) =
+ cerno(&sample_coding_genes, names);
+
+ if matches >= 1.0
+ && p_value.is_nan() == false
+ && es.is_nan() == false
+ && es != f64::INFINITY
+ && auc != f64::INFINITY
+ && auc.is_nan() == false
+ {
+ pathway_p_values.push(pathway_p_value {
+ pathway_name: n.GO_id,
+ p_value_original: p_value,
+ p_value_adjusted: None,
+ auc: auc,
+ es: es,
+ gene_set_hits: gene_set_hits,
+ gene_set_size: gene_set_size,
+ })
+ }
+ }
+ Err(_) => {
+ println!("GO term not found!")
+ }
  }
- // Combining data from all different threads
- pathway_p_values.append(&mut *pathway_p_values_temp.lock().unwrap());
  }
  }
  Err(_) => panic!("sqlite database file not found"),
@@ -326,15 +229,15 @@ fn main() -> Result<()> {
  Ok(())
  }

- fn cerno(sample_coding_genes: &Vec<gene_order>, genes_in_pathway: HashSet<String>) -> (f32, f32, f32, f32, String) {
+ fn cerno(sample_coding_genes: &Vec<gene_order>, genes_in_pathway: HashSet<String>) -> (f64, f64, f64, f64, String) {
  // Filter the sample_coding_genes vector to only include those whose gene_names are in the HashSet genes_in_pathway
  let gene_intersections: Vec<&gene_order> = sample_coding_genes
  .iter()
  .filter(|sample_coding_genes| genes_in_pathway.contains(&sample_coding_genes.gene_name)) // Check if name is in the HashSet genes_in_pathway
  .collect(); // Collect the results into a new vector

- let N1 = gene_intersections.len() as f32;
- let N = sample_coding_genes.len() as f32;
+ let N1 = gene_intersections.len() as f64;
+ let N = sample_coding_genes.len() as f64;
  let mut gene_set_hits: String = "".to_string();
  for gene in &gene_intersections {
  gene_set_hits += &(gene.gene_name.to_string() + &",");
@@ -349,21 +252,21 @@ fn cerno(sample_coding_genes: &Vec<gene_order>, genes_in_pathway: HashSet<String
  .map(|x| x.rank.unwrap())
  .collect::<Vec<usize>>();

- let cerno: f32 = ranks // -2 * sum( log(ranks/N) )
+ let cerno: f64 = ranks // -2 * sum( log(ranks/N) )
  .iter()
- .map(|x| ((*x as f32) / N).ln())
- .collect::<Vec<f32>>()
+ .map(|x| ((*x as f64) / N).ln())
+ .collect::<Vec<f64>>()
  .iter()
- .sum::<f32>()
+ .sum::<f64>()
  * (-2.0);

- let cES: f32 = cerno / (2.0 * (N1 as f32)); // cES <- cerno/(2*N1)
+ let cES: f64 = cerno / (2.0 * (N1 as f64)); // cES <- cerno/(2*N1)
  let N2 = N - N1; // N2 = N - N1
- let R1 = ranks.iter().sum::<usize>() as f32; // R1 <- sum(ranks)
+ let R1 = ranks.iter().sum::<usize>() as f64; // R1 <- sum(ranks)
  let U = N1 * N2 + N1 * (N1 + 1.0) / 2.0 - R1; // U <- N1*N2+N1*(N1+1)/2-R1
  let AUC = U / (N1 * N2); // AUC <- U/(N1*N2)
- let p_value = chi_squared_cdf(cerno as f64, (2.0 * N1) as f64, false, false); // pchisq(ret$cerno, 2*N1, lower.tail=FALSE)
- (p_value as f32, AUC, cES, N1, gene_set_hits)
+ let p_value = chi_squared_cdf(cerno, 2.0 * N1, false, false); // pchisq(ret$cerno, 2*N1, lower.tail=FALSE)
+ (p_value, AUC, cES, N1, gene_set_hits)
  }

  fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>) -> String {
@@ -375,13 +278,13 @@ fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>) -> String {
  });

  let mut adjusted_p_values: Vec<pathway_p_value> = Vec::with_capacity(original_p_values.len());
- let mut old_p_value: f32 = 0.0;
- let mut rank: f32 = original_p_values.len() as f32;
+ let mut old_p_value: f64 = 0.0;
+ let mut rank: f64 = original_p_values.len() as f64;
  for j in 0..original_p_values.len() {
  let i = original_p_values.len() - j - 1;

  //println!("p_val:{}", p_val);
- let mut adjusted_p_val: f32 = original_p_values[i].p_value_original * (original_p_values.len() as f32 / rank); // adjusted p-value = original_p_value * (N/rank)
+ let mut adjusted_p_val: f64 = original_p_values[i].p_value_original * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
  if adjusted_p_val > 1.0 {
  // p_value should NEVER be greater than 1
  adjusted_p_val = 1.0;
package/src/gdcGRIN2.rs CHANGED
@@ -1,19 +1,58 @@
+ /*
+ This script downloads cohort maf files from GDC and gracefully handles timeout and other possible errors related to GDC api processing for use by the client file summary div
+
+ Key improvements:
+ 1. Graceful error handling - individual file failures don't stop the entire process
+ 2. Better timeout handling with retries
+ 3. More detailed error reporting
+ 4. Continues processing even when some files fail
+
+ Input JSON:
+ caseFiles
+ mafOptions: For SNVindel filtering
+ Output mutations as JSON array.
+
+ Example of usage:
+ echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}},"mafOptions": {"minTotalDepth": 10,"minAltAlleleCount": 2}}' | ./target/release/gdcGRIN2
+ */
+
  use flate2::read::GzDecoder;
  use futures::StreamExt;
  use memchr::memchr;
  use serde::Deserialize;
  use serde_json;
  use std::collections::HashMap;
- use std::io::{self, Read, Write};
+ use std::io::{self, Read};
+ use std::sync::Arc;
+ use std::sync::atomic::{AtomicUsize, Ordering};
  use std::time::Duration;
  use tokio::io::{AsyncReadExt, BufReader};
+ use tokio::sync::Mutex;
  use tokio::time::timeout;

- // Struct to hold error information
- #[derive(serde::Serialize)]
+ // Struct to hold error information for JSON output
+ #[derive(serde::Serialize, Clone)]
  struct ErrorEntry {
- case: String,
- error: String,
+ case_id: String,
+ data_type: String,
+ error_type: String,
+ error_details: String,
+ attempts_made: u32,
+ }
+
+ // Struct for the final output that includes both successful data and errors
+ #[derive(serde::Serialize)]
+ struct GdcOutput {
+ successful_data: Vec<Vec<Vec<String>>>, // Array of successful file data arrays
+ failed_files: Vec<ErrorEntry>,
+ summary: OutputSummary,
+ }
+
+ #[derive(serde::Serialize)]
+ struct OutputSummary {
+ total_files: usize,
+ successful_files: usize,
+ failed_files: usize,
  }

  // Define the structure for datadd
@@ -23,32 +62,52 @@ struct DataType {
  maf: Option<String>,
  }

+ // Define the structure for mafOptions
+ #[derive(Deserialize, Debug)]
+ struct MafOptions {
+ #[serde(rename = "minTotalDepth")]
+ min_total_depth: i32,
+ #[serde(rename = "minAltAlleleCount")]
+ min_alt_allele_count: i32,
+ }
+
+ // Define the top-level input structure
+ #[derive(Deserialize, Debug)]
+ struct InputData {
+ #[serde(rename = "caseFiles")]
+ case_files: HashMap<String, DataType>,
+ #[serde(rename = "mafOptions")]
+ maf_options: Option<MafOptions>,
+ }
+
  // Function to parse TSV content
- // CNV:
- // Select cnv columns ["Chromosome","Start","End","Segment_Mean"]
- // Segment_Mean >= 0.2 => gain; Segment_Mean <= -0.2 => loss
- // MAF:
- // Select MAF columns ["Chromosome","Start_Position","End_Position"]
- fn parse_content(content: &str, case_id: &str, data_type: &str) -> Result<String, (String, String, String)> {
+ fn parse_content(
+ content: &str,
+ case_id: &str,
+ data_type: &str,
+ min_total_depth: i32,
+ min_alt_allele_count: i32,
+ ) -> Result<Vec<Vec<String>>, (String, String, String)> {
  let lines = content.lines();
- //let mut parsed_data = Vec::new();
- let mut parsed_data: String = String::new();
+ let mut parsed_data = Vec::new();
  let mut columns_indices: Vec<usize> = Vec::new();
  let mut header_mk: &str = "";
- let mut columns = Vec::new(); // columns selected from GDC file
+ let mut columns = Vec::new();
+
  if data_type == "cnv" {
  header_mk = "GDC_Aliquot_ID";
  columns = vec!["Chromosome", "Start", "End", "Segment_Mean"]
  } else if data_type == "maf" {
  header_mk = "Hugo_Symbol";
- columns = vec!["Chromosome", "Start_Position", "End_Position"]
+ columns = vec!["Chromosome", "Start_Position", "End_Position", "t_depth", "t_alt_count"]
  };
- let mut header: Vec<String> = Vec::new(); // GDC file header
+
+ let mut header: Vec<String> = Vec::new();
+
  for line in lines {
  if line.starts_with("#") {
  continue;
  } else if line.contains(&header_mk) {
- // header line
  header = line.split("\t").map(|s| s.to_string()).collect();
  for col in &columns {
  match header.iter().position(|x| x == col) {
@@ -65,12 +124,12 @@ fn parse_content(content: &str, case_id: &str, data_type: &str) -> Result<String
  let mut keep_ck: bool = true;
  let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
  let mut out_lst: Vec<String> = Vec::new();
- // add sample ID first
  out_lst.push(case_id.to_string());
+
  for x in columns_indices.iter() {
  let mut element = cont_lst[*x].to_string();
+
  if data_type == "cnv" && &header[*x] == "Segment_Mean" {
- // convert to f32 (segment_mean)
  let seg_mean = match element.parse::<f32>() {
  Ok(val) => val,
  Err(_e) => {
@@ -78,9 +137,9 @@ fn parse_content(content: &str, case_id: &str, data_type: &str) -> Result<String
  return Err((case_id.to_string(), data_type.to_string(), error_msg));
  }
  };
- if seg_mean >= 0.2 {
+ if seg_mean >= 0.3 {
  element = "gain".to_string();
- } else if seg_mean <= -0.2 {
+ } else if seg_mean <= -0.4 {
  element = "loss".to_string();
  } else {
  keep_ck = false;
@@ -88,16 +147,37 @@ fn parse_content(content: &str, case_id: &str, data_type: &str) -> Result<String
  }
  out_lst.push(element);
  }
- // add lsn.type to snv
+
  if data_type == "maf" {
- out_lst.push("mutation".to_string());
+ let alle_depth = match out_lst[4].parse::<i32>() {
+ Ok(value) => value,
+ Err(_) => {
+ let error_msg = "Failed to convert t_depth to i32.".to_string();
+ return Err((case_id.to_string(), data_type.to_string(), error_msg));
+ }
+ };
+ let alt_count = match out_lst[5].parse::<i32>() {
+ Ok(value) => value,
+ Err(_) => {
+ let error_msg = "Failed to convert t_alt_count to i32.".to_string();
+ return Err((case_id.to_string(), data_type.to_string(), error_msg));
+ }
+ };
+
+ if alle_depth >= min_total_depth && alt_count >= min_alt_allele_count {
+ out_lst = out_lst[0..4].to_vec();
+ out_lst.push("mutation".to_string());
+ } else {
+ keep_ck = false;
+ }
  }
+
  if keep_ck {
- parsed_data.push_str(out_lst.join("\t").as_str());
- parsed_data.push_str("\n");
+ parsed_data.push(out_lst);
  }
  }
  }
+
  if columns_indices.is_empty() {
  return Err((
  case_id.to_string(),
@@ -105,14 +185,121 @@ fn parse_content(content: &str, case_id: &str, data_type: &str) -> Result<String
  "No matching columns found. Problematic file!".to_string(),
  ));
  };
+
  Ok(parsed_data)
  }

- // Function to download data
- //async fn download_data(data4dl: HashMap<String,DataType>, host: &str) -> Vec<Result<(String, String), (String, String)>> {
- async fn download_data(data4dl: HashMap<String, DataType>, host: &str) -> () {
+ /// Downloads a single file with minimal retry logic for transient failures
+ async fn download_single_file(
+ case_id: String,
+ data_type: String,
+ url: String,
+ max_attempts: u32,
+ ) -> Result<(String, String, String), (String, String, String, u32)> {
+ let mut last_error = String::new();
+ let mut error_type = String::new();
+
+ for attempt in 0..max_attempts {
+ // Build HTTP client with aggressive timeouts for real-time processing
+ let client = match reqwest::Client::builder()
+ .timeout(Duration::from_secs(10)) // 10 second timeout per request
+ .connect_timeout(Duration::from_secs(3)) // 3 second connect timeout
+ .build()
+ {
+ Ok(client) => client,
+ Err(e) => {
+ last_error = format!("Client build error: {}", e);
+ error_type = "client_build_error".to_string();
+ continue;
+ }
+ };
+
+ // Attempt download with tight timeout - fail fast if server is slow
+ match timeout(Duration::from_secs(12), client.get(&url).send()).await {
+ Ok(Ok(resp)) if resp.status().is_success() => {
+ match resp.bytes().await {
+ Ok(content) => {
+ // Handle both compressed and uncompressed content
+ let text = if memchr(0x00, &content).is_some() {
+ // Likely compressed (gzipped) content
+ let mut decoder = GzDecoder::new(&content[..]);
+ let mut decompressed_content = Vec::new();
+ match decoder.read_to_end(&mut decompressed_content) {
+ Ok(_) => String::from_utf8_lossy(&decompressed_content).to_string(),
+ Err(e) => {
+ last_error = format!("Decompression failed: {}", e);
+ error_type = "decompression_error".to_string();
+ continue; // Retry on decompression failure
+ }
+ }
+ } else {
+ // Plain text content
+ String::from_utf8_lossy(&content).to_string()
+ };
+
+ // Success! Return immediately
+ return Ok((case_id, data_type, text));
+ }
+ Err(e) => {
+ last_error = format!("Failed to read response bytes: {}", e);
+ error_type = "connection_error".to_string();
+ // This could be "connection closed before message completed"
+ // Worth retrying for transient network issues
+ }
+ }
+ }
+ Ok(Ok(resp)) => {
+ last_error = format!(
+ "HTTP error {}: {}",
+ resp.status(),
+ resp.status().canonical_reason().unwrap_or("Unknown")
+ );
+ error_type = if resp.status().is_client_error() {
+ "client_error".to_string()
+ } else {
+ "server_error".to_string()
+ };
+ // Don't retry 4xx errors (client errors), but retry 5xx (server errors)
+ if resp.status().is_client_error() {
+ break; // No point retrying client errors
+ }
+ }
+ Ok(Err(e)) => {
+ last_error = format!("Request error: {}", e);
+ error_type = "network_error".to_string();
+ // Network errors are worth retrying
+ }
+ Err(_) => {
+ last_error = "Request timeout (12s) - server too slow".to_string();
+ error_type = "timeout_error".to_string();
+ // Timeouts might be transient, worth a quick retry
+ }
+ }
+
+ // If this isn't the last attempt, wait briefly before retrying
+ if attempt < max_attempts - 1 {
+ // Silent retry - no stderr noise
+ tokio::time::sleep(Duration::from_secs(1)).await; // 1 second between retries
+ }
+ }
+
+ Err((
+ case_id,
+ data_type,
+ format!("{}: {}", error_type, last_error),
+ max_attempts,
+ ))
+ }
+
+ /// Main download function with structured JSON output including errors
+ async fn download_data(
+ data4dl: HashMap<String, DataType>,
+ host: &str,
+ min_total_depth: i32,
+ min_alt_allele_count: i32,
+ ) {
  // Generate URLs from data4dl, handling optional cnv and maf
- let data_urls = data4dl
+ let data_urls: Vec<(String, String, String)> = data4dl
  .into_iter()
  .flat_map(|(case_id, data_types)| {
  let mut urls = Vec::new();
@@ -124,172 +311,158 @@ async fn download_data(data4dl: HashMap<String, DataType>, host: &str) -> () {
  }
  urls
  })
- .collect::<Vec<_>>();
+ .collect();
+
+ let total_files = data_urls.len();
+
+ // Use atomic counters that can be safely shared across async closures
+ let successful_downloads = Arc::new(AtomicUsize::new(0));
+ let failed_downloads = Arc::new(AtomicUsize::new(0));
+
+ // Create shared vectors to collect successful data and errors
+ let successful_data = Arc::new(Mutex::new(Vec::<Vec<Vec<String>>>::new()));
+ let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
+
+ // Create download futures with smart retry logic
  let download_futures = futures::stream::iter(data_urls.into_iter().map(|(case_id, data_type, url)| {
  async move {
- //let case_dt = format!("{}/{}",case_id,data_type).to_string();
- // Build HTTP client with timeouts
- let client = reqwest::Client::builder()
- .timeout(Duration::from_secs(60)) // 60-second timeout per request
- .connect_timeout(Duration::from_secs(30))
- .build()
- .map_err(|_e| "Client build error".to_string());
- // Handle client creation result
- match client {
- Ok(client) => {
- match client.get(&url).send().await {
- Ok(resp) if resp.status().is_success() => {
- match resp.bytes().await {
- Ok(content) => {
- // if data_type == "cnv" {
- if !memchr(0x00, &content).is_some() {
- // CNV files are plain text
- let text = String::from_utf8_lossy(&content).to_string();
- Ok((case_id.clone(), data_type.clone(), text))
- } else {
- let mut decoder = GzDecoder::new(&content[..]);
- let mut decompressed_content = Vec::new();
- match decoder.read_to_end(&mut decompressed_content) {
- Ok(_) => {
- let text = String::from_utf8_lossy(&decompressed_content).to_string();
- Ok((case_id.clone(), data_type.clone(), text))
- }
- Err(e) => {
- let error_msg = format!(
- "Failed to decompress {} file for {}: {}",
- data_type, case_id, e
- );
- Err((case_id.clone(), data_type.clone(), error_msg))
- }
- }
- }
- }
- Err(e) => {
- let error_msg =
- format!("Failed to read bytes for {} file for {}: {}", data_type, case_id, e);
- Err((case_id.clone(), data_type.clone(), error_msg))
- }
- }
- }
- Ok(resp) => {
- let error_msg =
- format!("HTTP error for {} file for {}: {}", data_type, case_id, resp.status());
- Err((case_id.clone(), data_type.clone(), error_msg))
- }
- Err(e) => {
- let error_msg =
- format!("Server request failed for {} file for {}: {}", data_type, case_id, e);
- Err((case_id.clone(), data_type.clone(), error_msg))
- }
- }
- }
- Err(_e) => {
- let error_msg = "Client build error".to_string();
- Err((case_id, data_type, error_msg))
- }
- }
+ // Try each file up to 2 times for transient failures
+ download_single_file(case_id, data_type, url, 2).await
  }
  }));

- // Execute downloads concurrently and collect results
+ // Execute downloads concurrently with high concurrency for speed
  download_futures
- .buffer_unordered(10)
- .for_each(|result| async {
- match result {
- Ok((case_id, data_type, content)) => match parse_content(&content, &case_id, &data_type) {
- Ok(parsed_data) => match serde_json::to_string(&parsed_data) {
- Ok(json) => println!("{}", json),
- Err(e) => {
- let error = ErrorEntry {
- case: format!("{}: {}", case_id, data_type),
- error: format!("Failed to convert data to JSON {}", e),
- };
- let error_js = serde_json::to_string(&error).unwrap();
- eprintln!("{}", error_js);
+ .buffer_unordered(15) // Increased to 15 concurrent downloads for speed
+ .for_each(|download_result| {
+ let successful_downloads = Arc::clone(&successful_downloads);
+ let failed_downloads = Arc::clone(&failed_downloads);
+ let successful_data = Arc::clone(&successful_data);
+ let errors = Arc::clone(&errors);
+
+ async move {
+ match download_result {
+ Ok((case_id, data_type, content)) => {
+ // Successfully downloaded, now try to parse
+ match parse_content(&content, &case_id, &data_type, min_total_depth, min_alt_allele_count) {
+ Ok(parsed_data) => {
+ // Store successful data
+ successful_data.lock().await.push(parsed_data);
+ successful_downloads.fetch_add(1, Ordering::Relaxed);
+ }
+ Err((cid, dtp, error)) => {
+ failed_downloads.fetch_add(1, Ordering::Relaxed);
+ let error = ErrorEntry {
+ case_id: cid,
+ data_type: dtp,
+ error_type: "parsing_error".to_string(),
+ error_details: error,
+ attempts_made: 1,
+ };
+ errors.lock().await.push(error);
+ }
  }
- },
- Err((cid, dtp, error)) => {
+ }
+ Err((case_id, data_type, error_details, attempts)) => {
+ failed_downloads.fetch_add(1, Ordering::Relaxed);
+
+ // Parse error type from error details
+ let (error_type, clean_details) = if error_details.contains(":") {
+ let parts: Vec<&str> = error_details.splitn(2, ": ").collect();
+ (parts[0].to_string(), parts[1].to_string())
+ } else {
+ ("unknown_error".to_string(), error_details)
+ };
+
  let error = ErrorEntry {
- case: format!("{}: {}", cid, dtp),
- error,
+ case_id,
+ data_type,
+ error_type,
+ error_details: clean_details,
+ attempts_made: attempts,
  };
- let error_js = serde_json::to_string(&error).unwrap();
- eprintln!("{}", error_js);
+ errors.lock().await.push(error);
  }
- },
- Err((case_id, data_type, error)) => {
- let error = ErrorEntry {
- case: format!("{}: {}", case_id, data_type),
- error,
- };
- let error_js = serde_json::to_string(&error).unwrap();
- eprintln!("{}", error_js);
  }
  }
  })
  .await;
+
+ // Create final output structure
+ let success_count = successful_downloads.load(Ordering::Relaxed);
+ let failed_count = failed_downloads.load(Ordering::Relaxed);
+
+ let output = GdcOutput {
+ successful_data: successful_data.lock().await.clone(),
+ failed_files: errors.lock().await.clone(),
+ summary: OutputSummary {
+ total_files,
+ successful_files: success_count,
+ failed_files: failed_count,
+ },
+ };
+
+ // Output the complete structure as JSON
+ match serde_json::to_string(&output) {
+ Ok(json) => println!("{}", json),
+ Err(_) => {
+ // Silent failure - exit without stderr
+ std::process::exit(1);
+ }
+ }
  }

  #[tokio::main]
  async fn main() -> Result<(), Box<dyn std::error::Error>> {
  const HOST: &str = "https://api.gdc.cancer.gov/data/";

- // Accepting the piped input json from nodejs
- let timeout_duration = Duration::from_secs(5); // Set a 5-second timeout
+ // Read input with timeout
+ let timeout_duration = Duration::from_secs(10); // Increased timeout for input

- // Wrap the read operation in a timeout
  let result = timeout(timeout_duration, async {
- let mut buffer = String::new(); // Initialize an empty string to store input
- let mut reader = BufReader::new(tokio::io::stdin()); // Create a buffered reader for stdin
- reader.read_to_string(&mut buffer).await?; // Read a line asynchronously
- Ok::<String, io::Error>(buffer) // Return the input as a Result
+ let mut buffer = String::new();
+ let mut reader = BufReader::new(tokio::io::stdin());
+ reader.read_to_string(&mut buffer).await?;
+ Ok::<String, io::Error>(buffer)
  })
  .await;

- // Handle the result of the input timeout operation
- let input_js: HashMap<String, DataType> = match result {
+ // Handle input parsing (silently)
+ let input_js: InputData = match result {
  Ok(Ok(buffer)) => match serde_json::from_str(&buffer) {
  Ok(js) => js,
- Err(e) => {
- let stdin_error = ErrorEntry {
- case: String::new(),
- error: format!("Input JSON parsing error: {}", e),
- };
- writeln!(io::stderr(), "{}", serde_json::to_string(&stdin_error).unwrap()).unwrap();
- return Err(Box::new(std::io::Error::new(
- std::io::ErrorKind::InvalidInput,
- "Input JSON parsing Error!",
- )) as Box<dyn std::error::Error>);
+ Err(_e) => {
+ // Silent failure - exit without stderr
+ std::process::exit(1);
  }
  },
  Ok(Err(_e)) => {
- let stdin_error = ErrorEntry {
- case: String::new(),
- error: "Error reading from stdin.".to_string(),
- };
- let stdin_error_js = serde_json::to_string(&stdin_error).unwrap();
- writeln!(io::stderr(), "{}", stdin_error_js).expect("Failed to output stderr!");
- return Err(Box::new(std::io::Error::new(
- std::io::ErrorKind::InvalidInput,
- "Error reading from stdin!",
- )) as Box<dyn std::error::Error>);
+ // Silent failure - exit without stderr
+ std::process::exit(1);
  }
  Err(_) => {
- let stdin_error = ErrorEntry {
- case: String::new(),
- error: "Timeout while reading from stdin.".to_string(),
- };
- let stdin_error_js = serde_json::to_string(&stdin_error).unwrap();
- writeln!(io::stderr(), "{}", stdin_error_js).expect("Failed to output stderr!");
- return Err(Box::new(std::io::Error::new(
- std::io::ErrorKind::InvalidInput,
- "Timeout while reading from stdin.",
- )) as Box<dyn std::error::Error>);
+ // Silent failure - exit without stderr
+ std::process::exit(1);
  }
  };

- // Download data
- download_data(input_js, HOST).await;
+ // Validate input (silently)
+ if input_js.case_files.is_empty() {
+ // Silent failure - exit without stderr
+ std::process::exit(1);
+ }
+
+ let case_files = input_js.case_files;
+
+ // Set default maf_options
+ let (min_total_depth, min_alt_allele_count) = match input_js.maf_options {
+ Some(options) => (options.min_total_depth, options.min_alt_allele_count),
+ None => (10, 2), // Default values
+ };
+
+ // Download data - this will now handle errors gracefully
+ download_data(case_files, HOST, min_total_depth, min_alt_allele_count).await;

+ // Always exit successfully - individual file failures are logged but don't stop the process
  Ok(())
  }