@sjcrh/proteinpaint-rust 2.37.0 → 2.38.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/package.json +1 -1
  2. package/src/gdcmaf.rs +43 -60
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.37.0",
2
+ "version": "2.38.1",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
package/src/gdcmaf.rs CHANGED
@@ -1,3 +1,15 @@
1
+ /*
2
+ This script downloads cohort MAF files from GDC and concatenates them into a single file that includes user-specified columns.
3
+
4
+ Input JSON:
5
+ host: GDC host
6
+ fileIdLst: An array of file UUIDs
7
+ Output: a gzip-compressed MAF file written to stdout.
8
+
9
+ Example of usage:
10
+ echo '{"host": "https://api.gdc.cancer.gov/data/", "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
11
+ */
12
+
1
13
  use flate2::read::GzDecoder;
2
14
  use flate2::write::GzEncoder;
3
15
  use flate2::Compression;
@@ -5,16 +17,12 @@ use serde_json::Value;
5
17
  use std::path::Path;
6
18
  use futures::StreamExt;
7
19
  use std::io::{self,Read,Write};
8
- use std::sync::mpsc;
9
20
 
10
21
 
11
22
 
12
- fn gen_vec(d:String) -> (Vec<String>,Vec<Vec<u8>>) {
13
- let mut maf_bit: Vec<Vec<u8>> = Vec::new();
14
- let mut lst_chrom_pos: Vec<String> = Vec::new();
23
+ fn select_maf_col(d:String) -> Vec<u8> {
24
+ let mut maf_str: String = String::new();
15
25
  let mut header_indices: Vec<usize> = Vec::new();
16
- let mut chrom_index: usize = 9999;
17
- let mut pos_index: usize = 9999;
18
26
  let lines = d.trim_end().split("\n");
19
27
  for line in lines {
20
28
  if line.starts_with("#") {
@@ -24,41 +32,20 @@ fn gen_vec(d:String) -> (Vec<String>,Vec<Vec<u8>>) {
24
32
  for col in MAF_COL {
25
33
  let col_index: usize = header.iter().position(|x| x == col).unwrap();
26
34
  header_indices.push(col_index);
27
- if col == "Chromosome" {
28
- chrom_index = col_index;
29
- } else if col == "Start_Position" {
30
- pos_index = col_index;
31
- }
32
35
  }
33
36
  } else {
34
37
  let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
35
38
  let mut maf_out_lst: Vec<String> = Vec::new();
36
- let mut chrom = String::new();
37
- let mut pos = String::new();
38
- for (i,x) in header_indices.iter().enumerate() {
39
+ for x in header_indices.iter() {
39
40
  maf_out_lst.push(maf_cont_lst[*x].to_string());
40
- if chrom_index != 9999 && i == chrom_index {
41
- chrom = maf_cont_lst[*x].to_string();
42
- } else if pos_index != 9999 && i == pos_index {
43
- pos = maf_cont_lst[*x].to_string();
44
- }
45
41
  };
46
42
  maf_out_lst.push("\n".to_string());
47
- maf_bit.push(maf_out_lst.join("\t").as_bytes().to_vec());
48
- lst_chrom_pos.push(chrom+"\t"+&pos);
43
+ maf_str.push_str(maf_out_lst.join("\t").as_str());
49
44
  }
50
45
  };
51
- (lst_chrom_pos,maf_bit)
46
+ maf_str.as_bytes().to_vec()
52
47
  }
53
48
 
54
- fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize>{
55
- let mut indices = (0..lst.len()).collect::<Vec<usize>>();
56
- indices.sort_by(|a,b| {
57
- lst[*a].split('\t').collect::<Vec<&str>>()[0].cmp(lst[*b].split('\t').collect::<Vec<&str>>()[0])
58
- .then(lst[*a].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap().cmp(&lst[*b].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap()))
59
- });
60
- indices
61
- }
62
49
 
63
50
  // GDC MAF columns (96)
64
51
  const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
@@ -85,58 +72,54 @@ const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Bui
85
72
  async fn main() -> Result<(),Box<dyn std::error::Error>> {
86
73
  // Accept the piped input JSON from Node.js and assign it to a variable
87
74
  // host: GDC host
88
- // save output into json string
89
75
  // url: urls to download single maf files
90
76
  let mut buffer = String::new();
91
77
  io::stdin().read_line(&mut buffer)?;
92
78
  let file_id_lst_js = serde_json::from_str::<Value>(&buffer).expect("Error reading input and serializing to JSON");
93
- let host = &file_id_lst_js["host"].as_str().unwrap();
79
+ let host = file_id_lst_js.get("host").expect("Host was not provided").as_str().expect("Host is not a string");
94
80
  let mut url: Vec<String> = Vec::new();
95
- for v in file_id_lst_js["fileIdLst"].as_array().unwrap() {
81
+ let file_id_lst = file_id_lst_js.get("fileIdLst").expect("File ID list is missed!").as_array().expect("File ID list is not an array");
82
+ for v in file_id_lst {
96
83
  url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
97
84
  };
98
85
 
99
86
  // Download MAF files in parallel and merge them into a single MAF file
100
- let (tx, rx) = mpsc::channel();
101
- let fetches = futures::stream::iter(
87
+ let download_futures = futures::stream::iter(
102
88
  url.into_iter().map(|url|{
103
- let txt = tx.clone();
104
89
  async move {
105
- if let Ok(resp) = reqwest::get(&url).await {
90
+ let result = reqwest::get(&url).await;
91
+ if let Ok(resp) = result {
106
92
  let content = resp.bytes().await.unwrap();
107
93
  let mut decoder = GzDecoder::new(&content[..]);
108
94
  let mut decompressed_content = Vec::new();
109
- if let Ok(_) = decoder.read_to_end(&mut decompressed_content) {
110
- let text = String::from_utf8_lossy(&decompressed_content);
111
- let (lst_chrom_pos,maf_bit) = gen_vec(text.to_string());
112
- txt.send((lst_chrom_pos,maf_bit)).unwrap();
95
+ let read_content = decoder.read_to_end(&mut decompressed_content);
96
+ if let Ok(_) = read_content {
97
+ let text = String::from_utf8_lossy(&decompressed_content).to_string();
98
+ text
99
+ } else {
100
+ let error_msg = "Failed to read content downloaded from: ".to_string() + &url;
101
+ error_msg
113
102
  }
103
+ } else {
104
+ let error_msg = "Failed to download: ".to_string() + &url;
105
+ error_msg
114
106
  }
115
107
  }
116
108
  })
117
- ).buffer_unordered(20).collect::<Vec<()>>();
118
- fetches.await;
119
- drop(tx);
120
-
121
- // write downloaded maf (GZIP format) into a Vector
122
- // lst_chrom_pos: a vector including chromsome&position info for sorting maf
123
- // idx_sorted: indices after sorting basedon chromsome&position
124
- let mut maf_bit: Vec<Vec<u8>> = Vec::new();
125
- let mut lst_chrom_pos: Vec<String> = Vec::new();
126
- for (chr_pos_lst,maf_bit_lst) in rx {
127
- maf_bit.extend_from_slice(&maf_bit_lst);
128
- lst_chrom_pos.extend_from_slice(&chr_pos_lst);
129
- };
130
- let idx_sorted = get_sorted_indices(&lst_chrom_pos);
109
+ );
131
110
 
132
111
  // output
133
- // maf_out_bit: A vector of GZIPPED maf
134
- // compress_header: output header
135
112
  let mut encoder = GzEncoder::new(io::stdout(), Compression::default());
136
113
  let _ = encoder.write_all(&MAF_COL.join("\t").as_bytes().to_vec()).expect("Failed to write header");
137
114
  let _ = encoder.write_all(b"\n").expect("Failed to write newline");
138
- for i in idx_sorted.iter() {
139
- let _ = encoder.write_all(&maf_bit[*i]).expect("Failed to write file");
140
- };
115
+ download_futures.buffer_unordered(20).for_each(|item| {
116
+ if item.starts_with("Failed") {
117
+ eprintln!("{}",item);
118
+ } else {
119
+ let maf_bit = select_maf_col(item);
120
+ let _ = encoder.write_all(&maf_bit).expect("Failed to write file");
121
+ };
122
+ async {}
123
+ }).await;
141
124
  Ok(())
142
125
  }