@sjcrh/proteinpaint-rust 2.37.0 → 2.38.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/gdcmaf.rs +43 -60
package/package.json
CHANGED
package/src/gdcmaf.rs
CHANGED
|
@@ -1,3 +1,15 @@
|
|
|
1
|
+
/*
|
|
2
|
+
This script downloads cohort MAF files from GDC and concatenates them into a single file that includes the user-specified columns.
|
|
3
|
+
|
|
4
|
+
Input JSON:
|
|
5
|
+
host: GDC host
|
|
6
|
+
fileIdLst: An array of file UUIDs
|
|
7
|
+
Outputs a gzip-compressed MAF file to stdout.
|
|
8
|
+
|
|
9
|
+
Example of usage:
|
|
10
|
+
echo '{"host": "https://api.gdc.cancer.gov/data/", "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
|
|
11
|
+
*/
|
|
12
|
+
|
|
1
13
|
use flate2::read::GzDecoder;
|
|
2
14
|
use flate2::write::GzEncoder;
|
|
3
15
|
use flate2::Compression;
|
|
@@ -5,16 +17,12 @@ use serde_json::Value;
|
|
|
5
17
|
use std::path::Path;
|
|
6
18
|
use futures::StreamExt;
|
|
7
19
|
use std::io::{self,Read,Write};
|
|
8
|
-
use std::sync::mpsc;
|
|
9
20
|
|
|
10
21
|
|
|
11
22
|
|
|
12
|
-
fn
|
|
13
|
-
let mut
|
|
14
|
-
let mut lst_chrom_pos: Vec<String> = Vec::new();
|
|
23
|
+
fn select_maf_col(d:String) -> Vec<u8> {
|
|
24
|
+
let mut maf_str: String = String::new();
|
|
15
25
|
let mut header_indices: Vec<usize> = Vec::new();
|
|
16
|
-
let mut chrom_index: usize = 9999;
|
|
17
|
-
let mut pos_index: usize = 9999;
|
|
18
26
|
let lines = d.trim_end().split("\n");
|
|
19
27
|
for line in lines {
|
|
20
28
|
if line.starts_with("#") {
|
|
@@ -24,41 +32,20 @@ fn gen_vec(d:String) -> (Vec<String>,Vec<Vec<u8>>) {
|
|
|
24
32
|
for col in MAF_COL {
|
|
25
33
|
let col_index: usize = header.iter().position(|x| x == col).unwrap();
|
|
26
34
|
header_indices.push(col_index);
|
|
27
|
-
if col == "Chromosome" {
|
|
28
|
-
chrom_index = col_index;
|
|
29
|
-
} else if col == "Start_Position" {
|
|
30
|
-
pos_index = col_index;
|
|
31
|
-
}
|
|
32
35
|
}
|
|
33
36
|
} else {
|
|
34
37
|
let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
35
38
|
let mut maf_out_lst: Vec<String> = Vec::new();
|
|
36
|
-
|
|
37
|
-
let mut pos = String::new();
|
|
38
|
-
for (i,x) in header_indices.iter().enumerate() {
|
|
39
|
+
for x in header_indices.iter() {
|
|
39
40
|
maf_out_lst.push(maf_cont_lst[*x].to_string());
|
|
40
|
-
if chrom_index != 9999 && i == chrom_index {
|
|
41
|
-
chrom = maf_cont_lst[*x].to_string();
|
|
42
|
-
} else if pos_index != 9999 && i == pos_index {
|
|
43
|
-
pos = maf_cont_lst[*x].to_string();
|
|
44
|
-
}
|
|
45
41
|
};
|
|
46
42
|
maf_out_lst.push("\n".to_string());
|
|
47
|
-
|
|
48
|
-
lst_chrom_pos.push(chrom+"\t"+&pos);
|
|
43
|
+
maf_str.push_str(maf_out_lst.join("\t").as_str());
|
|
49
44
|
}
|
|
50
45
|
};
|
|
51
|
-
(
|
|
46
|
+
maf_str.as_bytes().to_vec()
|
|
52
47
|
}
|
|
53
48
|
|
|
54
|
-
fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize>{
|
|
55
|
-
let mut indices = (0..lst.len()).collect::<Vec<usize>>();
|
|
56
|
-
indices.sort_by(|a,b| {
|
|
57
|
-
lst[*a].split('\t').collect::<Vec<&str>>()[0].cmp(lst[*b].split('\t').collect::<Vec<&str>>()[0])
|
|
58
|
-
.then(lst[*a].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap().cmp(&lst[*b].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap()))
|
|
59
|
-
});
|
|
60
|
-
indices
|
|
61
|
-
}
|
|
62
49
|
|
|
63
50
|
// GDC MAF columns (96)
|
|
64
51
|
const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
|
|
@@ -85,58 +72,54 @@ const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Bui
|
|
|
85
72
|
async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
86
73
|
// Accept the piped input JSON from Node.js and assign it to a variable
|
|
87
74
|
// host: GDC host
|
|
88
|
-
// save output into json string
|
|
89
75
|
// url: urls to download single maf files
|
|
90
76
|
let mut buffer = String::new();
|
|
91
77
|
io::stdin().read_line(&mut buffer)?;
|
|
92
78
|
let file_id_lst_js = serde_json::from_str::<Value>(&buffer).expect("Error reading input and serializing to JSON");
|
|
93
|
-
let host =
|
|
79
|
+
let host = file_id_lst_js.get("host").expect("Host was not provided").as_str().expect("Host is not a string");
|
|
94
80
|
let mut url: Vec<String> = Vec::new();
|
|
95
|
-
|
|
81
|
+
let file_id_lst = file_id_lst_js.get("fileIdLst").expect("File ID list is missed!").as_array().expect("File ID list is not an array");
|
|
82
|
+
for v in file_id_lst {
|
|
96
83
|
url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
|
|
97
84
|
};
|
|
98
85
|
|
|
99
86
|
// Download MAF files in parallel and merge them into a single MAF file
|
|
100
|
-
let
|
|
101
|
-
let fetches = futures::stream::iter(
|
|
87
|
+
let download_futures = futures::stream::iter(
|
|
102
88
|
url.into_iter().map(|url|{
|
|
103
|
-
let txt = tx.clone();
|
|
104
89
|
async move {
|
|
105
|
-
|
|
90
|
+
let result = reqwest::get(&url).await;
|
|
91
|
+
if let Ok(resp) = result {
|
|
106
92
|
let content = resp.bytes().await.unwrap();
|
|
107
93
|
let mut decoder = GzDecoder::new(&content[..]);
|
|
108
94
|
let mut decompressed_content = Vec::new();
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
let
|
|
112
|
-
|
|
95
|
+
let read_content = decoder.read_to_end(&mut decompressed_content);
|
|
96
|
+
if let Ok(_) = read_content {
|
|
97
|
+
let text = String::from_utf8_lossy(&decompressed_content).to_string();
|
|
98
|
+
text
|
|
99
|
+
} else {
|
|
100
|
+
let error_msg = "Failed to read content downloaded from: ".to_string() + &url;
|
|
101
|
+
error_msg
|
|
113
102
|
}
|
|
103
|
+
} else {
|
|
104
|
+
let error_msg = "Failed to download: ".to_string() + &url;
|
|
105
|
+
error_msg
|
|
114
106
|
}
|
|
115
107
|
}
|
|
116
108
|
})
|
|
117
|
-
)
|
|
118
|
-
fetches.await;
|
|
119
|
-
drop(tx);
|
|
120
|
-
|
|
121
|
-
// write downloaded maf (GZIP format) into a Vector
|
|
122
|
-
// lst_chrom_pos: a vector including chromsome&position info for sorting maf
|
|
123
|
-
// idx_sorted: indices after sorting basedon chromsome&position
|
|
124
|
-
let mut maf_bit: Vec<Vec<u8>> = Vec::new();
|
|
125
|
-
let mut lst_chrom_pos: Vec<String> = Vec::new();
|
|
126
|
-
for (chr_pos_lst,maf_bit_lst) in rx {
|
|
127
|
-
maf_bit.extend_from_slice(&maf_bit_lst);
|
|
128
|
-
lst_chrom_pos.extend_from_slice(&chr_pos_lst);
|
|
129
|
-
};
|
|
130
|
-
let idx_sorted = get_sorted_indices(&lst_chrom_pos);
|
|
109
|
+
);
|
|
131
110
|
|
|
132
111
|
// output
|
|
133
|
-
// maf_out_bit: A vector of GZIPPED maf
|
|
134
|
-
// compress_header: output header
|
|
135
112
|
let mut encoder = GzEncoder::new(io::stdout(), Compression::default());
|
|
136
113
|
let _ = encoder.write_all(&MAF_COL.join("\t").as_bytes().to_vec()).expect("Failed to write header");
|
|
137
114
|
let _ = encoder.write_all(b"\n").expect("Failed to write newline");
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
115
|
+
download_futures.buffer_unordered(20).for_each(|item| {
|
|
116
|
+
if item.starts_with("Failed") {
|
|
117
|
+
eprintln!("{}",item);
|
|
118
|
+
} else {
|
|
119
|
+
let maf_bit = select_maf_col(item);
|
|
120
|
+
let _ = encoder.write_all(&maf_bit).expect("Failed to write file");
|
|
121
|
+
};
|
|
122
|
+
async {}
|
|
123
|
+
}).await;
|
|
141
124
|
Ok(())
|
|
142
125
|
}
|