@sjcrh/proteinpaint-rust 2.38.0 → 2.38.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/gdcmaf.rs +29 -58
package/package.json
CHANGED
package/src/gdcmaf.rs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/*
|
|
2
|
-
This script download cohort maf files from GDC,
|
|
2
|
+
This script downloads cohort MAF files from GDC and concatenates them into a single file that includes the user-specified columns.
|
|
3
3
|
|
|
4
4
|
Input JSON:
|
|
5
5
|
host: GDC host
|
|
@@ -17,16 +17,12 @@ use serde_json::Value;
|
|
|
17
17
|
use std::path::Path;
|
|
18
18
|
use futures::StreamExt;
|
|
19
19
|
use std::io::{self,Read,Write};
|
|
20
|
-
use std::sync::mpsc;
|
|
21
20
|
|
|
22
21
|
|
|
23
22
|
|
|
24
|
-
fn
|
|
25
|
-
let mut
|
|
26
|
-
let mut lst_chrom_pos: Vec<String> = Vec::new();
|
|
23
|
+
fn select_maf_col(d:String) -> Vec<u8> {
|
|
24
|
+
let mut maf_str: String = String::new();
|
|
27
25
|
let mut header_indices: Vec<usize> = Vec::new();
|
|
28
|
-
let mut chrom_index: usize = 9999;
|
|
29
|
-
let mut pos_index: usize = 9999;
|
|
30
26
|
let lines = d.trim_end().split("\n");
|
|
31
27
|
for line in lines {
|
|
32
28
|
if line.starts_with("#") {
|
|
@@ -36,41 +32,20 @@ fn gen_vec(d:String) -> (Vec<String>,Vec<Vec<u8>>) {
|
|
|
36
32
|
for col in MAF_COL {
|
|
37
33
|
let col_index: usize = header.iter().position(|x| x == col).unwrap();
|
|
38
34
|
header_indices.push(col_index);
|
|
39
|
-
if col == "Chromosome" {
|
|
40
|
-
chrom_index = col_index;
|
|
41
|
-
} else if col == "Start_Position" {
|
|
42
|
-
pos_index = col_index;
|
|
43
|
-
}
|
|
44
35
|
}
|
|
45
36
|
} else {
|
|
46
37
|
let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
47
38
|
let mut maf_out_lst: Vec<String> = Vec::new();
|
|
48
|
-
|
|
49
|
-
let mut pos = String::new();
|
|
50
|
-
for (i,x) in header_indices.iter().enumerate() {
|
|
39
|
+
for x in header_indices.iter() {
|
|
51
40
|
maf_out_lst.push(maf_cont_lst[*x].to_string());
|
|
52
|
-
if chrom_index != 9999 && i == chrom_index {
|
|
53
|
-
chrom = maf_cont_lst[*x].to_string();
|
|
54
|
-
} else if pos_index != 9999 && i == pos_index {
|
|
55
|
-
pos = maf_cont_lst[*x].to_string();
|
|
56
|
-
}
|
|
57
41
|
};
|
|
58
42
|
maf_out_lst.push("\n".to_string());
|
|
59
|
-
|
|
60
|
-
lst_chrom_pos.push(chrom+"\t"+&pos);
|
|
43
|
+
maf_str.push_str(maf_out_lst.join("\t").as_str());
|
|
61
44
|
}
|
|
62
45
|
};
|
|
63
|
-
(
|
|
46
|
+
maf_str.as_bytes().to_vec()
|
|
64
47
|
}
|
|
65
48
|
|
|
66
|
-
fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize>{
|
|
67
|
-
let mut indices = (0..lst.len()).collect::<Vec<usize>>();
|
|
68
|
-
indices.sort_by(|a,b| {
|
|
69
|
-
lst[*a].split('\t').collect::<Vec<&str>>()[0].cmp(lst[*b].split('\t').collect::<Vec<&str>>()[0])
|
|
70
|
-
.then(lst[*a].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap().cmp(&lst[*b].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap()))
|
|
71
|
-
});
|
|
72
|
-
indices
|
|
73
|
-
}
|
|
74
49
|
|
|
75
50
|
// GDC MAF columns (96)
|
|
76
51
|
const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
|
|
@@ -109,46 +84,42 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
|
109
84
|
};
|
|
110
85
|
|
|
111
86
|
// download MAF files in parallel and merge them into a single MAF file
|
|
112
|
-
let
|
|
113
|
-
let fetches = futures::stream::iter(
|
|
87
|
+
let download_futures = futures::stream::iter(
|
|
114
88
|
url.into_iter().map(|url|{
|
|
115
|
-
let txt = tx.clone();
|
|
116
89
|
async move {
|
|
117
|
-
|
|
90
|
+
let result = reqwest::get(&url).await;
|
|
91
|
+
if let Ok(resp) = result {
|
|
118
92
|
let content = resp.bytes().await.unwrap();
|
|
119
93
|
let mut decoder = GzDecoder::new(&content[..]);
|
|
120
94
|
let mut decompressed_content = Vec::new();
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
let
|
|
124
|
-
|
|
95
|
+
let read_content = decoder.read_to_end(&mut decompressed_content);
|
|
96
|
+
if let Ok(_) = read_content {
|
|
97
|
+
let text = String::from_utf8_lossy(&decompressed_content).to_string();
|
|
98
|
+
text
|
|
99
|
+
} else {
|
|
100
|
+
let error_msg = "Failed to read content downloaded from: ".to_string() + &url;
|
|
101
|
+
error_msg
|
|
125
102
|
}
|
|
103
|
+
} else {
|
|
104
|
+
let error_msg = "Failed to download: ".to_string() + &url;
|
|
105
|
+
error_msg
|
|
126
106
|
}
|
|
127
107
|
}
|
|
128
108
|
})
|
|
129
|
-
)
|
|
130
|
-
fetches.await;
|
|
131
|
-
drop(tx);
|
|
132
|
-
|
|
133
|
-
// write downloaded maf (GZIP format) into a Vector
|
|
134
|
-
// lst_chrom_pos: a vector including chromsome&position info for sorting maf
|
|
135
|
-
// idx_sorted: indices after sorting basedon chromsome&position
|
|
136
|
-
let mut maf_bit: Vec<Vec<u8>> = Vec::new();
|
|
137
|
-
let mut lst_chrom_pos: Vec<String> = Vec::new();
|
|
138
|
-
for (chr_pos_lst,maf_bit_lst) in rx {
|
|
139
|
-
maf_bit.extend_from_slice(&maf_bit_lst);
|
|
140
|
-
lst_chrom_pos.extend_from_slice(&chr_pos_lst);
|
|
141
|
-
};
|
|
142
|
-
let idx_sorted = get_sorted_indices(&lst_chrom_pos);
|
|
109
|
+
);
|
|
143
110
|
|
|
144
111
|
// output
|
|
145
|
-
// maf_out_bit: A vector of GZIPPED maf
|
|
146
|
-
// compress_header: output header
|
|
147
112
|
let mut encoder = GzEncoder::new(io::stdout(), Compression::default());
|
|
148
113
|
let _ = encoder.write_all(&MAF_COL.join("\t").as_bytes().to_vec()).expect("Failed to write header");
|
|
149
114
|
let _ = encoder.write_all(b"\n").expect("Failed to write newline");
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
115
|
+
download_futures.buffer_unordered(20).for_each(|item| {
|
|
116
|
+
if item.starts_with("Failed") {
|
|
117
|
+
eprintln!("{}",item);
|
|
118
|
+
} else {
|
|
119
|
+
let maf_bit = select_maf_col(item);
|
|
120
|
+
let _ = encoder.write_all(&maf_bit).expect("Failed to write file");
|
|
121
|
+
};
|
|
122
|
+
async {}
|
|
123
|
+
}).await;
|
|
153
124
|
Ok(())
|
|
154
125
|
}
|