@sjcrh/proteinpaint-rust 2.30.2 → 2.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/gdcmaf.rs +68 -68
package/package.json
CHANGED
package/src/gdcmaf.rs
CHANGED
|
@@ -1,35 +1,56 @@
|
|
|
1
1
|
use flate2::read::GzDecoder;
|
|
2
2
|
use flate2::write::GzEncoder;
|
|
3
|
+
use flate2::Compression;
|
|
3
4
|
use serde_json::Value;
|
|
4
|
-
use std::fs::File;
|
|
5
5
|
use std::path::Path;
|
|
6
6
|
use futures::StreamExt;
|
|
7
7
|
use std::io;
|
|
8
8
|
use std::io::{Read,Write};
|
|
9
9
|
use std::sync::mpsc;
|
|
10
|
-
use std::collections::HashMap;
|
|
11
10
|
|
|
12
11
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
let mut
|
|
12
|
+
|
|
13
|
+
fn gen_vec(d:String) -> (Vec<String>,Vec<Vec<u8>>) {
|
|
14
|
+
let mut maf_bit: Vec<Vec<u8>> = Vec::new();
|
|
15
|
+
let mut lst_chrom_pos: Vec<String> = Vec::new();
|
|
16
|
+
let mut header_indices: Vec<usize> = Vec::new();
|
|
17
|
+
let mut chrom_index: usize = 9999;
|
|
18
|
+
let mut pos_index: usize = 9999;
|
|
16
19
|
let lines = d.trim_end().split("\n");
|
|
17
20
|
for line in lines {
|
|
18
21
|
if line.starts_with("#") {
|
|
19
22
|
continue
|
|
20
23
|
} else if line.contains("Hugo_Symbol") {
|
|
21
|
-
header = line.split("\t").map(|s| s.to_string()).collect();
|
|
22
|
-
for
|
|
23
|
-
|
|
24
|
+
let header: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
25
|
+
for col in MAF_COL {
|
|
26
|
+
let col_index: usize = header.iter().position(|x| x == col).unwrap();
|
|
27
|
+
header_indices.push(col_index);
|
|
28
|
+
if col == "Chromosome" {
|
|
29
|
+
chrom_index = col_index;
|
|
30
|
+
} else if col == "Start_Position" {
|
|
31
|
+
pos_index = col_index;
|
|
32
|
+
}
|
|
24
33
|
}
|
|
25
34
|
} else {
|
|
26
35
|
let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
36
|
+
let mut maf_out_lst: Vec<String> = Vec::new();
|
|
37
|
+
let mut chrom = String::new();
|
|
38
|
+
let mut pos = String::new();
|
|
39
|
+
for (i,x) in header_indices.iter().enumerate() {
|
|
40
|
+
maf_out_lst.push(maf_cont_lst[*x].to_string());
|
|
41
|
+
if chrom_index != 9999 && i == chrom_index {
|
|
42
|
+
chrom = maf_cont_lst[*x].to_string();
|
|
43
|
+
} else if pos_index != 9999 && i == pos_index {
|
|
44
|
+
pos = maf_cont_lst[*x].to_string();
|
|
45
|
+
}
|
|
46
|
+
};
|
|
47
|
+
maf_out_lst.push("\n".to_string());
|
|
48
|
+
let maf_compress_data = gen_gzip_vec(maf_out_lst);
|
|
49
|
+
maf_bit.push(maf_compress_data);
|
|
50
|
+
lst_chrom_pos.push(chrom+"\t"+&pos);
|
|
30
51
|
}
|
|
31
|
-
}
|
|
32
|
-
|
|
52
|
+
};
|
|
53
|
+
(lst_chrom_pos,maf_bit)
|
|
33
54
|
}
|
|
34
55
|
|
|
35
56
|
fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize>{
|
|
@@ -41,6 +62,14 @@ fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize>{
|
|
|
41
62
|
indices
|
|
42
63
|
}
|
|
43
64
|
|
|
65
|
+
// convert vector (maf row) to GZIP encoded format
|
|
66
|
+
fn gen_gzip_vec(s:Vec<String>) -> Vec<u8> {
|
|
67
|
+
let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
|
|
68
|
+
let _ = encoder.write_all(s.join("\t").as_bytes());
|
|
69
|
+
let compress_data = encoder.finish().unwrap();
|
|
70
|
+
compress_data
|
|
71
|
+
}
|
|
72
|
+
|
|
44
73
|
// GDC MAF columns (96)
|
|
45
74
|
const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
|
|
46
75
|
"Start_Position", "End_Position", "Strand", "Variant_Classification",
|
|
@@ -66,13 +95,12 @@ const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Bui
|
|
|
66
95
|
async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
67
96
|
// Accept the piped input JSON from nodejs and assign it to the variable
|
|
68
97
|
// host: GDC host
|
|
69
|
-
//
|
|
98
|
+
// save output into json string
|
|
70
99
|
// url: urls to download single maf files
|
|
71
100
|
let mut buffer = String::new();
|
|
72
101
|
io::stdin().read_line(&mut buffer)?;
|
|
73
102
|
let file_id_lst_js = serde_json::from_str::<Value>(&buffer).expect("Error reading input and serializing to JSON");
|
|
74
103
|
let host = &file_id_lst_js["host"].as_str().unwrap();
|
|
75
|
-
let out_file = &file_id_lst_js["outFile"].as_str().unwrap();
|
|
76
104
|
let mut url: Vec<String> = Vec::new();
|
|
77
105
|
for v in file_id_lst_js["fileIdLst"].as_array().unwrap() {
|
|
78
106
|
url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
|
|
@@ -89,9 +117,11 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
|
89
117
|
let content = resp.bytes().await.unwrap();
|
|
90
118
|
let mut decoder = GzDecoder::new(&content[..]);
|
|
91
119
|
let mut decompressed_content = Vec::new();
|
|
92
|
-
decoder.read_to_end(&mut decompressed_content)
|
|
93
|
-
|
|
94
|
-
|
|
120
|
+
if let Ok(_) = decoder.read_to_end(&mut decompressed_content) {
|
|
121
|
+
let text = String::from_utf8_lossy(&decompressed_content);
|
|
122
|
+
let (lst_chrom_pos,maf_bit) = gen_vec(text.to_string());
|
|
123
|
+
txt.send((lst_chrom_pos,maf_bit)).unwrap();
|
|
124
|
+
}
|
|
95
125
|
}
|
|
96
126
|
Err(_) => println!("ERROR downloading {}", url),
|
|
97
127
|
}
|
|
@@ -101,61 +131,31 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
|
101
131
|
fetches.await;
|
|
102
132
|
drop(tx);
|
|
103
133
|
|
|
104
|
-
// write downloaded maf into
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
// store downloaed mafs into one HashMap data sturcture based on the common column names
|
|
111
|
-
let mut maf = HashMap::new();
|
|
112
|
-
for maf_data in received_values {
|
|
113
|
-
if maf.is_empty() {
|
|
114
|
-
maf = gen_map(maf_data);
|
|
115
|
-
// remove columns if column name is found from MAF_COL
|
|
116
|
-
let mut keys_to_remove_in_maf: Vec<String> = Vec::new();
|
|
117
|
-
for key in maf.keys() {
|
|
118
|
-
if !(MAF_COL.contains(&key.as_str())) {
|
|
119
|
-
keys_to_remove_in_maf.push(key.to_string());
|
|
120
|
-
}
|
|
121
|
-
};
|
|
122
|
-
for key in keys_to_remove_in_maf {
|
|
123
|
-
maf.remove(&key);
|
|
124
|
-
}
|
|
125
|
-
} else {
|
|
126
|
-
let maf1 = gen_map(maf_data);
|
|
127
|
-
let keys_in_maf1: Vec<String> = maf1.keys().cloned().collect();
|
|
128
|
-
for key in keys_in_maf1 {
|
|
129
|
-
if maf.contains_key(&key) {
|
|
130
|
-
let key_value = maf1[&key].clone();
|
|
131
|
-
maf.get_mut(&key).map(|val| val.extend(key_value));
|
|
132
|
-
}
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
};
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
// generate a Vec with "chrom\tpos" for sorting
|
|
139
|
-
// generated indices after sorting
|
|
134
|
+
// write downloaded maf (GZIP format) into a Vector
|
|
135
|
+
// lst_chrom_pos: a vector holding chromosome & position info used for sorting the maf
|
|
136
|
+
// idx_sorted: indices after sorting based on chromosome & position
|
|
137
|
+
let mut maf_bit: Vec<Vec<u8>> = Vec::new();
|
|
140
138
|
let mut lst_chrom_pos: Vec<String> = Vec::new();
|
|
141
|
-
for (
|
|
142
|
-
|
|
139
|
+
for (chr_pos_lst,maf_bit_lst) in rx {
|
|
140
|
+
maf_bit.extend_from_slice(&maf_bit_lst);
|
|
141
|
+
lst_chrom_pos.extend_from_slice(&chr_pos_lst);
|
|
143
142
|
};
|
|
144
143
|
let idx_sorted = get_sorted_indices(&lst_chrom_pos);
|
|
145
144
|
|
|
146
|
-
//
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
145
|
+
// output
|
|
146
|
+
// maf_out_bit: A vector of GZIPPED maf
|
|
147
|
+
// compress_header: output header
|
|
148
|
+
let mut maf_out_bit: Vec<u8> = Vec::new();
|
|
149
|
+
let compress_header = gen_gzip_vec(MAF_COL.iter().map(|s| s.to_string()).collect());
|
|
150
|
+
maf_out_bit.extend(compress_header);
|
|
151
|
+
let compress_header_line_break = gen_gzip_vec(["\n".to_string()].to_vec());
|
|
152
|
+
maf_out_bit.extend(compress_header_line_break);
|
|
151
153
|
for i in idx_sorted.iter() {
|
|
152
|
-
|
|
153
|
-
for k in MAF_COL {
|
|
154
|
-
val_lst.push(maf[k][*i].to_owned());
|
|
155
|
-
};
|
|
156
|
-
let val_out = val_lst.join("\t")+"\n";
|
|
157
|
-
encoder.write_all(val_out.as_bytes())?;
|
|
154
|
+
maf_out_bit.extend(&maf_bit[*i]);
|
|
158
155
|
};
|
|
159
|
-
|
|
156
|
+
|
|
157
|
+
// standard output
|
|
158
|
+
println!("{:?}",maf_out_bit);
|
|
159
|
+
std::io::stdout().flush().expect("Failed to flush stdout");
|
|
160
160
|
Ok(())
|
|
161
161
|
}
|