@sjcrh/proteinpaint-rust 2.30.2 → 2.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/gdcmaf.rs +68 -68
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.30.2",
2
+ "version": "2.31.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
package/src/gdcmaf.rs CHANGED
@@ -1,35 +1,56 @@
1
1
  use flate2::read::GzDecoder;
2
2
  use flate2::write::GzEncoder;
3
+ use flate2::Compression;
3
4
  use serde_json::Value;
4
- use std::fs::File;
5
5
  use std::path::Path;
6
6
  use futures::StreamExt;
7
7
  use std::io;
8
8
  use std::io::{Read,Write};
9
9
  use std::sync::mpsc;
10
- use std::collections::HashMap;
11
10
 
12
11
 
13
- fn gen_map(d:String) -> HashMap<String,Vec<String>> {
14
- let mut map: HashMap<String, Vec<String>> = HashMap::new();
15
- let mut header: Vec<String> = Vec::new();
12
+
13
+ fn gen_vec(d:String) -> (Vec<String>,Vec<Vec<u8>>) {
14
+ let mut maf_bit: Vec<Vec<u8>> = Vec::new();
15
+ let mut lst_chrom_pos: Vec<String> = Vec::new();
16
+ let mut header_indices: Vec<usize> = Vec::new();
17
+ let mut chrom_index: usize = 9999;
18
+ let mut pos_index: usize = 9999;
16
19
  let lines = d.trim_end().split("\n");
17
20
  for line in lines {
18
21
  if line.starts_with("#") {
19
22
  continue
20
23
  } else if line.contains("Hugo_Symbol") {
21
- header = line.split("\t").map(|s| s.to_string()).collect();
22
- for k in &header {
23
- map.insert(k.to_string(),Vec::new());
24
+ let header: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
25
+ for col in MAF_COL {
26
+ let col_index: usize = header.iter().position(|x| x == col).unwrap();
27
+ header_indices.push(col_index);
28
+ if col == "Chromosome" {
29
+ chrom_index = col_index;
30
+ } else if col == "Start_Position" {
31
+ pos_index = col_index;
32
+ }
24
33
  }
25
34
  } else {
26
35
  let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
27
- for (i,x) in maf_cont_lst.iter().enumerate() {
28
- map.get_mut(&header[i]).map(|val| val.push(x.to_string()));
29
- }
36
+ let mut maf_out_lst: Vec<String> = Vec::new();
37
+ let mut chrom = String::new();
38
+ let mut pos = String::new();
39
+ for (i,x) in header_indices.iter().enumerate() {
40
+ maf_out_lst.push(maf_cont_lst[*x].to_string());
41
+ if chrom_index != 9999 && i == chrom_index {
42
+ chrom = maf_cont_lst[*x].to_string();
43
+ } else if pos_index != 9999 && i == pos_index {
44
+ pos = maf_cont_lst[*x].to_string();
45
+ }
46
+ };
47
+ maf_out_lst.push("\n".to_string());
48
+ let maf_compress_data = gen_gzip_vec(maf_out_lst);
49
+ maf_bit.push(maf_compress_data);
50
+ lst_chrom_pos.push(chrom+"\t"+&pos);
30
51
  }
31
- }
32
- map
52
+ };
53
+ (lst_chrom_pos,maf_bit)
33
54
  }
34
55
 
35
56
  fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize>{
@@ -41,6 +62,14 @@ fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize>{
41
62
  indices
42
63
  }
43
64
 
65
+ // convert vector (maf row) to GZIP encoded format
66
+ fn gen_gzip_vec(s:Vec<String>) -> Vec<u8> {
67
+ let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
68
+ let _ = encoder.write_all(s.join("\t").as_bytes());
69
+ let compress_data = encoder.finish().unwrap();
70
+ compress_data
71
+ }
72
+
44
73
  // GDC MAF columns (96)
45
74
  const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
46
75
  "Start_Position", "End_Position", "Strand", "Variant_Classification",
@@ -66,13 +95,12 @@ const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Bui
66
95
  async fn main() -> Result<(),Box<dyn std::error::Error>> {
67
96
  // Accept the piped input JSON from Node.js and assign it to the variable
68
97
  // host: GDC host
69
- // out_file: save maf to out_file under cachedir
98
+ // save output into json string
70
99
  // url: urls to download single maf files
71
100
  let mut buffer = String::new();
72
101
  io::stdin().read_line(&mut buffer)?;
73
102
  let file_id_lst_js = serde_json::from_str::<Value>(&buffer).expect("Error reading input and serializing to JSON");
74
103
  let host = &file_id_lst_js["host"].as_str().unwrap();
75
- let out_file = &file_id_lst_js["outFile"].as_str().unwrap();
76
104
  let mut url: Vec<String> = Vec::new();
77
105
  for v in file_id_lst_js["fileIdLst"].as_array().unwrap() {
78
106
  url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
@@ -89,9 +117,11 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
89
117
  let content = resp.bytes().await.unwrap();
90
118
  let mut decoder = GzDecoder::new(&content[..]);
91
119
  let mut decompressed_content = Vec::new();
92
- decoder.read_to_end(&mut decompressed_content).unwrap();
93
- let text = String::from_utf8_lossy(&decompressed_content);
94
- txt.send(text.to_string()).unwrap();
120
+ if let Ok(_) = decoder.read_to_end(&mut decompressed_content) {
121
+ let text = String::from_utf8_lossy(&decompressed_content);
122
+ let (lst_chrom_pos,maf_bit) = gen_vec(text.to_string());
123
+ txt.send((lst_chrom_pos,maf_bit)).unwrap();
124
+ }
95
125
  }
96
126
  Err(_) => println!("ERROR downloading {}", url),
97
127
  }
@@ -101,61 +131,31 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
101
131
  fetches.await;
102
132
  drop(tx);
103
133
 
104
- // write downloaded maf into variable received_values
105
- let mut received_values: Vec<String> = Vec::new();
106
- for value in rx {
107
- received_values.push(value);
108
- }
109
-
110
- // store downloaed mafs into one HashMap data sturcture based on the common column names
111
- let mut maf = HashMap::new();
112
- for maf_data in received_values {
113
- if maf.is_empty() {
114
- maf = gen_map(maf_data);
115
- // remove columns if column name is found from MAF_COL
116
- let mut keys_to_remove_in_maf: Vec<String> = Vec::new();
117
- for key in maf.keys() {
118
- if !(MAF_COL.contains(&key.as_str())) {
119
- keys_to_remove_in_maf.push(key.to_string());
120
- }
121
- };
122
- for key in keys_to_remove_in_maf {
123
- maf.remove(&key);
124
- }
125
- } else {
126
- let maf1 = gen_map(maf_data);
127
- let keys_in_maf1: Vec<String> = maf1.keys().cloned().collect();
128
- for key in keys_in_maf1 {
129
- if maf.contains_key(&key) {
130
- let key_value = maf1[&key].clone();
131
- maf.get_mut(&key).map(|val| val.extend(key_value));
132
- }
133
- }
134
- }
135
- };
136
-
137
-
138
- // generate a Vec with "chrom\tpos" for sorting
139
- // generated indices after sorting
134
+ // write downloaded maf (GZIP format) into a Vector
135
+ // lst_chrom_pos: a vector of chromosome & position info used for sorting the maf
136
+ // idx_sorted: indices after sorting based on chromosome & position
137
+ let mut maf_bit: Vec<Vec<u8>> = Vec::new();
140
138
  let mut lst_chrom_pos: Vec<String> = Vec::new();
141
- for (i,v) in maf["Chromosome"].iter().enumerate() {
142
- lst_chrom_pos.push(v.to_owned()+"\t"+&maf["Start_Position"][i]);
139
+ for (chr_pos_lst,maf_bit_lst) in rx {
140
+ maf_bit.extend_from_slice(&maf_bit_lst);
141
+ lst_chrom_pos.extend_from_slice(&chr_pos_lst);
143
142
  };
144
143
  let idx_sorted = get_sorted_indices(&lst_chrom_pos);
145
144
 
146
- // write to file
147
- let file = File::create(out_file).expect("could not create file");
148
- let mut encoder = GzEncoder::new(file, Default::default());
149
- encoder.write_all(MAF_COL.join("\t").as_bytes())?;
150
- encoder.write_all("\n".as_bytes())?;
145
+ // output
146
+ // maf_out_bit: A vector of GZIPPED maf
147
+ // compress_header: output header
148
+ let mut maf_out_bit: Vec<u8> = Vec::new();
149
+ let compress_header = gen_gzip_vec(MAF_COL.iter().map(|s| s.to_string()).collect());
150
+ maf_out_bit.extend(compress_header);
151
+ let compress_header_line_break = gen_gzip_vec(["\n".to_string()].to_vec());
152
+ maf_out_bit.extend(compress_header_line_break);
151
153
  for i in idx_sorted.iter() {
152
- let mut val_lst: Vec<String> = Vec::new();
153
- for k in MAF_COL {
154
- val_lst.push(maf[k][*i].to_owned());
155
- };
156
- let val_out = val_lst.join("\t")+"\n";
157
- encoder.write_all(val_out.as_bytes())?;
154
+ maf_out_bit.extend(&maf_bit[*i]);
158
155
  };
159
- encoder.finish()?;
156
+
157
+ // standard output
158
+ println!("{:?}",maf_out_bit);
159
+ std::io::stdout().flush().expect("Failed to flush stdout");
160
160
  Ok(())
161
161
  }