@sjcrh/proteinpaint-rust 2.29.6 → 2.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -27,6 +27,11 @@ serde_json="^1.0.88"
 num = "^0.4.1"
 csv = "^1.2.2"
 #r_mathlib="^0.2.0" # Uncomment this line to activate DE expression app for high sample sizes
+tokio = { version="1", features = ["full"] }
+reqwest = "0.11"
+flate2 = "1"
+futures = "0.3"
+
 
 
 [profile.release]
@@ -63,6 +68,10 @@ path="src/sv.rs"
 name="cluster"
 path="src/cluster.rs"
 
+[[bin]]
+name="gdcmaf"
+path="src/gdcmaf.rs"
+
 #[[bin]]
 #name="wilcoxon"
 #path="src/wilcoxon.rs"
package/package.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "version": "2.29.6",
+  "version": "2.30.2",
   "name": "@sjcrh/proteinpaint-rust",
   "description": "Rust-based utilities for proteinpaint",
   "main": "index.js",
package/src/DEanalysis.rs CHANGED
@@ -1,4 +1,5 @@
-// cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/expression
+// cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
+// cd .. && cargo build --release && cat ~/sjpp/test.txt | target/release/DEanalysis
 #![allow(non_snake_case)]
 use json;
 use nalgebra::base::dimension::Const;
@@ -53,6 +54,8 @@ fn input_data(
     let mut control_indexes_original: Vec<usize> = Vec::with_capacity(control_list.len());
     let gene_name_index = headers.iter().position(|r| r == &"geneID");
     let gene_symbol_index = headers.iter().position(|r| r == &"geneSymbol");
+    //let mut case_samples_not_found: Vec<&str> = Vec::with_capacity(case_list.len());
+    //let mut control_samples_not_found: Vec<&str> = Vec::with_capacity(control_list.len());
 
     for item in case_list {
         //println!("item:{}", item);
@@ -60,8 +63,8 @@ fn input_data(
         match index {
             Some(n) => case_indexes_original.push(n),
             None => {
-                // When sample not found, give error stating the sample name is not found
-                panic!("Case sample not found:{}", item);
+                //panic!("Case sample not found:{}", item);
+                //case_samples_not_found.push(item);
             }
         }
     }
@@ -72,8 +75,8 @@ fn input_data(
         match index {
             Some(n) => control_indexes_original.push(n),
             None => {
-                // When sample not found, give error stating the sample name is not found
-                panic!("Control sample not found:{}", item);
+                //panic!("Control sample not found:{}", item);
+                //control_samples_not_found.push(item);
             }
         }
     }
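
Note: the two hunks above remove the hard panic when a requested case or control sample is missing from the header of the input counts file; the commented-out case_samples_not_found / control_samples_not_found vectors suggest collecting the missing names instead of aborting. A minimal sketch of that collect-instead-of-panic pattern, using a hypothetical helper that is not part of the released code:

    // Sketch only: collect samples missing from the header instead of panicking.
    // How (or whether) the not-found list is reported downstream is not shown in this diff.
    fn resolve_indexes<'a>(samples: &'a [String], headers: &[String]) -> (Vec<usize>, Vec<&'a str>) {
        let mut indexes: Vec<usize> = Vec::with_capacity(samples.len());
        let mut not_found: Vec<&str> = Vec::new();
        for item in samples {
            match headers.iter().position(|h| h == item) {
                Some(n) => indexes.push(n),
                None => not_found.push(item.as_str()),
            }
        }
        (indexes, not_found)
    }
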
@@ -198,8 +201,8 @@ fn main() {
     let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
         filter_by_expr(
             &input_matrix,
-            case_list.len(),
-            control_list.len(),
+            case_indexes.len(),
+            control_indexes.len(),
             gene_names,
             gene_symbols,
         );
@@ -259,13 +262,23 @@ fn main() {
         ); // Setting continuity correction to true in case of normal approximation
         let treated_mean = Data::new(treated).mean();
         let control_mean = Data::new(control).mean();
-        p_values.push(PValueIndexes {
-            index: i,
-            gene_name: filtered_genes[i].to_owned(),
-            gene_symbol: filtered_gene_symbols[i].to_owned(),
-            fold_change: (treated_mean.unwrap() / control_mean.unwrap()).log2(),
-            p_value: p_value,
-        });
+        if (treated_mean.unwrap() / control_mean.unwrap())
+            .log2()
+            .is_nan()
+            == false
+            && (treated_mean.unwrap() / control_mean.unwrap())
+                .log2()
+                .is_infinite()
+                == false
+        {
+            p_values.push(PValueIndexes {
+                index: i,
+                gene_name: filtered_genes[i].to_owned(),
+                gene_symbol: filtered_gene_symbols[i].to_owned(),
+                fold_change: (treated_mean.unwrap() / control_mean.unwrap()).log2(),
+                p_value: p_value,
+            });
+        }
     }
     //println!("p_values:{:?}", p_values);
     println!(
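
Note: the last hunk skips genes whose log2 fold change is NaN or infinite (for example when the control mean is zero) instead of pushing them into p_values. A minimal sketch of an equivalent check, factored into a hypothetical helper that is not in the release; f64::is_finite() is false for both NaN and infinity:

    // Sketch only: equivalent finite-value guard for the fold change.
    fn finite_log2_fold_change(treated_mean: f64, control_mean: f64) -> Option<f64> {
        let fc = (treated_mean / control_mean).log2();
        if fc.is_finite() { Some(fc) } else { None }
    }
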
package/src/gdcmaf.rs ADDED
@@ -0,0 +1,161 @@
+use flate2::read::GzDecoder;
+use flate2::write::GzEncoder;
+use serde_json::Value;
+use std::fs::File;
+use std::path::Path;
+use futures::StreamExt;
+use std::io;
+use std::io::{Read, Write};
+use std::sync::mpsc;
+use std::collections::HashMap;
+
+
+fn gen_map(d: String) -> HashMap<String, Vec<String>> {
+    let mut map: HashMap<String, Vec<String>> = HashMap::new();
+    let mut header: Vec<String> = Vec::new();
+    let lines = d.trim_end().split("\n");
+    for line in lines {
+        if line.starts_with("#") {
+            continue
+        } else if line.contains("Hugo_Symbol") {
+            header = line.split("\t").map(|s| s.to_string()).collect();
+            for k in &header {
+                map.insert(k.to_string(), Vec::new());
+            }
+        } else {
+            let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
+            for (i, x) in maf_cont_lst.iter().enumerate() {
+                map.get_mut(&header[i]).map(|val| val.push(x.to_string()));
+            }
+        }
+    }
+    map
+}
+
+fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize> {
+    let mut indices = (0..lst.len()).collect::<Vec<usize>>();
+    indices.sort_by(|a, b| {
+        lst[*a].split('\t').collect::<Vec<&str>>()[0].cmp(lst[*b].split('\t').collect::<Vec<&str>>()[0])
+            .then(lst[*a].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap().cmp(&lst[*b].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap()))
+    });
+    indices
+}
+
+// GDC MAF columns (96)
+const MAF_COL: [&str; 96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
+    "Start_Position", "End_Position", "Strand", "Variant_Classification",
+    "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
+    "dbSNP_RS", "dbSNP_Val_Status", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode",
+    "Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2", "Tumor_Validation_Allele1",
+    "Tumor_Validation_Allele2", "Match_Norm_Validation_Allele1", "Match_Norm_Validation_Allele2",
+    "Verification_Status", "Validation_Status", "Mutation_Status", "Sequencing_Phase",
+    "Sequence_Source", "Validation_Method", "Score", "BAM_File", "Sequencer",
+    "Tumor_Sample_UUID", "Matched_Norm_Sample_UUID", "HGVSc", "HGVSp", "HGVSp_Short",
+    "Transcript_ID", "Exon_Number", "t_depth", "t_ref_count", "t_alt_count", "n_depth",
+    "n_ref_count", "n_alt_count", "all_effects", "Allele", "Gene", "Feature", "Feature_type",
+    "One_Consequence", "Consequence", "cDNA_position", "CDS_position", "Protein_position",
+    "Amino_acids", "Codons", "Existing_variation", "DISTANCE", "TRANSCRIPT_STRAND", "SYMBOL",
+    "SYMBOL_SOURCE", "HGNC_ID", "BIOTYPE", "CANONICAL", "CCDS", "ENSP", "SWISSPROT", "TREMBL",
+    "UNIPARC", "RefSeq", "SIFT", "PolyPhen", "EXON", "INTRON", "DOMAINS", "CLIN_SIG", "SOMATIC",
+    "PUBMED", "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "IMPACT", "PICK",
+    "VARIANT_CLASS", "TSL", "HGVS_OFFSET", "PHENO", "GENE_PHENO", "CONTEXT", "tumor_bam_uuid",
+    "normal_bam_uuid", "case_id", "GDC_FILTER", "COSMIC"];
+
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Accept the piped input JSON from nodejs and assign it to variables
+    // host: GDC host
+    // out_file: save the merged MAF to out_file under the cache dir
+    // url: URLs to download individual MAF files
+    let mut buffer = String::new();
+    io::stdin().read_line(&mut buffer)?;
+    let file_id_lst_js = serde_json::from_str::<Value>(&buffer).expect("Error reading input and serializing to JSON");
+    let host = &file_id_lst_js["host"].as_str().unwrap();
+    let out_file = &file_id_lst_js["outFile"].as_str().unwrap();
+    let mut url: Vec<String> = Vec::new();
+    for v in file_id_lst_js["fileIdLst"].as_array().unwrap() {
+        url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
+    };
+
+    // download MAF files in parallel and merge them into a single MAF file
+    let (tx, rx) = mpsc::channel();
+    let fetches = futures::stream::iter(
+        url.into_iter().map(|url| {
+            let txt = tx.clone();
+            async move {
+                match reqwest::get(&url).await {
+                    Ok(resp) => {
+                        let content = resp.bytes().await.unwrap();
+                        let mut decoder = GzDecoder::new(&content[..]);
+                        let mut decompressed_content = Vec::new();
+                        decoder.read_to_end(&mut decompressed_content).unwrap();
+                        let text = String::from_utf8_lossy(&decompressed_content);
+                        txt.send(text.to_string()).unwrap();
+                    }
+                    Err(_) => println!("ERROR downloading {}", url),
+                }
+            }
+        })
+    ).buffer_unordered(20).collect::<Vec<()>>();
+    fetches.await;
+    drop(tx);
+
+    // collect the downloaded MAFs into the variable received_values
+    let mut received_values: Vec<String> = Vec::new();
+    for value in rx {
+        received_values.push(value);
+    }
+
+    // store the downloaded MAFs into one HashMap data structure based on the common column names
+    let mut maf = HashMap::new();
+    for maf_data in received_values {
+        if maf.is_empty() {
+            maf = gen_map(maf_data);
+            // remove columns whose names are not found in MAF_COL
+            let mut keys_to_remove_in_maf: Vec<String> = Vec::new();
+            for key in maf.keys() {
+                if !(MAF_COL.contains(&key.as_str())) {
+                    keys_to_remove_in_maf.push(key.to_string());
+                }
+            };
+            for key in keys_to_remove_in_maf {
+                maf.remove(&key);
+            }
+        } else {
+            let maf1 = gen_map(maf_data);
+            let keys_in_maf1: Vec<String> = maf1.keys().cloned().collect();
+            for key in keys_in_maf1 {
+                if maf.contains_key(&key) {
+                    let key_value = maf1[&key].clone();
+                    maf.get_mut(&key).map(|val| val.extend(key_value));
+                }
+            }
+        }
+    };
+
+
+    // generate a Vec of "chrom\tpos" strings for sorting
+    // and get the row indices after sorting
+    let mut lst_chrom_pos: Vec<String> = Vec::new();
+    for (i, v) in maf["Chromosome"].iter().enumerate() {
+        lst_chrom_pos.push(v.to_owned() + "\t" + &maf["Start_Position"][i]);
+    };
+    let idx_sorted = get_sorted_indices(&lst_chrom_pos);
+
+    // write to file
+    let file = File::create(out_file).expect("could not create file");
+    let mut encoder = GzEncoder::new(file, Default::default());
+    encoder.write_all(MAF_COL.join("\t").as_bytes())?;
+    encoder.write_all("\n".as_bytes())?;
+    for i in idx_sorted.iter() {
+        let mut val_lst: Vec<String> = Vec::new();
+        for k in MAF_COL {
+            val_lst.push(maf[k][*i].to_owned());
+        };
+        let val_out = val_lst.join("\t") + "\n";
+        encoder.write_all(val_out.as_bytes())?;
+    };
+    encoder.finish()?;
+    Ok(())
+}
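
Note: the new gdcmaf binary reads one JSON line from stdin with the fields host, outFile, and fileIdLst, downloads each gzipped MAF in parallel (up to 20 at a time), keeps only the 96 GDC MAF columns listed in MAF_COL, sorts rows by Chromosome and Start_Position, and writes a single gzipped MAF to outFile. A minimal sketch of building that stdin payload with serde_json; the host URL, output path, and file IDs below are placeholders, not values taken from this release:

    // Sketch only: assemble the one-line JSON that gdcmaf expects on stdin.
    // All values here are placeholders for illustration.
    use serde_json::json;

    fn main() {
        let input = json!({
            "host": "https://example-gdc-host/data",        // GDC download host (placeholder)
            "outFile": "/path/to/cachedir/cohort.maf.gz",    // merged, gzipped output (placeholder)
            "fileIdLst": ["<maf-file-uuid-1>", "<maf-file-uuid-2>"]
        });
        println!("{}", input); // pipe this line into target/release/gdcmaf
    }
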