@sjcrh/proteinpaint-rust 2.29.6 → 2.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +9 -0
- package/package.json +1 -1
- package/src/DEanalysis.rs +27 -14
- package/src/gdcmaf.rs +161 -0
package/Cargo.toml
CHANGED
|
@@ -27,6 +27,11 @@ serde_json="^1.0.88"
|
|
|
27
27
|
num = "^0.4.1"
|
|
28
28
|
csv = "^1.2.2"
|
|
29
29
|
#r_mathlib="^0.2.0" # Uncomment this line to activate DE expression app for high sample sizes
|
|
30
|
+
tokio = { version="1", features = ["full"] }
|
|
31
|
+
reqwest = "0.11"
|
|
32
|
+
flate2 = "1"
|
|
33
|
+
futures = "0.3"
|
|
34
|
+
|
|
30
35
|
|
|
31
36
|
|
|
32
37
|
[profile.release]
|
|
@@ -63,6 +68,10 @@ path="src/sv.rs"
|
|
|
63
68
|
name="cluster"
|
|
64
69
|
path="src/cluster.rs"
|
|
65
70
|
|
|
71
|
+
[[bin]]
|
|
72
|
+
name="gdcmaf"
|
|
73
|
+
path="src/gdcmaf.rs"
|
|
74
|
+
|
|
66
75
|
#[[bin]]
|
|
67
76
|
#name="wilcoxon"
|
|
68
77
|
#path="src/wilcoxon.rs"
|
package/package.json
CHANGED
package/src/DEanalysis.rs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
// cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/
|
|
1
|
+
// cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
|
|
2
|
+
// cd .. && cargo build --release && cat ~/sjpp/test.txt | target/release/DEanalysis
|
|
2
3
|
#![allow(non_snake_case)]
|
|
3
4
|
use json;
|
|
4
5
|
use nalgebra::base::dimension::Const;
|
|
@@ -53,6 +54,8 @@ fn input_data(
|
|
|
53
54
|
let mut control_indexes_original: Vec<usize> = Vec::with_capacity(control_list.len());
|
|
54
55
|
let gene_name_index = headers.iter().position(|r| r == &"geneID");
|
|
55
56
|
let gene_symbol_index = headers.iter().position(|r| r == &"geneSymbol");
|
|
57
|
+
//let mut case_samples_not_found: Vec<&str> = Vec::with_capacity(case_list.len());
|
|
58
|
+
//let mut control_samples_not_found: Vec<&str> = Vec::with_capacity(control_list.len());
|
|
56
59
|
|
|
57
60
|
for item in case_list {
|
|
58
61
|
//println!("item:{}", item);
|
|
@@ -60,8 +63,8 @@ fn input_data(
|
|
|
60
63
|
match index {
|
|
61
64
|
Some(n) => case_indexes_original.push(n),
|
|
62
65
|
None => {
|
|
63
|
-
//
|
|
64
|
-
|
|
66
|
+
//panic!("Case sample not found:{}", item);
|
|
67
|
+
//case_samples_not_found.push(item);
|
|
65
68
|
}
|
|
66
69
|
}
|
|
67
70
|
}
|
|
@@ -72,8 +75,8 @@ fn input_data(
|
|
|
72
75
|
match index {
|
|
73
76
|
Some(n) => control_indexes_original.push(n),
|
|
74
77
|
None => {
|
|
75
|
-
//
|
|
76
|
-
|
|
78
|
+
//panic!("Control sample not found:{}", item);
|
|
79
|
+
//control_samples_not_found.push(item);
|
|
77
80
|
}
|
|
78
81
|
}
|
|
79
82
|
}
|
|
@@ -198,8 +201,8 @@ fn main() {
|
|
|
198
201
|
let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
|
|
199
202
|
filter_by_expr(
|
|
200
203
|
&input_matrix,
|
|
201
|
-
|
|
202
|
-
|
|
204
|
+
case_indexes.len(),
|
|
205
|
+
control_indexes.len(),
|
|
203
206
|
gene_names,
|
|
204
207
|
gene_symbols,
|
|
205
208
|
);
|
|
@@ -259,13 +262,23 @@ fn main() {
|
|
|
259
262
|
); // Setting continuity correction to true in case of normal approximation
|
|
260
263
|
let treated_mean = Data::new(treated).mean();
|
|
261
264
|
let control_mean = Data::new(control).mean();
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
265
|
+
if (treated_mean.unwrap() / control_mean.unwrap())
|
|
266
|
+
.log2()
|
|
267
|
+
.is_nan()
|
|
268
|
+
== false
|
|
269
|
+
&& (treated_mean.unwrap() / control_mean.unwrap())
|
|
270
|
+
.log2()
|
|
271
|
+
.is_infinite()
|
|
272
|
+
== false
|
|
273
|
+
{
|
|
274
|
+
p_values.push(PValueIndexes {
|
|
275
|
+
index: i,
|
|
276
|
+
gene_name: filtered_genes[i].to_owned(),
|
|
277
|
+
gene_symbol: filtered_gene_symbols[i].to_owned(),
|
|
278
|
+
fold_change: (treated_mean.unwrap() / control_mean.unwrap()).log2(),
|
|
279
|
+
p_value: p_value,
|
|
280
|
+
});
|
|
281
|
+
}
|
|
269
282
|
}
|
|
270
283
|
//println!("p_values:{:?}", p_values);
|
|
271
284
|
println!(
|
package/src/gdcmaf.rs
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
use flate2::read::GzDecoder;
|
|
2
|
+
use flate2::write::GzEncoder;
|
|
3
|
+
use serde_json::Value;
|
|
4
|
+
use std::fs::File;
|
|
5
|
+
use std::path::Path;
|
|
6
|
+
use futures::StreamExt;
|
|
7
|
+
use std::io;
|
|
8
|
+
use std::io::{Read,Write};
|
|
9
|
+
use std::sync::mpsc;
|
|
10
|
+
use std::collections::HashMap;
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
/// Parses the text of one MAF file into a column-oriented map.
///
/// * Lines starting with `#` and blank lines are skipped.
/// * A line containing `Hugo_Symbol` is taken as the header; every
///   tab-separated name in it becomes a key with an empty column.
/// * Every later line is a data row: field `i` is appended to the column named
///   by header field `i`.
///
/// Each data row contributes exactly one value to every header position, so all
/// columns stay the same length and can be addressed by row index:
/// missing trailing fields are stored as empty strings and fields beyond the
/// header width are ignored. Rows that appear before any header are ignored.
///
/// Returns an empty map when the input contains no header line.
fn gen_map(d: String) -> HashMap<String, Vec<String>> {
    let mut map: HashMap<String, Vec<String>> = HashMap::new();
    let mut header: Vec<String> = Vec::new();

    // `lines()` handles both `\n` and `\r\n` and, unlike `trim_end()`, does not
    // strip trailing tabs, so empty trailing fields of the last row survive.
    for line in d.lines() {
        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        if line.contains("Hugo_Symbol") {
            header = line.split('\t').map(str::to_string).collect();
            for name in &header {
                map.insert(name.clone(), Vec::new());
            }
        } else if !header.is_empty() {
            let mut fields = line.split('\t');
            // Walk the header rather than the row so that short rows are padded
            // and over-long rows cannot index past the header.
            for name in &header {
                let value = fields.next().unwrap_or("");
                if let Some(column) = map.get_mut(name) {
                    column.push(value.to_string());
                }
            }
        }
    }
    map
}
|
|
34
|
+
|
|
35
|
+
/// Returns the indices of `lst` ordered by chromosome, then by position.
///
/// Each entry of `lst` is expected to look like `"<chrom>\t<pos>"`. Entries are
/// ordered by the chromosome string (plain lexicographic comparison) and, within
/// one chromosome, by the position parsed as `u32`. Entries whose position is
/// missing or not a valid `u32` sort after all entries of the same chromosome
/// that do have a valid position. The sort is stable, so ties keep input order.
fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize> {
    // Parse every entry once up front instead of re-splitting both operands on
    // each comparison. The `bool` is "position is invalid", so `false < true`
    // places valid positions first.
    let keys: Vec<(&str, bool, u32)> = lst
        .iter()
        .map(|entry| {
            let mut fields = entry.split('\t');
            let chrom = fields.next().unwrap_or("");
            match fields.next().and_then(|pos| pos.parse::<u32>().ok()) {
                Some(pos) => (chrom, false, pos),
                None => (chrom, true, 0),
            }
        })
        .collect();

    let mut indices: Vec<usize> = (0..lst.len()).collect();
    indices.sort_by(|&a, &b| keys[a].cmp(&keys[b]));
    indices
}
|
|
43
|
+
|
|
44
|
+
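// The 96 GDC MAF column names. Downloaded columns not listed here are dropped, and the merged file is written with exactly these columns, in this order.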
|
|
45
|
+
const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
|
|
46
|
+
"Start_Position", "End_Position", "Strand", "Variant_Classification",
|
|
47
|
+
"Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
|
|
48
|
+
"dbSNP_RS", "dbSNP_Val_Status", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode",
|
|
49
|
+
"Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2", "Tumor_Validation_Allele1",
|
|
50
|
+
"Tumor_Validation_Allele2", "Match_Norm_Validation_Allele1", "Match_Norm_Validation_Allele2",
|
|
51
|
+
"Verification_Status", "Validation_Status", "Mutation_Status", "Sequencing_Phase",
|
|
52
|
+
"Sequence_Source", "Validation_Method", "Score", "BAM_File", "Sequencer",
|
|
53
|
+
"Tumor_Sample_UUID", "Matched_Norm_Sample_UUID", "HGVSc", "HGVSp", "HGVSp_Short",
|
|
54
|
+
"Transcript_ID", "Exon_Number", "t_depth", "t_ref_count", "t_alt_count", "n_depth",
|
|
55
|
+
"n_ref_count", "n_alt_count", "all_effects", "Allele", "Gene", "Feature", "Feature_type",
|
|
56
|
+
"One_Consequence", "Consequence", "cDNA_position", "CDS_position", "Protein_position",
|
|
57
|
+
"Amino_acids", "Codons", "Existing_variation", "DISTANCE", "TRANSCRIPT_STRAND", "SYMBOL",
|
|
58
|
+
"SYMBOL_SOURCE", "HGNC_ID", "BIOTYPE", "CANONICAL", "CCDS", "ENSP", "SWISSPROT", "TREMBL",
|
|
59
|
+
"UNIPARC", "RefSeq", "SIFT", "PolyPhen", "EXON", "INTRON", "DOMAINS", "CLIN_SIG", "SOMATIC",
|
|
60
|
+
"PUBMED", "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "IMPACT", "PICK",
|
|
61
|
+
"VARIANT_CLASS", "TSL", "HGVS_OFFSET", "PHENO", "GENE_PHENO", "CONTEXT", "tumor_bam_uuid",
|
|
62
|
+
"normal_bam_uuid", "case_id", "GDC_FILTER", "COSMIC"];
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
#[tokio::main]
|
|
66
|
+
async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
67
|
+
// Accepting the piped input json from jodejs and assign to the variable
|
|
68
|
+
// host: GDC host
|
|
69
|
+
// out_file: save maf to out_file under cachedir
|
|
70
|
+
// url: urls to download single maf files
|
|
71
|
+
let mut buffer = String::new();
|
|
72
|
+
io::stdin().read_line(&mut buffer)?;
|
|
73
|
+
let file_id_lst_js = serde_json::from_str::<Value>(&buffer).expect("Error reading input and serializing to JSON");
|
|
74
|
+
let host = &file_id_lst_js["host"].as_str().unwrap();
|
|
75
|
+
let out_file = &file_id_lst_js["outFile"].as_str().unwrap();
|
|
76
|
+
let mut url: Vec<String> = Vec::new();
|
|
77
|
+
for v in file_id_lst_js["fileIdLst"].as_array().unwrap() {
|
|
78
|
+
url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
|
|
79
|
+
};
|
|
80
|
+
|
|
81
|
+
//downloading maf files parallelly and merge them into single maf file
|
|
82
|
+
let (tx, rx) = mpsc::channel();
|
|
83
|
+
let fetches = futures::stream::iter(
|
|
84
|
+
url.into_iter().map(|url|{
|
|
85
|
+
let txt = tx.clone();
|
|
86
|
+
async move {
|
|
87
|
+
match reqwest::get(&url).await{
|
|
88
|
+
Ok(resp) => {
|
|
89
|
+
let content = resp.bytes().await.unwrap();
|
|
90
|
+
let mut decoder = GzDecoder::new(&content[..]);
|
|
91
|
+
let mut decompressed_content = Vec::new();
|
|
92
|
+
decoder.read_to_end(&mut decompressed_content).unwrap();
|
|
93
|
+
let text = String::from_utf8_lossy(&decompressed_content);
|
|
94
|
+
txt.send(text.to_string()).unwrap();
|
|
95
|
+
}
|
|
96
|
+
Err(_) => println!("ERROR downloading {}", url),
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
})
|
|
100
|
+
).buffer_unordered(20).collect::<Vec<()>>();
|
|
101
|
+
fetches.await;
|
|
102
|
+
drop(tx);
|
|
103
|
+
|
|
104
|
+
// write downloaded maf into variable received_values
|
|
105
|
+
let mut received_values: Vec<String> = Vec::new();
|
|
106
|
+
for value in rx {
|
|
107
|
+
received_values.push(value);
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
// store downloaed mafs into one HashMap data sturcture based on the common column names
|
|
111
|
+
let mut maf = HashMap::new();
|
|
112
|
+
for maf_data in received_values {
|
|
113
|
+
if maf.is_empty() {
|
|
114
|
+
maf = gen_map(maf_data);
|
|
115
|
+
// remove columns if column name is found from MAF_COL
|
|
116
|
+
let mut keys_to_remove_in_maf: Vec<String> = Vec::new();
|
|
117
|
+
for key in maf.keys() {
|
|
118
|
+
if !(MAF_COL.contains(&key.as_str())) {
|
|
119
|
+
keys_to_remove_in_maf.push(key.to_string());
|
|
120
|
+
}
|
|
121
|
+
};
|
|
122
|
+
for key in keys_to_remove_in_maf {
|
|
123
|
+
maf.remove(&key);
|
|
124
|
+
}
|
|
125
|
+
} else {
|
|
126
|
+
let maf1 = gen_map(maf_data);
|
|
127
|
+
let keys_in_maf1: Vec<String> = maf1.keys().cloned().collect();
|
|
128
|
+
for key in keys_in_maf1 {
|
|
129
|
+
if maf.contains_key(&key) {
|
|
130
|
+
let key_value = maf1[&key].clone();
|
|
131
|
+
maf.get_mut(&key).map(|val| val.extend(key_value));
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
};
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
// generate a Vec with "chrom\tpos" for sorting
|
|
139
|
+
// generated indices after sorting
|
|
140
|
+
let mut lst_chrom_pos: Vec<String> = Vec::new();
|
|
141
|
+
for (i,v) in maf["Chromosome"].iter().enumerate() {
|
|
142
|
+
lst_chrom_pos.push(v.to_owned()+"\t"+&maf["Start_Position"][i]);
|
|
143
|
+
};
|
|
144
|
+
let idx_sorted = get_sorted_indices(&lst_chrom_pos);
|
|
145
|
+
|
|
146
|
+
// write to file
|
|
147
|
+
let file = File::create(out_file).expect("could not create file");
|
|
148
|
+
let mut encoder = GzEncoder::new(file, Default::default());
|
|
149
|
+
encoder.write_all(MAF_COL.join("\t").as_bytes())?;
|
|
150
|
+
encoder.write_all("\n".as_bytes())?;
|
|
151
|
+
for i in idx_sorted.iter() {
|
|
152
|
+
let mut val_lst: Vec<String> = Vec::new();
|
|
153
|
+
for k in MAF_COL {
|
|
154
|
+
val_lst.push(maf[k][*i].to_owned());
|
|
155
|
+
};
|
|
156
|
+
let val_out = val_lst.join("\t")+"\n";
|
|
157
|
+
encoder.write_all(val_out.as_bytes())?;
|
|
158
|
+
};
|
|
159
|
+
encoder.finish()?;
|
|
160
|
+
Ok(())
|
|
161
|
+
}
|