@sjcrh/proteinpaint-rust 2.29.6 → 2.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +9 -0
- package/package.json +1 -1
- package/src/DEanalysis.rs +27 -14
- package/src/gdcmaf.rs +161 -0
package/Cargo.toml
CHANGED
|
@@ -27,6 +27,11 @@ serde_json="^1.0.88"
|
|
|
27
27
|
num = "^0.4.1"
|
|
28
28
|
csv = "^1.2.2"
|
|
29
29
|
#r_mathlib="^0.2.0" # Uncomment this line to activate DE expression app for high sample sizes
|
|
30
|
+
tokio = { version="1", features = ["full"] }
|
|
31
|
+
reqwest = "0.11"
|
|
32
|
+
flate2 = "1"
|
|
33
|
+
futures = "0.3"
|
|
34
|
+
|
|
30
35
|
|
|
31
36
|
|
|
32
37
|
[profile.release]
|
|
@@ -63,6 +68,10 @@ path="src/sv.rs"
|
|
|
63
68
|
name="cluster"
|
|
64
69
|
path="src/cluster.rs"
|
|
65
70
|
|
|
71
|
+
[[bin]]
|
|
72
|
+
name="gdcmaf"
|
|
73
|
+
path="src/gdcmaf.rs"
|
|
74
|
+
|
|
66
75
|
#[[bin]]
|
|
67
76
|
#name="wilcoxon"
|
|
68
77
|
#path="src/wilcoxon.rs"
|
package/package.json
CHANGED
package/src/DEanalysis.rs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
// cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/
|
|
1
|
+
// cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
|
|
2
|
+
// cd .. && cargo build --release && cat ~/sjpp/test.txt | target/release/DEanalysis
|
|
2
3
|
#![allow(non_snake_case)]
|
|
3
4
|
use json;
|
|
4
5
|
use nalgebra::base::dimension::Const;
|
|
@@ -53,6 +54,8 @@ fn input_data(
|
|
|
53
54
|
let mut control_indexes_original: Vec<usize> = Vec::with_capacity(control_list.len());
|
|
54
55
|
let gene_name_index = headers.iter().position(|r| r == &"geneID");
|
|
55
56
|
let gene_symbol_index = headers.iter().position(|r| r == &"geneSymbol");
|
|
57
|
+
//let mut case_samples_not_found: Vec<&str> = Vec::with_capacity(case_list.len());
|
|
58
|
+
//let mut control_samples_not_found: Vec<&str> = Vec::with_capacity(control_list.len());
|
|
56
59
|
|
|
57
60
|
for item in case_list {
|
|
58
61
|
//println!("item:{}", item);
|
|
@@ -60,8 +63,8 @@ fn input_data(
|
|
|
60
63
|
match index {
|
|
61
64
|
Some(n) => case_indexes_original.push(n),
|
|
62
65
|
None => {
|
|
63
|
-
//
|
|
64
|
-
|
|
66
|
+
//panic!("Case sample not found:{}", item);
|
|
67
|
+
//case_samples_not_found.push(item);
|
|
65
68
|
}
|
|
66
69
|
}
|
|
67
70
|
}
|
|
@@ -72,8 +75,8 @@ fn input_data(
|
|
|
72
75
|
match index {
|
|
73
76
|
Some(n) => control_indexes_original.push(n),
|
|
74
77
|
None => {
|
|
75
|
-
//
|
|
76
|
-
|
|
78
|
+
//panic!("Control sample not found:{}", item);
|
|
79
|
+
//control_samples_not_found.push(item);
|
|
77
80
|
}
|
|
78
81
|
}
|
|
79
82
|
}
|
|
@@ -198,8 +201,8 @@ fn main() {
|
|
|
198
201
|
let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
|
|
199
202
|
filter_by_expr(
|
|
200
203
|
&input_matrix,
|
|
201
|
-
|
|
202
|
-
|
|
204
|
+
case_indexes.len(),
|
|
205
|
+
control_indexes.len(),
|
|
203
206
|
gene_names,
|
|
204
207
|
gene_symbols,
|
|
205
208
|
);
|
|
@@ -259,13 +262,23 @@ fn main() {
|
|
|
259
262
|
); // Setting continuity correction to true in case of normal approximation
|
|
260
263
|
let treated_mean = Data::new(treated).mean();
|
|
261
264
|
let control_mean = Data::new(control).mean();
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
265
|
+
if (treated_mean.unwrap() / control_mean.unwrap())
|
|
266
|
+
.log2()
|
|
267
|
+
.is_nan()
|
|
268
|
+
== false
|
|
269
|
+
&& (treated_mean.unwrap() / control_mean.unwrap())
|
|
270
|
+
.log2()
|
|
271
|
+
.is_infinite()
|
|
272
|
+
== false
|
|
273
|
+
{
|
|
274
|
+
p_values.push(PValueIndexes {
|
|
275
|
+
index: i,
|
|
276
|
+
gene_name: filtered_genes[i].to_owned(),
|
|
277
|
+
gene_symbol: filtered_gene_symbols[i].to_owned(),
|
|
278
|
+
fold_change: (treated_mean.unwrap() / control_mean.unwrap()).log2(),
|
|
279
|
+
p_value: p_value,
|
|
280
|
+
});
|
|
281
|
+
}
|
|
269
282
|
}
|
|
270
283
|
//println!("p_values:{:?}", p_values);
|
|
271
284
|
println!(
|
package/src/gdcmaf.rs
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
use flate2::read::GzDecoder;
|
|
2
|
+
use flate2::write::GzEncoder;
|
|
3
|
+
use flate2::Compression;
|
|
4
|
+
use serde_json::Value;
|
|
5
|
+
use std::path::Path;
|
|
6
|
+
use futures::StreamExt;
|
|
7
|
+
use std::io;
|
|
8
|
+
use std::io::{Read,Write};
|
|
9
|
+
use std::sync::mpsc;
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
fn gen_vec(d:String) -> (Vec<String>,Vec<Vec<u8>>) {
|
|
14
|
+
let mut maf_bit: Vec<Vec<u8>> = Vec::new();
|
|
15
|
+
let mut lst_chrom_pos: Vec<String> = Vec::new();
|
|
16
|
+
let mut header_indices: Vec<usize> = Vec::new();
|
|
17
|
+
let mut chrom_index: usize = 9999;
|
|
18
|
+
let mut pos_index: usize = 9999;
|
|
19
|
+
let lines = d.trim_end().split("\n");
|
|
20
|
+
for line in lines {
|
|
21
|
+
if line.starts_with("#") {
|
|
22
|
+
continue
|
|
23
|
+
} else if line.contains("Hugo_Symbol") {
|
|
24
|
+
let header: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
25
|
+
for col in MAF_COL {
|
|
26
|
+
let col_index: usize = header.iter().position(|x| x == col).unwrap();
|
|
27
|
+
header_indices.push(col_index);
|
|
28
|
+
if col == "Chromosome" {
|
|
29
|
+
chrom_index = col_index;
|
|
30
|
+
} else if col == "Start_Position" {
|
|
31
|
+
pos_index = col_index;
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
} else {
|
|
35
|
+
let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
36
|
+
let mut maf_out_lst: Vec<String> = Vec::new();
|
|
37
|
+
let mut chrom = String::new();
|
|
38
|
+
let mut pos = String::new();
|
|
39
|
+
for (i,x) in header_indices.iter().enumerate() {
|
|
40
|
+
maf_out_lst.push(maf_cont_lst[*x].to_string());
|
|
41
|
+
if chrom_index != 9999 && i == chrom_index {
|
|
42
|
+
chrom = maf_cont_lst[*x].to_string();
|
|
43
|
+
} else if pos_index != 9999 && i == pos_index {
|
|
44
|
+
pos = maf_cont_lst[*x].to_string();
|
|
45
|
+
}
|
|
46
|
+
};
|
|
47
|
+
maf_out_lst.push("\n".to_string());
|
|
48
|
+
let maf_compress_data = gen_gzip_vec(maf_out_lst);
|
|
49
|
+
maf_bit.push(maf_compress_data);
|
|
50
|
+
lst_chrom_pos.push(chrom+"\t"+&pos);
|
|
51
|
+
}
|
|
52
|
+
};
|
|
53
|
+
(lst_chrom_pos,maf_bit)
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/// Returns the indices of `lst` ordered by chromosome, then by numeric position.
///
/// Every entry is expected to have the form `"<chromosome>\t<position>"`.
/// Chromosome names are compared as plain strings (so `chr10` sorts before `chr2`);
/// positions are compared as unsigned integers. The sort is stable, so entries
/// with identical keys keep their relative input order.
///
/// Entries whose position is missing or not a valid unsigned integer do not cause
/// a panic; they are placed after all well-formed entries of the same chromosome.
fn get_sorted_indices(lst: &[String]) -> Vec<usize> {
    // Parse each key exactly once; the comparator below then only compares tuples
    // instead of re-splitting and re-parsing both strings on every comparison.
    let keys: Vec<(&str, bool, u64)> = lst
        .iter()
        .map(|entry| {
            let mut parts = entry.split('\t');
            let chrom = parts.next().unwrap_or("");
            match parts.next().and_then(|p| p.parse::<u64>().ok()) {
                Some(pos) => (chrom, false, pos),
                // `true` sorts after `false`: malformed positions go last.
                None => (chrom, true, 0),
            }
        })
        .collect();

    let mut indices: Vec<usize> = (0..lst.len()).collect();
    indices.sort_by(|&a, &b| keys[a].cmp(&keys[b]));
    indices
}
|
|
64
|
+
|
|
65
|
+
// convert vector (maf row) to GZIP encoded format
|
|
66
|
+
fn gen_gzip_vec(s:Vec<String>) -> Vec<u8> {
|
|
67
|
+
let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
|
|
68
|
+
let _ = encoder.write_all(s.join("\t").as_bytes());
|
|
69
|
+
let compress_data = encoder.finish().unwrap();
|
|
70
|
+
compress_data
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
/// Names of the 96 GDC MAF columns that are kept, in output order.
const MAF_COL: [&str; 96] = [
    "Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build",
    "Chromosome", "Start_Position", "End_Position", "Strand",
    "Variant_Classification", "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1",
    "Tumor_Seq_Allele2", "dbSNP_RS", "dbSNP_Val_Status", "Tumor_Sample_Barcode",
    "Matched_Norm_Sample_Barcode", "Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2",
    "Tumor_Validation_Allele1", "Tumor_Validation_Allele2", "Match_Norm_Validation_Allele1",
    "Match_Norm_Validation_Allele2", "Verification_Status", "Validation_Status",
    "Mutation_Status", "Sequencing_Phase", "Sequence_Source", "Validation_Method",
    "Score", "BAM_File", "Sequencer", "Tumor_Sample_UUID",
    "Matched_Norm_Sample_UUID", "HGVSc", "HGVSp", "HGVSp_Short",
    "Transcript_ID", "Exon_Number", "t_depth", "t_ref_count",
    "t_alt_count", "n_depth", "n_ref_count", "n_alt_count",
    "all_effects", "Allele", "Gene", "Feature",
    "Feature_type", "One_Consequence", "Consequence", "cDNA_position",
    "CDS_position", "Protein_position", "Amino_acids", "Codons",
    "Existing_variation", "DISTANCE", "TRANSCRIPT_STRAND", "SYMBOL",
    "SYMBOL_SOURCE", "HGNC_ID", "BIOTYPE", "CANONICAL",
    "CCDS", "ENSP", "SWISSPROT", "TREMBL",
    "UNIPARC", "RefSeq", "SIFT", "PolyPhen",
    "EXON", "INTRON", "DOMAINS", "CLIN_SIG",
    "SOMATIC", "PUBMED", "MOTIF_NAME", "MOTIF_POS",
    "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "IMPACT", "PICK",
    "VARIANT_CLASS", "TSL", "HGVS_OFFSET", "PHENO",
    "GENE_PHENO", "CONTEXT", "tumor_bam_uuid", "normal_bam_uuid",
    "case_id", "GDC_FILTER", "COSMIC",
];
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
#[tokio::main]
|
|
95
|
+
async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
96
|
+
// Accepting the piped input json from jodejs and assign to the variable
|
|
97
|
+
// host: GDC host
|
|
98
|
+
// save output into json string
|
|
99
|
+
// url: urls to download single maf files
|
|
100
|
+
let mut buffer = String::new();
|
|
101
|
+
io::stdin().read_line(&mut buffer)?;
|
|
102
|
+
let file_id_lst_js = serde_json::from_str::<Value>(&buffer).expect("Error reading input and serializing to JSON");
|
|
103
|
+
let host = &file_id_lst_js["host"].as_str().unwrap();
|
|
104
|
+
let mut url: Vec<String> = Vec::new();
|
|
105
|
+
for v in file_id_lst_js["fileIdLst"].as_array().unwrap() {
|
|
106
|
+
url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
|
|
107
|
+
};
|
|
108
|
+
|
|
109
|
+
//downloading maf files parallelly and merge them into single maf file
|
|
110
|
+
let (tx, rx) = mpsc::channel();
|
|
111
|
+
let fetches = futures::stream::iter(
|
|
112
|
+
url.into_iter().map(|url|{
|
|
113
|
+
let txt = tx.clone();
|
|
114
|
+
async move {
|
|
115
|
+
match reqwest::get(&url).await{
|
|
116
|
+
Ok(resp) => {
|
|
117
|
+
let content = resp.bytes().await.unwrap();
|
|
118
|
+
let mut decoder = GzDecoder::new(&content[..]);
|
|
119
|
+
let mut decompressed_content = Vec::new();
|
|
120
|
+
if let Ok(_) = decoder.read_to_end(&mut decompressed_content) {
|
|
121
|
+
let text = String::from_utf8_lossy(&decompressed_content);
|
|
122
|
+
let (lst_chrom_pos,maf_bit) = gen_vec(text.to_string());
|
|
123
|
+
txt.send((lst_chrom_pos,maf_bit)).unwrap();
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
Err(_) => println!("ERROR downloading {}", url),
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
})
|
|
130
|
+
).buffer_unordered(20).collect::<Vec<()>>();
|
|
131
|
+
fetches.await;
|
|
132
|
+
drop(tx);
|
|
133
|
+
|
|
134
|
+
// write downloaded maf (GZIP format) into a Vector
|
|
135
|
+
// lst_chrom_pos: a vector including chromsome&position info for sorting maf
|
|
136
|
+
// idx_sorted: indices after sorting basedon chromsome&position
|
|
137
|
+
let mut maf_bit: Vec<Vec<u8>> = Vec::new();
|
|
138
|
+
let mut lst_chrom_pos: Vec<String> = Vec::new();
|
|
139
|
+
for (chr_pos_lst,maf_bit_lst) in rx {
|
|
140
|
+
maf_bit.extend_from_slice(&maf_bit_lst);
|
|
141
|
+
lst_chrom_pos.extend_from_slice(&chr_pos_lst);
|
|
142
|
+
};
|
|
143
|
+
let idx_sorted = get_sorted_indices(&lst_chrom_pos);
|
|
144
|
+
|
|
145
|
+
// output
|
|
146
|
+
// maf_out_bit: A vector of GZIPPED maf
|
|
147
|
+
// compress_header: output header
|
|
148
|
+
let mut maf_out_bit: Vec<u8> = Vec::new();
|
|
149
|
+
let compress_header = gen_gzip_vec(MAF_COL.iter().map(|s| s.to_string()).collect());
|
|
150
|
+
maf_out_bit.extend(compress_header);
|
|
151
|
+
let compress_header_line_break = gen_gzip_vec(["\n".to_string()].to_vec());
|
|
152
|
+
maf_out_bit.extend(compress_header_line_break);
|
|
153
|
+
for i in idx_sorted.iter() {
|
|
154
|
+
maf_out_bit.extend(&maf_bit[*i]);
|
|
155
|
+
};
|
|
156
|
+
|
|
157
|
+
// standard output
|
|
158
|
+
println!("{:?}",maf_out_bit);
|
|
159
|
+
std::io::stdout().flush().expect("Failed to flush stdout");
|
|
160
|
+
Ok(())
|
|
161
|
+
}
|