@sjcrh/proteinpaint-rust 2.38.1 → 2.39.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +4 -4
- package/package.json +1 -1
- package/src/gdcmaf.rs +26 -28
- package/src/{gene_variance.rs → topGeneByExpressionVariance.rs} +50 -76
package/Cargo.toml
CHANGED
|
@@ -11,6 +11,7 @@ autobins = false
|
|
|
11
11
|
[dependencies]
|
|
12
12
|
kodama = "0.3"
|
|
13
13
|
rayon = "1.7.0"
|
|
14
|
+
bgzip = "0.3.1"
|
|
14
15
|
petgraph = "0.6.3"
|
|
15
16
|
ndarray = "0.15.6"
|
|
16
17
|
nalgebra = {version = "0.32.2", features = ["serde-serialize"]}
|
|
@@ -26,7 +27,7 @@ serde = {version = "^1.0.147", features = ["derive"]}
|
|
|
26
27
|
serde_json="^1.0.88"
|
|
27
28
|
num = "^0.4.1"
|
|
28
29
|
csv = "^1.2.2"
|
|
29
|
-
r_mathlib="^0.2.0"
|
|
30
|
+
r_mathlib="^0.2.0"
|
|
30
31
|
tokio = { version="1", features = ["full"] }
|
|
31
32
|
reqwest = "0.11"
|
|
32
33
|
flate2 = "1"
|
|
@@ -73,14 +74,13 @@ name="gdcmaf"
|
|
|
73
74
|
path="src/gdcmaf.rs"
|
|
74
75
|
|
|
75
76
|
[[bin]]
|
|
76
|
-
name="
|
|
77
|
-
path="src/
|
|
77
|
+
name="topGeneByExpressionVariance"
|
|
78
|
+
path="src/topGeneByExpressionVariance.rs"
|
|
78
79
|
|
|
79
80
|
#[[bin]]
|
|
80
81
|
#name="wilcoxon"
|
|
81
82
|
#path="src/wilcoxon.rs"
|
|
82
83
|
|
|
83
|
-
# Uncomment the lines below to use DE app for higher sample sizes
|
|
84
84
|
[[bin]]
|
|
85
85
|
name="DEanalysis"
|
|
86
86
|
path="src/DEanalysis.rs"
|
package/package.json
CHANGED
package/src/gdcmaf.rs
CHANGED
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
Output gzip compressed maf file to stdout.
|
|
8
8
|
|
|
9
9
|
Example of usage:
|
|
10
|
-
echo '{"host": "https://api.gdc.cancer.gov/data/", "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
|
|
10
|
+
echo '{"host": "https://api.gdc.cancer.gov/data/","columns": ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome", "Start_Position"], "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
13
|
use flate2::read::GzDecoder;
|
|
@@ -20,7 +20,7 @@ use std::io::{self,Read,Write};
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
fn select_maf_col(d:String) -> Vec<u8> {
|
|
23
|
+
fn select_maf_col(d:String,columns:&Vec<String>) -> Vec<u8> {
|
|
24
24
|
let mut maf_str: String = String::new();
|
|
25
25
|
let mut header_indices: Vec<usize> = Vec::new();
|
|
26
26
|
let lines = d.trim_end().split("\n");
|
|
@@ -29,9 +29,12 @@ fn select_maf_col(d:String) -> Vec<u8> {
|
|
|
29
29
|
continue
|
|
30
30
|
} else if line.contains("Hugo_Symbol") {
|
|
31
31
|
let header: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
32
|
-
for col in
|
|
33
|
-
let
|
|
34
|
-
|
|
32
|
+
for col in columns {
|
|
33
|
+
if let Some(index) = header.iter().position(|x| x == col) {
|
|
34
|
+
header_indices.push(index);
|
|
35
|
+
} else {
|
|
36
|
+
panic!("{} was not found!",col);
|
|
37
|
+
}
|
|
35
38
|
}
|
|
36
39
|
} else {
|
|
37
40
|
let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
@@ -47,27 +50,6 @@ fn select_maf_col(d:String) -> Vec<u8> {
|
|
|
47
50
|
}
|
|
48
51
|
|
|
49
52
|
|
|
50
|
-
// GDC MAF columns (96)
|
|
51
|
-
const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
|
|
52
|
-
"Start_Position", "End_Position", "Strand", "Variant_Classification",
|
|
53
|
-
"Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
|
|
54
|
-
"dbSNP_RS", "dbSNP_Val_Status", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode",
|
|
55
|
-
"Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2", "Tumor_Validation_Allele1",
|
|
56
|
-
"Tumor_Validation_Allele2", "Match_Norm_Validation_Allele1", "Match_Norm_Validation_Allele2",
|
|
57
|
-
"Verification_Status", "Validation_Status", "Mutation_Status", "Sequencing_Phase",
|
|
58
|
-
"Sequence_Source", "Validation_Method", "Score", "BAM_File", "Sequencer",
|
|
59
|
-
"Tumor_Sample_UUID", "Matched_Norm_Sample_UUID", "HGVSc", "HGVSp", "HGVSp_Short",
|
|
60
|
-
"Transcript_ID", "Exon_Number", "t_depth", "t_ref_count", "t_alt_count", "n_depth",
|
|
61
|
-
"n_ref_count", "n_alt_count", "all_effects", "Allele", "Gene", "Feature", "Feature_type",
|
|
62
|
-
"One_Consequence", "Consequence", "cDNA_position", "CDS_position", "Protein_position",
|
|
63
|
-
"Amino_acids", "Codons", "Existing_variation", "DISTANCE", "TRANSCRIPT_STRAND", "SYMBOL",
|
|
64
|
-
"SYMBOL_SOURCE", "HGNC_ID", "BIOTYPE", "CANONICAL", "CCDS", "ENSP", "SWISSPROT", "TREMBL",
|
|
65
|
-
"UNIPARC", "RefSeq", "SIFT", "PolyPhen", "EXON", "INTRON", "DOMAINS", "CLIN_SIG", "SOMATIC",
|
|
66
|
-
"PUBMED", "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "IMPACT", "PICK",
|
|
67
|
-
"VARIANT_CLASS", "TSL", "HGVS_OFFSET", "PHENO", "GENE_PHENO", "CONTEXT", "tumor_bam_uuid",
|
|
68
|
-
"normal_bam_uuid", "case_id", "GDC_FILTER", "COSMIC"];
|
|
69
|
-
|
|
70
|
-
|
|
71
53
|
#[tokio::main]
|
|
72
54
|
async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
73
55
|
// Accept the piped input JSON from Node.js and assign it to a variable
|
|
@@ -83,6 +65,22 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
|
83
65
|
url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
|
|
84
66
|
};
|
|
85
67
|
|
|
68
|
+
// read columns as array from input json and convert data type from Vec<Value> to Vec<String>
|
|
69
|
+
let maf_col:Vec<String>;
|
|
70
|
+
if let Some(maf_col_value) = file_id_lst_js.get("columns") {
|
|
71
|
+
//convert Vec<Value> to Vec<String>
|
|
72
|
+
if let Some(maf_col_array) = maf_col_value.as_array() {
|
|
73
|
+
maf_col = maf_col_array
|
|
74
|
+
.iter()
|
|
75
|
+
.map(|v| v.to_string().replace("\"",""))
|
|
76
|
+
.collect::<Vec<String>>();
|
|
77
|
+
} else {
|
|
78
|
+
panic!("Columns is not an array");
|
|
79
|
+
}
|
|
80
|
+
} else {
|
|
81
|
+
panic!("Columns was not selected");
|
|
82
|
+
};
|
|
83
|
+
|
|
86
84
|
// Download the MAF files in parallel and merge them into a single MAF file
|
|
87
85
|
let download_futures = futures::stream::iter(
|
|
88
86
|
url.into_iter().map(|url|{
|
|
@@ -110,13 +108,13 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
|
110
108
|
|
|
111
109
|
// output
|
|
112
110
|
let mut encoder = GzEncoder::new(io::stdout(), Compression::default());
|
|
113
|
-
let _ = encoder.write_all(&
|
|
111
|
+
let _ = encoder.write_all(&maf_col.join("\t").as_bytes().to_vec()).expect("Failed to write header");
|
|
114
112
|
let _ = encoder.write_all(b"\n").expect("Failed to write newline");
|
|
115
113
|
download_futures.buffer_unordered(20).for_each(|item| {
|
|
116
114
|
if item.starts_with("Failed") {
|
|
117
115
|
eprintln!("{}",item);
|
|
118
116
|
} else {
|
|
119
|
-
let maf_bit = select_maf_col(item);
|
|
117
|
+
let maf_bit = select_maf_col(item,&maf_col);
|
|
120
118
|
let _ = encoder.write_all(&maf_bit).expect("Failed to write file");
|
|
121
119
|
};
|
|
122
120
|
async {}
|
|
@@ -11,6 +11,7 @@ Various JSON parameters:
|
|
|
11
11
|
Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"num_genes":100, "param":"var"}' && time echo $json | target/release/topGeneByExpressionVariance
|
|
12
12
|
*/
|
|
13
13
|
#![allow(non_snake_case)]
|
|
14
|
+
use bgzip::BGZFReader;
|
|
14
15
|
use json;
|
|
15
16
|
use nalgebra::base::dimension::Dyn;
|
|
16
17
|
use nalgebra::base::Matrix;
|
|
@@ -23,8 +24,9 @@ use statrs::statistics::Median;
|
|
|
23
24
|
use statrs::statistics::OrderStatistics;
|
|
24
25
|
use statrs::statistics::Statistics;
|
|
25
26
|
use std::cmp::Ordering;
|
|
27
|
+
use std::fs;
|
|
26
28
|
use std::io;
|
|
27
|
-
use std::
|
|
29
|
+
use std::io::Read;
|
|
28
30
|
use std::str::FromStr;
|
|
29
31
|
use std::time::Instant;
|
|
30
32
|
|
|
@@ -34,100 +36,78 @@ fn input_data(
|
|
|
34
36
|
) -> (
|
|
35
37
|
Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
36
38
|
Vec<String>,
|
|
37
|
-
Vec<String>,
|
|
38
39
|
) {
|
|
39
40
|
// Open the BGZF-compressed input file and iterate over each line.
|
|
40
|
-
let
|
|
41
|
-
let mut rdr = csv::Reader::from_path(path).unwrap();
|
|
41
|
+
let mut reader = BGZFReader::new(fs::File::open(filename).unwrap()).unwrap();
|
|
42
42
|
let mut num_lines: usize = 0;
|
|
43
|
-
let mut
|
|
44
|
-
let mut gene_names: Vec<String> = Vec::with_capacity(65000);
|
|
45
|
-
let mut gene_symbols: Vec<String> = Vec::with_capacity(65000);
|
|
46
|
-
let mut num_columns: usize = 0;
|
|
43
|
+
let mut gene_symbols: Vec<String> = Vec::with_capacity(500);
|
|
47
44
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
let mut headers: Vec<&str> = Vec::with_capacity(1500);
|
|
51
|
-
for field in header_line.iter() {
|
|
52
|
-
headers = field.split('\t').collect::<Vec<&str>>();
|
|
53
|
-
}
|
|
54
|
-
//println!("headers:{:?}", headers);
|
|
55
|
-
let mut sample_indexes_original: Vec<usize> = Vec::with_capacity(sample_list.len());
|
|
56
|
-
let gene_name_index = headers.iter().position(|r| r == &"geneID");
|
|
57
|
-
let gene_symbol_index = headers.iter().position(|r| r == &"geneSymbol");
|
|
58
|
-
//let mut case_samples_not_found: Vec<&str> = Vec::with_capacity(sample_list.len());
|
|
59
|
-
//let mut control_samples_not_found: Vec<&str> = Vec::with_capacity(control_list.len());
|
|
45
|
+
let mut buffer = String::new();
|
|
46
|
+
reader.read_to_string(&mut buffer).unwrap();
|
|
60
47
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
48
|
+
let lines = buffer.split("\n");
|
|
49
|
+
let mut first = true;
|
|
50
|
+
let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
|
|
51
|
+
let mut column_numbers: Vec<usize> = Vec::with_capacity(300);
|
|
52
|
+
for line in lines {
|
|
53
|
+
if first == true {
|
|
54
|
+
first = false;
|
|
55
|
+
let columns: Vec<&str> = line.split("\t").collect();
|
|
56
|
+
// Finding column numbers corresponding to each sample given in the input list
|
|
57
|
+
for item in sample_list {
|
|
58
|
+
if let Some(index) = columns.iter().position(|num| num == item) {
|
|
59
|
+
column_numbers.push(index)
|
|
60
|
+
} else {
|
|
61
|
+
panic!("Sample {} not found:", item)
|
|
62
|
+
}
|
|
69
63
|
}
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
if num_lines == 0 {
|
|
94
|
-
samples_indexes.push(num_columns);
|
|
95
|
-
num_columns += 1;
|
|
64
|
+
} else {
|
|
65
|
+
let line2: Vec<&str> = line.split("\t").collect();
|
|
66
|
+
if line2.len() == 1 {
|
|
67
|
+
break; // end of file
|
|
68
|
+
} else {
|
|
69
|
+
num_lines += 1;
|
|
70
|
+
//println!("line2:{:?}", line2);
|
|
71
|
+
gene_symbols.push(line2[3].to_string());
|
|
72
|
+
for i in &column_numbers {
|
|
73
|
+
let field = line2[*i];
|
|
74
|
+
let num = FromStr::from_str(field);
|
|
75
|
+
match num {
|
|
76
|
+
Ok(n) => {
|
|
77
|
+
//println!("n:{}", n);
|
|
78
|
+
input_vector.push(n);
|
|
79
|
+
}
|
|
80
|
+
Err(_n) => {
|
|
81
|
+
panic!(
|
|
82
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
83
|
+
field,
|
|
84
|
+
num_lines + 1,
|
|
85
|
+
i + 1
|
|
86
|
+
);
|
|
96
87
|
}
|
|
97
|
-
}
|
|
98
|
-
Err(_n) => {
|
|
99
|
-
panic!(
|
|
100
|
-
"Number {} in line {} and column {} is not a decimal number",
|
|
101
|
-
field,
|
|
102
|
-
num_lines + 1,
|
|
103
|
-
index + 1
|
|
104
|
-
);
|
|
105
88
|
}
|
|
106
89
|
}
|
|
107
90
|
}
|
|
108
|
-
index += 1;
|
|
109
91
|
}
|
|
110
|
-
num_lines += 1;
|
|
111
92
|
}
|
|
93
|
+
|
|
112
94
|
//println!("case_indexes:{:?}", case_indexes);
|
|
113
95
|
//println!("control_indexes:{:?}", control_indexes);
|
|
114
96
|
|
|
115
|
-
let dm = DMatrix::from_row_slice(num_lines,
|
|
97
|
+
let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
|
|
116
98
|
//println!("dm:{:?}", dm);
|
|
117
|
-
(dm,
|
|
99
|
+
(dm, gene_symbols)
|
|
118
100
|
}
|
|
119
101
|
|
|
120
102
|
#[allow(dead_code)]
|
|
121
103
|
#[derive(Debug, Serialize, Deserialize)]
|
|
122
104
|
struct GeneInfo {
|
|
123
|
-
gene_name: String,
|
|
124
105
|
gene_symbol: String,
|
|
125
106
|
param: f64,
|
|
126
107
|
}
|
|
127
108
|
|
|
128
109
|
fn calculate_variance(
|
|
129
110
|
input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
130
|
-
gene_names: Vec<String>,
|
|
131
111
|
gene_symbols: Vec<String>,
|
|
132
112
|
mut min_sample_size: f64,
|
|
133
113
|
filter_extreme_values: bool,
|
|
@@ -196,13 +176,11 @@ fn calculate_variance(
|
|
|
196
176
|
{
|
|
197
177
|
gene_infos.push(GeneInfo {
|
|
198
178
|
param: gene_counts.variance(),
|
|
199
|
-
gene_name: gene_names[row].clone(),
|
|
200
179
|
gene_symbol: gene_symbols[row].clone(),
|
|
201
180
|
});
|
|
202
181
|
} else if filter_extreme_values == false {
|
|
203
182
|
gene_infos.push(GeneInfo {
|
|
204
183
|
param: gene_counts.variance(),
|
|
205
|
-
gene_name: gene_names[row].clone(),
|
|
206
184
|
gene_symbol: gene_symbols[row].clone(),
|
|
207
185
|
});
|
|
208
186
|
}
|
|
@@ -216,13 +194,11 @@ fn calculate_variance(
|
|
|
216
194
|
{
|
|
217
195
|
gene_infos.push(GeneInfo {
|
|
218
196
|
param: gene_counts_data.interquartile_range(),
|
|
219
|
-
gene_name: gene_names[row].clone(),
|
|
220
197
|
gene_symbol: gene_symbols[row].clone(),
|
|
221
198
|
});
|
|
222
199
|
} else if filter_extreme_values == false {
|
|
223
200
|
gene_infos.push(GeneInfo {
|
|
224
201
|
param: gene_counts_data.interquartile_range(),
|
|
225
|
-
gene_name: gene_names[row].clone(),
|
|
226
202
|
gene_symbol: gene_symbols[row].clone(),
|
|
227
203
|
});
|
|
228
204
|
}
|
|
@@ -321,11 +297,9 @@ fn main() {
|
|
|
321
297
|
}
|
|
322
298
|
|
|
323
299
|
let samples_list: Vec<&str> = samples_string.split(",").collect();
|
|
324
|
-
let (input_matrix,
|
|
325
|
-
input_data(&file_name, &samples_list);
|
|
300
|
+
let (input_matrix, gene_symbols) = input_data(&file_name, &samples_list);
|
|
326
301
|
let gene_infos = calculate_variance(
|
|
327
302
|
input_matrix,
|
|
328
|
-
gene_names,
|
|
329
303
|
gene_symbols,
|
|
330
304
|
samples_list.len() as f64,
|
|
331
305
|
filter_extreme_values,
|
|
@@ -343,7 +317,7 @@ fn main() {
|
|
|
343
317
|
}
|
|
344
318
|
}
|
|
345
319
|
output_string += &"]".to_string();
|
|
346
|
-
println!("{}", output_string);
|
|
320
|
+
println!("output_json:{}", output_string);
|
|
347
321
|
println!("Time for calculating variances:{:?}", now.elapsed());
|
|
348
322
|
}
|
|
349
323
|
Err(error) => println!("Incorrect json: {}", error),
|