@sjcrh/proteinpaint-rust 2.38.0 → 2.39.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -11,6 +11,7 @@ autobins = false
11
11
  [dependencies]
12
12
  kodama = "0.3"
13
13
  rayon = "1.7.0"
14
+ bgzip = "0.3.1"
14
15
  petgraph = "0.6.3"
15
16
  ndarray = "0.15.6"
16
17
  nalgebra = {version = "0.32.2", features = ["serde-serialize"]}
@@ -26,7 +27,7 @@ serde = {version = "^1.0.147", features = ["derive"]}
26
27
  serde_json="^1.0.88"
27
28
  num = "^0.4.1"
28
29
  csv = "^1.2.2"
29
- r_mathlib="^0.2.0" # Uncomment this line to activate DE expression app for high sample sizes
30
+ r_mathlib="^0.2.0"
30
31
  tokio = { version="1", features = ["full"] }
31
32
  reqwest = "0.11"
32
33
  flate2 = "1"
@@ -73,14 +74,13 @@ name="gdcmaf"
73
74
  path="src/gdcmaf.rs"
74
75
 
75
76
  [[bin]]
76
- name="gene_variance"
77
- path="src/gene_variance.rs"
77
+ name="topGeneByExpressionVariance"
78
+ path="src/topGeneByExpressionVariance.rs"
78
79
 
79
80
  #[[bin]]
80
81
  #name="wilcoxon"
81
82
  #path="src/wilcoxon.rs"
82
83
 
83
- # Uncomment the lines below to use DE app for higher sample sizes
84
84
  [[bin]]
85
85
  name="DEanalysis"
86
86
  path="src/DEanalysis.rs"
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.38.0",
2
+ "version": "2.39.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
package/src/gdcmaf.rs CHANGED
@@ -1,5 +1,5 @@
1
1
  /*
2
- This script download cohort maf files from GDC, combine them into a single file, and output the sorted file based on chromsome and Start_Position.
2
+ This script downloads cohort maf files from GDC and concatenates them into a single file that includes user-specified columns.
3
3
 
4
4
  Input JSON:
5
5
  host: GDC host
@@ -7,7 +7,7 @@
7
7
  Output gzip compressed maf file to stdout.
8
8
 
9
9
  Example of usage:
10
- echo '{"host": "https://api.gdc.cancer.gov/data/", "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
10
+ echo '{"host": "https://api.gdc.cancer.gov/data/","columns": ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome", "Start_Position"], "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
11
11
  */
12
12
 
13
13
  use flate2::read::GzDecoder;
@@ -17,81 +17,38 @@ use serde_json::Value;
17
17
  use std::path::Path;
18
18
  use futures::StreamExt;
19
19
  use std::io::{self,Read,Write};
20
- use std::sync::mpsc;
21
20
 
22
21
 
23
22
 
24
- fn gen_vec(d:String) -> (Vec<String>,Vec<Vec<u8>>) {
25
- let mut maf_bit: Vec<Vec<u8>> = Vec::new();
26
- let mut lst_chrom_pos: Vec<String> = Vec::new();
23
+ fn select_maf_col(d:String,columns:&Vec<String>) -> Vec<u8> {
24
+ let mut maf_str: String = String::new();
27
25
  let mut header_indices: Vec<usize> = Vec::new();
28
- let mut chrom_index: usize = 9999;
29
- let mut pos_index: usize = 9999;
30
26
  let lines = d.trim_end().split("\n");
31
27
  for line in lines {
32
28
  if line.starts_with("#") {
33
29
  continue
34
30
  } else if line.contains("Hugo_Symbol") {
35
31
  let header: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
36
- for col in MAF_COL {
37
- let col_index: usize = header.iter().position(|x| x == col).unwrap();
38
- header_indices.push(col_index);
39
- if col == "Chromosome" {
40
- chrom_index = col_index;
41
- } else if col == "Start_Position" {
42
- pos_index = col_index;
32
+ for col in columns {
33
+ if let Some(index) = header.iter().position(|x| x == col) {
34
+ header_indices.push(index);
35
+ } else {
36
+ panic!("{} was not found!",col);
43
37
  }
44
38
  }
45
39
  } else {
46
40
  let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
47
41
  let mut maf_out_lst: Vec<String> = Vec::new();
48
- let mut chrom = String::new();
49
- let mut pos = String::new();
50
- for (i,x) in header_indices.iter().enumerate() {
42
+ for x in header_indices.iter() {
51
43
  maf_out_lst.push(maf_cont_lst[*x].to_string());
52
- if chrom_index != 9999 && i == chrom_index {
53
- chrom = maf_cont_lst[*x].to_string();
54
- } else if pos_index != 9999 && i == pos_index {
55
- pos = maf_cont_lst[*x].to_string();
56
- }
57
44
  };
58
45
  maf_out_lst.push("\n".to_string());
59
- maf_bit.push(maf_out_lst.join("\t").as_bytes().to_vec());
60
- lst_chrom_pos.push(chrom+"\t"+&pos);
46
+ maf_str.push_str(maf_out_lst.join("\t").as_str());
61
47
  }
62
48
  };
63
- (lst_chrom_pos,maf_bit)
64
- }
65
-
66
- fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize>{
67
- let mut indices = (0..lst.len()).collect::<Vec<usize>>();
68
- indices.sort_by(|a,b| {
69
- lst[*a].split('\t').collect::<Vec<&str>>()[0].cmp(lst[*b].split('\t').collect::<Vec<&str>>()[0])
70
- .then(lst[*a].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap().cmp(&lst[*b].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap()))
71
- });
72
- indices
49
+ maf_str.as_bytes().to_vec()
73
50
  }
74
51
 
75
- // GDC MAF columns (96)
76
- const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
77
- "Start_Position", "End_Position", "Strand", "Variant_Classification",
78
- "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
79
- "dbSNP_RS", "dbSNP_Val_Status", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode",
80
- "Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2", "Tumor_Validation_Allele1",
81
- "Tumor_Validation_Allele2", "Match_Norm_Validation_Allele1", "Match_Norm_Validation_Allele2",
82
- "Verification_Status", "Validation_Status", "Mutation_Status", "Sequencing_Phase",
83
- "Sequence_Source", "Validation_Method", "Score", "BAM_File", "Sequencer",
84
- "Tumor_Sample_UUID", "Matched_Norm_Sample_UUID", "HGVSc", "HGVSp", "HGVSp_Short",
85
- "Transcript_ID", "Exon_Number", "t_depth", "t_ref_count", "t_alt_count", "n_depth",
86
- "n_ref_count", "n_alt_count", "all_effects", "Allele", "Gene", "Feature", "Feature_type",
87
- "One_Consequence", "Consequence", "cDNA_position", "CDS_position", "Protein_position",
88
- "Amino_acids", "Codons", "Existing_variation", "DISTANCE", "TRANSCRIPT_STRAND", "SYMBOL",
89
- "SYMBOL_SOURCE", "HGNC_ID", "BIOTYPE", "CANONICAL", "CCDS", "ENSP", "SWISSPROT", "TREMBL",
90
- "UNIPARC", "RefSeq", "SIFT", "PolyPhen", "EXON", "INTRON", "DOMAINS", "CLIN_SIG", "SOMATIC",
91
- "PUBMED", "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "IMPACT", "PICK",
92
- "VARIANT_CLASS", "TSL", "HGVS_OFFSET", "PHENO", "GENE_PHENO", "CONTEXT", "tumor_bam_uuid",
93
- "normal_bam_uuid", "case_id", "GDC_FILTER", "COSMIC"];
94
-
95
52
 
96
53
  #[tokio::main]
97
54
  async fn main() -> Result<(),Box<dyn std::error::Error>> {
@@ -108,47 +65,59 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
108
65
  url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
109
66
  };
110
67
 
68
+ // read columns as array from input json and convert data type from Vec<Value> to Vec<String>
69
+ let maf_col:Vec<String>;
70
+ if let Some(maf_col_value) = file_id_lst_js.get("columns") {
71
+ //convert Vec<Value> to Vec<String>
72
+ if let Some(maf_col_array) = maf_col_value.as_array() {
73
+ maf_col = maf_col_array
74
+ .iter()
75
+ .map(|v| v.to_string().replace("\"",""))
76
+ .collect::<Vec<String>>();
77
+ } else {
78
+ panic!("Columns is not an array");
79
+ }
80
+ } else {
81
+ panic!("Columns was not selected");
82
+ };
83
+
111
84
  //downloading maf files in parallel and merging them into a single maf file
112
- let (tx, rx) = mpsc::channel();
113
- let fetches = futures::stream::iter(
85
+ let download_futures = futures::stream::iter(
114
86
  url.into_iter().map(|url|{
115
- let txt = tx.clone();
116
87
  async move {
117
- if let Ok(resp) = reqwest::get(&url).await {
88
+ let result = reqwest::get(&url).await;
89
+ if let Ok(resp) = result {
118
90
  let content = resp.bytes().await.unwrap();
119
91
  let mut decoder = GzDecoder::new(&content[..]);
120
92
  let mut decompressed_content = Vec::new();
121
- if let Ok(_) = decoder.read_to_end(&mut decompressed_content) {
122
- let text = String::from_utf8_lossy(&decompressed_content);
123
- let (lst_chrom_pos,maf_bit) = gen_vec(text.to_string());
124
- txt.send((lst_chrom_pos,maf_bit)).unwrap();
93
+ let read_content = decoder.read_to_end(&mut decompressed_content);
94
+ if let Ok(_) = read_content {
95
+ let text = String::from_utf8_lossy(&decompressed_content).to_string();
96
+ text
97
+ } else {
98
+ let error_msg = "Failed to read content downloaded from: ".to_string() + &url;
99
+ error_msg
125
100
  }
101
+ } else {
102
+ let error_msg = "Failed to download: ".to_string() + &url;
103
+ error_msg
126
104
  }
127
105
  }
128
106
  })
129
- ).buffer_unordered(20).collect::<Vec<()>>();
130
- fetches.await;
131
- drop(tx);
132
-
133
- // write downloaded maf (GZIP format) into a Vector
134
- // lst_chrom_pos: a vector including chromsome&position info for sorting maf
135
- // idx_sorted: indices after sorting basedon chromsome&position
136
- let mut maf_bit: Vec<Vec<u8>> = Vec::new();
137
- let mut lst_chrom_pos: Vec<String> = Vec::new();
138
- for (chr_pos_lst,maf_bit_lst) in rx {
139
- maf_bit.extend_from_slice(&maf_bit_lst);
140
- lst_chrom_pos.extend_from_slice(&chr_pos_lst);
141
- };
142
- let idx_sorted = get_sorted_indices(&lst_chrom_pos);
107
+ );
143
108
 
144
109
  // output
145
- // maf_out_bit: A vector of GZIPPED maf
146
- // compress_header: output header
147
110
  let mut encoder = GzEncoder::new(io::stdout(), Compression::default());
148
- let _ = encoder.write_all(&MAF_COL.join("\t").as_bytes().to_vec()).expect("Failed to write header");
111
+ let _ = encoder.write_all(&maf_col.join("\t").as_bytes().to_vec()).expect("Failed to write header");
149
112
  let _ = encoder.write_all(b"\n").expect("Failed to write newline");
150
- for i in idx_sorted.iter() {
151
- let _ = encoder.write_all(&maf_bit[*i]).expect("Failed to write file");
152
- };
113
+ download_futures.buffer_unordered(20).for_each(|item| {
114
+ if item.starts_with("Failed") {
115
+ eprintln!("{}",item);
116
+ } else {
117
+ let maf_bit = select_maf_col(item,&maf_col);
118
+ let _ = encoder.write_all(&maf_bit).expect("Failed to write file");
119
+ };
120
+ async {}
121
+ }).await;
153
122
  Ok(())
154
123
  }
@@ -11,6 +11,7 @@ Various JSON parameters:
11
11
  Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"num_genes":100, "param":"var"}' && time echo $json | target/release/gene_variance
12
12
  */
13
13
  #![allow(non_snake_case)]
14
+ use bgzip::BGZFReader;
14
15
  use json;
15
16
  use nalgebra::base::dimension::Dyn;
16
17
  use nalgebra::base::Matrix;
@@ -23,8 +24,9 @@ use statrs::statistics::Median;
23
24
  use statrs::statistics::OrderStatistics;
24
25
  use statrs::statistics::Statistics;
25
26
  use std::cmp::Ordering;
27
+ use std::fs;
26
28
  use std::io;
27
- use std::path::Path;
29
+ use std::io::Read;
28
30
  use std::str::FromStr;
29
31
  use std::time::Instant;
30
32
 
@@ -34,100 +36,78 @@ fn input_data(
34
36
  ) -> (
35
37
  Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
36
38
  Vec<String>,
37
- Vec<String>,
38
39
  ) {
39
40
  // Build the CSV reader and iterate over each record.
40
- let path = Path::new(filename);
41
- let mut rdr = csv::Reader::from_path(path).unwrap();
41
+ let mut reader = BGZFReader::new(fs::File::open(filename).unwrap()).unwrap();
42
42
  let mut num_lines: usize = 0;
43
- let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
44
- let mut gene_names: Vec<String> = Vec::with_capacity(65000);
45
- let mut gene_symbols: Vec<String> = Vec::with_capacity(65000);
46
- let mut num_columns: usize = 0;
43
+ let mut gene_symbols: Vec<String> = Vec::with_capacity(500);
47
44
 
48
- // Check headers for samples
49
- let header_line = rdr.headers().unwrap();
50
- let mut headers: Vec<&str> = Vec::with_capacity(1500);
51
- for field in header_line.iter() {
52
- headers = field.split('\t').collect::<Vec<&str>>();
53
- }
54
- //println!("headers:{:?}", headers);
55
- let mut sample_indexes_original: Vec<usize> = Vec::with_capacity(sample_list.len());
56
- let gene_name_index = headers.iter().position(|r| r == &"geneID");
57
- let gene_symbol_index = headers.iter().position(|r| r == &"geneSymbol");
58
- //let mut case_samples_not_found: Vec<&str> = Vec::with_capacity(sample_list.len());
59
- //let mut control_samples_not_found: Vec<&str> = Vec::with_capacity(control_list.len());
45
+ let mut buffer = String::new();
46
+ reader.read_to_string(&mut buffer).unwrap();
60
47
 
61
- for item in sample_list {
62
- //println!("item:{}", item);
63
- let index = headers.iter().position(|r| r == item);
64
- match index {
65
- Some(n) => sample_indexes_original.push(n),
66
- None => {
67
- //panic!("Case sample not found:{}", item);
68
- //case_samples_not_found.push(item);
48
+ let lines = buffer.split("\n");
49
+ let mut first = true;
50
+ let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
51
+ let mut column_numbers: Vec<usize> = Vec::with_capacity(300);
52
+ for line in lines {
53
+ if first == true {
54
+ first = false;
55
+ let columns: Vec<&str> = line.split("\t").collect();
56
+ // Finding column numbers corresponding to each sample given in the input list
57
+ for item in sample_list {
58
+ if let Some(index) = columns.iter().position(|num| num == item) {
59
+ column_numbers.push(index)
60
+ } else {
61
+ panic!("Sample {} not found:", item)
62
+ }
69
63
  }
70
- }
71
- }
72
-
73
- //println!("case_indexes_original:{:?}", case_indexes_original);
74
-
75
- let mut samples_indexes: Vec<usize> = Vec::with_capacity(sample_list.len());
76
- for result in rdr.records() {
77
- // The iterator yields Result<StringRecord, Error>, so we check the
78
- // error here.
79
- let record = result.unwrap();
80
- //println!("record:{:?}", record);
81
- let mut index = 0;
82
- for field in record[0].split('\t').collect::<Vec<&str>>() {
83
- if index == gene_name_index.unwrap() {
84
- gene_names.push(field.to_string());
85
- } else if index == gene_symbol_index.unwrap() {
86
- gene_symbols.push(field.to_string());
87
- } else if sample_indexes_original.contains(&index) {
88
- let num = FromStr::from_str(field);
89
- match num {
90
- Ok(n) => {
91
- //println!("n:{}", n);
92
- input_vector.push(n);
93
- if num_lines == 0 {
94
- samples_indexes.push(num_columns);
95
- num_columns += 1;
64
+ } else {
65
+ let line2: Vec<&str> = line.split("\t").collect();
66
+ if line2.len() == 1 {
67
+ break; // end of file
68
+ } else {
69
+ num_lines += 1;
70
+ //println!("line2:{:?}", line2);
71
+ gene_symbols.push(line2[3].to_string());
72
+ for i in &column_numbers {
73
+ let field = line2[*i];
74
+ let num = FromStr::from_str(field);
75
+ match num {
76
+ Ok(n) => {
77
+ //println!("n:{}", n);
78
+ input_vector.push(n);
79
+ }
80
+ Err(_n) => {
81
+ panic!(
82
+ "Number {} in line {} and column {} is not a decimal number",
83
+ field,
84
+ num_lines + 1,
85
+ i + 1
86
+ );
96
87
  }
97
- }
98
- Err(_n) => {
99
- panic!(
100
- "Number {} in line {} and column {} is not a decimal number",
101
- field,
102
- num_lines + 1,
103
- index + 1
104
- );
105
88
  }
106
89
  }
107
90
  }
108
- index += 1;
109
91
  }
110
- num_lines += 1;
111
92
  }
93
+
112
94
  //println!("case_indexes:{:?}", case_indexes);
113
95
  //println!("control_indexes:{:?}", control_indexes);
114
96
 
115
- let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
97
+ let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
116
98
  //println!("dm:{:?}", dm);
117
- (dm, gene_names, gene_symbols)
99
+ (dm, gene_symbols)
118
100
  }
119
101
 
120
102
  #[allow(dead_code)]
121
103
  #[derive(Debug, Serialize, Deserialize)]
122
104
  struct GeneInfo {
123
- gene_name: String,
124
105
  gene_symbol: String,
125
106
  param: f64,
126
107
  }
127
108
 
128
109
  fn calculate_variance(
129
110
  input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
130
- gene_names: Vec<String>,
131
111
  gene_symbols: Vec<String>,
132
112
  mut min_sample_size: f64,
133
113
  filter_extreme_values: bool,
@@ -196,13 +176,11 @@ fn calculate_variance(
196
176
  {
197
177
  gene_infos.push(GeneInfo {
198
178
  param: gene_counts.variance(),
199
- gene_name: gene_names[row].clone(),
200
179
  gene_symbol: gene_symbols[row].clone(),
201
180
  });
202
181
  } else if filter_extreme_values == false {
203
182
  gene_infos.push(GeneInfo {
204
183
  param: gene_counts.variance(),
205
- gene_name: gene_names[row].clone(),
206
184
  gene_symbol: gene_symbols[row].clone(),
207
185
  });
208
186
  }
@@ -216,13 +194,11 @@ fn calculate_variance(
216
194
  {
217
195
  gene_infos.push(GeneInfo {
218
196
  param: gene_counts_data.interquartile_range(),
219
- gene_name: gene_names[row].clone(),
220
197
  gene_symbol: gene_symbols[row].clone(),
221
198
  });
222
199
  } else if filter_extreme_values == false {
223
200
  gene_infos.push(GeneInfo {
224
201
  param: gene_counts_data.interquartile_range(),
225
- gene_name: gene_names[row].clone(),
226
202
  gene_symbol: gene_symbols[row].clone(),
227
203
  });
228
204
  }
@@ -321,11 +297,9 @@ fn main() {
321
297
  }
322
298
 
323
299
  let samples_list: Vec<&str> = samples_string.split(",").collect();
324
- let (input_matrix, gene_names, gene_symbols) =
325
- input_data(&file_name, &samples_list);
300
+ let (input_matrix, gene_symbols) = input_data(&file_name, &samples_list);
326
301
  let gene_infos = calculate_variance(
327
302
  input_matrix,
328
- gene_names,
329
303
  gene_symbols,
330
304
  samples_list.len() as f64,
331
305
  filter_extreme_values,
@@ -343,7 +317,7 @@ fn main() {
343
317
  }
344
318
  }
345
319
  output_string += &"]".to_string();
346
- println!("{}", output_string);
320
+ println!("output_json:{}", output_string);
347
321
  println!("Time for calculating variances:{:?}", now.elapsed());
348
322
  }
349
323
  Err(error) => println!("Incorrect json: {}", error),