@sjcrh/proteinpaint-rust 2.38.1 → 2.40.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -11,6 +11,7 @@ autobins = false
11
11
  [dependencies]
12
12
  kodama = "0.3"
13
13
  rayon = "1.7.0"
14
+ bgzip = "0.3.1"
14
15
  petgraph = "0.6.3"
15
16
  ndarray = "0.15.6"
16
17
  nalgebra = {version = "0.32.2", features = ["serde-serialize"]}
@@ -26,7 +27,7 @@ serde = {version = "^1.0.147", features = ["derive"]}
26
27
  serde_json="^1.0.88"
27
28
  num = "^0.4.1"
28
29
  csv = "^1.2.2"
29
- r_mathlib="^0.2.0" # Uncomment this line to activate DE expression app for high sample sizes
30
+ r_mathlib="^0.2.0"
30
31
  tokio = { version="1", features = ["full"] }
31
32
  reqwest = "0.11"
32
33
  flate2 = "1"
@@ -73,14 +74,13 @@ name="gdcmaf"
73
74
  path="src/gdcmaf.rs"
74
75
 
75
76
  [[bin]]
76
- name="gene_variance"
77
- path="src/gene_variance.rs"
77
+ name="topGeneByExpressionVariance"
78
+ path="src/topGeneByExpressionVariance.rs"
78
79
 
79
80
  #[[bin]]
80
81
  #name="wilcoxon"
81
82
  #path="src/wilcoxon.rs"
82
83
 
83
- # Uncomment the lines below to use DE app for higher sample sizes
84
84
  [[bin]]
85
85
  name="DEanalysis"
86
86
  path="src/DEanalysis.rs"
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.38.1",
2
+ "version": "2.40.6",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
package/src/gdcmaf.rs CHANGED
@@ -7,7 +7,7 @@
7
7
  Output gzip compressed maf file to stdout.
8
8
 
9
9
  Example of usage:
10
- echo '{"host": "https://api.gdc.cancer.gov/data/", "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
10
+ echo '{"host": "https://api.gdc.cancer.gov/data/","columns": ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome", "Start_Position"], "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
11
11
  */
12
12
 
13
13
  use flate2::read::GzDecoder;
@@ -20,7 +20,7 @@ use std::io::{self,Read,Write};
20
20
 
21
21
 
22
22
 
23
- fn select_maf_col(d:String) -> Vec<u8> {
23
+ fn select_maf_col(d:String,columns:&Vec<String>) -> Vec<u8> {
24
24
  let mut maf_str: String = String::new();
25
25
  let mut header_indices: Vec<usize> = Vec::new();
26
26
  let lines = d.trim_end().split("\n");
@@ -29,9 +29,12 @@ fn select_maf_col(d:String) -> Vec<u8> {
29
29
  continue
30
30
  } else if line.contains("Hugo_Symbol") {
31
31
  let header: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
32
- for col in MAF_COL {
33
- let col_index: usize = header.iter().position(|x| x == col).unwrap();
34
- header_indices.push(col_index);
32
+ for col in columns {
33
+ if let Some(index) = header.iter().position(|x| x == col) {
34
+ header_indices.push(index);
35
+ } else {
36
+ panic!("{} was not found!",col);
37
+ }
35
38
  }
36
39
  } else {
37
40
  let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
@@ -47,27 +50,6 @@ fn select_maf_col(d:String) -> Vec<u8> {
47
50
  }
48
51
 
49
52
 
50
- // GDC MAF columns (96)
51
- const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
52
- "Start_Position", "End_Position", "Strand", "Variant_Classification",
53
- "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
54
- "dbSNP_RS", "dbSNP_Val_Status", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode",
55
- "Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2", "Tumor_Validation_Allele1",
56
- "Tumor_Validation_Allele2", "Match_Norm_Validation_Allele1", "Match_Norm_Validation_Allele2",
57
- "Verification_Status", "Validation_Status", "Mutation_Status", "Sequencing_Phase",
58
- "Sequence_Source", "Validation_Method", "Score", "BAM_File", "Sequencer",
59
- "Tumor_Sample_UUID", "Matched_Norm_Sample_UUID", "HGVSc", "HGVSp", "HGVSp_Short",
60
- "Transcript_ID", "Exon_Number", "t_depth", "t_ref_count", "t_alt_count", "n_depth",
61
- "n_ref_count", "n_alt_count", "all_effects", "Allele", "Gene", "Feature", "Feature_type",
62
- "One_Consequence", "Consequence", "cDNA_position", "CDS_position", "Protein_position",
63
- "Amino_acids", "Codons", "Existing_variation", "DISTANCE", "TRANSCRIPT_STRAND", "SYMBOL",
64
- "SYMBOL_SOURCE", "HGNC_ID", "BIOTYPE", "CANONICAL", "CCDS", "ENSP", "SWISSPROT", "TREMBL",
65
- "UNIPARC", "RefSeq", "SIFT", "PolyPhen", "EXON", "INTRON", "DOMAINS", "CLIN_SIG", "SOMATIC",
66
- "PUBMED", "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "IMPACT", "PICK",
67
- "VARIANT_CLASS", "TSL", "HGVS_OFFSET", "PHENO", "GENE_PHENO", "CONTEXT", "tumor_bam_uuid",
68
- "normal_bam_uuid", "case_id", "GDC_FILTER", "COSMIC"];
69
-
70
-
71
53
  #[tokio::main]
72
54
  async fn main() -> Result<(),Box<dyn std::error::Error>> {
73
55
  // Accepting the piped input json from nodejs and assigning it to a variable
@@ -83,6 +65,22 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
83
65
  url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
84
66
  };
85
67
 
68
+ // read columns as array from input json and convert data type from Vec<Value> to Vec<String>
69
+ let maf_col:Vec<String>;
70
+ if let Some(maf_col_value) = file_id_lst_js.get("columns") {
71
+ //convert Vec<Value> to Vec<String>
72
+ if let Some(maf_col_array) = maf_col_value.as_array() {
73
+ maf_col = maf_col_array
74
+ .iter()
75
+ .map(|v| v.to_string().replace("\"",""))
76
+ .collect::<Vec<String>>();
77
+ } else {
78
+ panic!("Columns is not an array");
79
+ }
80
+ } else {
81
+ panic!("Columns was not selected");
82
+ };
83
+
86
84
  //download maf files in parallel and merge them into a single maf file
87
85
  let download_futures = futures::stream::iter(
88
86
  url.into_iter().map(|url|{
@@ -110,13 +108,13 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
110
108
 
111
109
  // output
112
110
  let mut encoder = GzEncoder::new(io::stdout(), Compression::default());
113
- let _ = encoder.write_all(&MAF_COL.join("\t").as_bytes().to_vec()).expect("Failed to write header");
111
+ let _ = encoder.write_all(&maf_col.join("\t").as_bytes().to_vec()).expect("Failed to write header");
114
112
  let _ = encoder.write_all(b"\n").expect("Failed to write newline");
115
113
  download_futures.buffer_unordered(20).for_each(|item| {
116
114
  if item.starts_with("Failed") {
117
115
  eprintln!("{}",item);
118
116
  } else {
119
- let maf_bit = select_maf_col(item);
117
+ let maf_bit = select_maf_col(item,&maf_col);
120
118
  let _ = encoder.write_all(&maf_bit).expect("Failed to write file");
121
119
  };
122
120
  async {}
package/src/indel.rs CHANGED
@@ -1,7 +1,7 @@
1
1
  // Syntax: cd .. && cargo build --release
2
2
 
3
3
  // Test case below:
4
- //Debug syntax: cd .. && cargo build --release && time cat ~/sjpp/test.txt | ~/proteinpaint/rust/target/release/indel
4
+ //Debug syntax: cd .. && cargo build --release && time cat ~/sjpp/test.txt | ~/sjpp/proteinpaint/rust/target/release/indel
5
5
 
6
6
  // Strictness:
7
7
  // 0: No postprocessing, pure indel typing results
@@ -1356,7 +1356,12 @@ fn main() {
1356
1356
  //let mut output_string = "[".to_string();
1357
1357
  //output_string += &all_alleles.to_string();
1358
1358
  output_string.pop();
1359
- output_string += &"]".to_string();
1359
+ if output_string.len() == 0 {
1360
+ // Pass empty JSON "[]" when no reads are passed back to nodejs
1361
+ output_string = "[]".to_string();
1362
+ } else {
1363
+ output_string += &"]".to_string();
1364
+ }
1360
1365
  println!("Final_output:{:?}", output_string);
1361
1366
  }
1362
1367
  Err(error) => println!("Incorrect json: {}", error),
@@ -11,6 +11,7 @@ Various JSON parameters:
11
11
  Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"num_genes":100, "param":"var"}' && time echo $json | target/release/gene_variance
12
12
  */
13
13
  #![allow(non_snake_case)]
14
+ use bgzip::BGZFReader;
14
15
  use json;
15
16
  use nalgebra::base::dimension::Dyn;
16
17
  use nalgebra::base::Matrix;
@@ -23,8 +24,9 @@ use statrs::statistics::Median;
23
24
  use statrs::statistics::OrderStatistics;
24
25
  use statrs::statistics::Statistics;
25
26
  use std::cmp::Ordering;
27
+ use std::fs;
26
28
  use std::io;
27
- use std::path::Path;
29
+ use std::io::Read;
28
30
  use std::str::FromStr;
29
31
  use std::time::Instant;
30
32
 
@@ -34,100 +36,78 @@ fn input_data(
34
36
  ) -> (
35
37
  Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
36
38
  Vec<String>,
37
- Vec<String>,
38
39
  ) {
39
40
  // Build the reader for the input file and iterate over each record.
40
- let path = Path::new(filename);
41
- let mut rdr = csv::Reader::from_path(path).unwrap();
41
+ let mut reader = BGZFReader::new(fs::File::open(filename).unwrap()).unwrap();
42
42
  let mut num_lines: usize = 0;
43
- let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
44
- let mut gene_names: Vec<String> = Vec::with_capacity(65000);
45
- let mut gene_symbols: Vec<String> = Vec::with_capacity(65000);
46
- let mut num_columns: usize = 0;
43
+ let mut gene_symbols: Vec<String> = Vec::with_capacity(500);
47
44
 
48
- // Check headers for samples
49
- let header_line = rdr.headers().unwrap();
50
- let mut headers: Vec<&str> = Vec::with_capacity(1500);
51
- for field in header_line.iter() {
52
- headers = field.split('\t').collect::<Vec<&str>>();
53
- }
54
- //println!("headers:{:?}", headers);
55
- let mut sample_indexes_original: Vec<usize> = Vec::with_capacity(sample_list.len());
56
- let gene_name_index = headers.iter().position(|r| r == &"geneID");
57
- let gene_symbol_index = headers.iter().position(|r| r == &"geneSymbol");
58
- //let mut case_samples_not_found: Vec<&str> = Vec::with_capacity(sample_list.len());
59
- //let mut control_samples_not_found: Vec<&str> = Vec::with_capacity(control_list.len());
45
+ let mut buffer = String::new();
46
+ reader.read_to_string(&mut buffer).unwrap();
60
47
 
61
- for item in sample_list {
62
- //println!("item:{}", item);
63
- let index = headers.iter().position(|r| r == item);
64
- match index {
65
- Some(n) => sample_indexes_original.push(n),
66
- None => {
67
- //panic!("Case sample not found:{}", item);
68
- //case_samples_not_found.push(item);
48
+ let lines = buffer.split("\n");
49
+ let mut first = true;
50
+ let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
51
+ let mut column_numbers: Vec<usize> = Vec::with_capacity(300);
52
+ for line in lines {
53
+ if first == true {
54
+ first = false;
55
+ let columns: Vec<&str> = line.split("\t").collect();
56
+ // Finding column numbers corresponding to each sample given in the input list
57
+ for item in sample_list {
58
+ if let Some(index) = columns.iter().position(|num| num == item) {
59
+ column_numbers.push(index)
60
+ } else {
61
+ panic!("Sample {} not found:", item)
62
+ }
69
63
  }
70
- }
71
- }
72
-
73
- //println!("case_indexes_original:{:?}", case_indexes_original);
74
-
75
- let mut samples_indexes: Vec<usize> = Vec::with_capacity(sample_list.len());
76
- for result in rdr.records() {
77
- // The iterator yields Result<StringRecord, Error>, so we check the
78
- // error here.
79
- let record = result.unwrap();
80
- //println!("record:{:?}", record);
81
- let mut index = 0;
82
- for field in record[0].split('\t').collect::<Vec<&str>>() {
83
- if index == gene_name_index.unwrap() {
84
- gene_names.push(field.to_string());
85
- } else if index == gene_symbol_index.unwrap() {
86
- gene_symbols.push(field.to_string());
87
- } else if sample_indexes_original.contains(&index) {
88
- let num = FromStr::from_str(field);
89
- match num {
90
- Ok(n) => {
91
- //println!("n:{}", n);
92
- input_vector.push(n);
93
- if num_lines == 0 {
94
- samples_indexes.push(num_columns);
95
- num_columns += 1;
64
+ } else {
65
+ let line2: Vec<&str> = line.split("\t").collect();
66
+ if line2.len() == 1 {
67
+ break; // end of file
68
+ } else {
69
+ num_lines += 1;
70
+ //println!("line2:{:?}", line2);
71
+ gene_symbols.push(line2[3].to_string());
72
+ for i in &column_numbers {
73
+ let field = line2[*i];
74
+ let num = FromStr::from_str(field);
75
+ match num {
76
+ Ok(n) => {
77
+ //println!("n:{}", n);
78
+ input_vector.push(n);
79
+ }
80
+ Err(_n) => {
81
+ panic!(
82
+ "Number {} in line {} and column {} is not a decimal number",
83
+ field,
84
+ num_lines + 1,
85
+ i + 1
86
+ );
96
87
  }
97
- }
98
- Err(_n) => {
99
- panic!(
100
- "Number {} in line {} and column {} is not a decimal number",
101
- field,
102
- num_lines + 1,
103
- index + 1
104
- );
105
88
  }
106
89
  }
107
90
  }
108
- index += 1;
109
91
  }
110
- num_lines += 1;
111
92
  }
93
+
112
94
  //println!("case_indexes:{:?}", case_indexes);
113
95
  //println!("control_indexes:{:?}", control_indexes);
114
96
 
115
- let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
97
+ let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
116
98
  //println!("dm:{:?}", dm);
117
- (dm, gene_names, gene_symbols)
99
+ (dm, gene_symbols)
118
100
  }
119
101
 
120
102
  #[allow(dead_code)]
121
103
  #[derive(Debug, Serialize, Deserialize)]
122
104
  struct GeneInfo {
123
- gene_name: String,
124
105
  gene_symbol: String,
125
106
  param: f64,
126
107
  }
127
108
 
128
109
  fn calculate_variance(
129
110
  input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
130
- gene_names: Vec<String>,
131
111
  gene_symbols: Vec<String>,
132
112
  mut min_sample_size: f64,
133
113
  filter_extreme_values: bool,
@@ -196,13 +176,11 @@ fn calculate_variance(
196
176
  {
197
177
  gene_infos.push(GeneInfo {
198
178
  param: gene_counts.variance(),
199
- gene_name: gene_names[row].clone(),
200
179
  gene_symbol: gene_symbols[row].clone(),
201
180
  });
202
181
  } else if filter_extreme_values == false {
203
182
  gene_infos.push(GeneInfo {
204
183
  param: gene_counts.variance(),
205
- gene_name: gene_names[row].clone(),
206
184
  gene_symbol: gene_symbols[row].clone(),
207
185
  });
208
186
  }
@@ -216,13 +194,11 @@ fn calculate_variance(
216
194
  {
217
195
  gene_infos.push(GeneInfo {
218
196
  param: gene_counts_data.interquartile_range(),
219
- gene_name: gene_names[row].clone(),
220
197
  gene_symbol: gene_symbols[row].clone(),
221
198
  });
222
199
  } else if filter_extreme_values == false {
223
200
  gene_infos.push(GeneInfo {
224
201
  param: gene_counts_data.interquartile_range(),
225
- gene_name: gene_names[row].clone(),
226
202
  gene_symbol: gene_symbols[row].clone(),
227
203
  });
228
204
  }
@@ -321,11 +297,9 @@ fn main() {
321
297
  }
322
298
 
323
299
  let samples_list: Vec<&str> = samples_string.split(",").collect();
324
- let (input_matrix, gene_names, gene_symbols) =
325
- input_data(&file_name, &samples_list);
300
+ let (input_matrix, gene_symbols) = input_data(&file_name, &samples_list);
326
301
  let gene_infos = calculate_variance(
327
302
  input_matrix,
328
- gene_names,
329
303
  gene_symbols,
330
304
  samples_list.len() as f64,
331
305
  filter_extreme_values,
@@ -343,7 +317,7 @@ fn main() {
343
317
  }
344
318
  }
345
319
  output_string += &"]".to_string();
346
- println!("{}", output_string);
320
+ println!("output_json:{}", output_string);
347
321
  println!("Time for calculating variances:{:?}", now.elapsed());
348
322
  }
349
323
  Err(error) => println!("Incorrect json: {}", error),