@sjcrh/proteinpaint-rust 2.59.0 → 2.61.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -86,3 +86,7 @@ path="src/DEanalysis.rs"
86
86
  [[bin]]
87
87
  name="genesetORA"
88
88
  path="src/genesetORA.rs"
89
+
90
+ [[bin]]
91
+ name="computeTopTerms"
92
+ path="src/computeTopTerms.rs"
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.59.0",
2
+ "version": "2.61.1",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
@@ -38,5 +38,5 @@
38
38
  "devDependencies": {
39
39
  "tape": "^5.2.2"
40
40
  },
41
- "pp_release_tag": "v2.59.0"
41
+ "pp_release_tag": "v2.61.1"
42
42
  }
package/src/DEanalysis.rs CHANGED
@@ -1,4 +1,4 @@
1
- // cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
1
+ // cd .. && cargo build --release && json='{"min_count":10,"min_total_count":15,"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
2
2
  // cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
3
3
  #![allow(non_snake_case)]
4
4
  use json;
@@ -368,6 +368,22 @@ fn main() {
368
368
  match input_json {
369
369
  Ok(json_string) => {
370
370
  let now = Instant::now();
371
+ let min_count_option = json_string["min_count"].as_f64().to_owned();
372
+ let min_total_count_option = json_string["min_total_count"].as_f64().to_owned();
373
+ let min_count;
374
+ match min_count_option {
375
+ Some(x) => min_count = x,
376
+ None => {
377
+ panic!("min_count is missing a value")
378
+ }
379
+ }
380
+ let min_total_count;
381
+ match min_total_count_option {
382
+ Some(x) => min_total_count = x,
383
+ None => {
384
+ panic!("min_total_count is missing a value")
385
+ }
386
+ }
371
387
  let case_string = &json_string["case"].to_owned().as_str().unwrap().to_string();
372
388
  let control_string = &json_string["control"]
373
389
  .to_owned()
@@ -388,6 +404,8 @@ fn main() {
388
404
  let filtering_time = Instant::now();
389
405
  let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
390
406
  filter_by_expr(
407
+ min_count,
408
+ min_total_count,
391
409
  &input_matrix,
392
410
  case_indexes.len(),
393
411
  control_indexes.len(),
@@ -671,6 +689,7 @@ fn adjust_p_values_bonferroni(original_p_values: Vec<PValueIndexes>) -> Vec<Adju
671
689
  adjusted_p_values
672
690
  }
673
691
 
692
+ // Original TMM normalization source code in edgeR: https://rdrr.io/bioc/edgeR/src/R/calcNormFactors.R
674
693
  fn tmm_normalization(
675
694
  input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
676
695
  lib_sizes: &Vec<f64>,
@@ -947,7 +966,10 @@ fn calc_quantile(mut input: Vec<f64>, p: f64) -> f64 {
947
966
  qs_final
948
967
  }
949
968
 
969
+ // Original filterByExpr source code in edgeR: https://rdrr.io/bioc/edgeR/src/R/filterByExpr.R
950
970
  fn filter_by_expr(
971
+ min_count: f64,
972
+ min_total_count: f64,
951
973
  raw_data: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
952
974
  num_diseased: usize,
953
975
  num_control: usize,
@@ -960,9 +982,8 @@ fn filter_by_expr(
960
982
  Vec<String>,
961
983
  ) {
962
984
  // Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>
963
- #[allow(non_upper_case_globals)]
964
- const MIN_COUNT: f64 = 10.0; // Value of constant from R implementation
965
- const MIN_TOTAL_COUNT: f64 = 15.0; // Value of constant from R implementation
985
+ //const min_count: f64 = 10.0; // Value of constant from R implementation
986
+ //const min_total_count: f64 = 15.0; // Value of constant from R implementation
966
987
  const LARGE_N: f64 = 10.0; // Value of constant from R implementation
967
988
  const MIN_PROP: f64 = 0.7; // Value of constant from R implementation
968
989
 
@@ -989,7 +1010,7 @@ fn filter_by_expr(
989
1010
  //println!("lib_sizes:{:?}", lib_sizes);
990
1011
  //println!("min_sample_size:{}", min_sample_size);
991
1012
  let median_lib_size = Data::new(lib_sizes.clone()).median();
992
- let cpm_cutoff = (MIN_COUNT / median_lib_size) * 1000000.0;
1013
+ let cpm_cutoff = (min_count / median_lib_size) * 1000000.0;
993
1014
  //println!("cpm_cutoff:{}", cpm_cutoff);
994
1015
  let cpm_matrix = cpm(&raw_data);
995
1016
  const TOL: f64 = 1e-14; // Value of constant from R implementation
@@ -1018,7 +1039,7 @@ fn filter_by_expr(
1018
1039
  //}
1019
1040
 
1020
1041
  let mut keep_total_bool = false;
1021
- if row_sums[(row, 0)] as f64 >= MIN_TOTAL_COUNT - TOL {
1042
+ if row_sums[(row, 0)] as f64 >= min_total_count - TOL {
1022
1043
  keep_total_bool = true;
1023
1044
  //keep_total.push(keep_total_bool);
1024
1045
  //positive_total += 1;
@@ -0,0 +1,152 @@
1
+ /*
2
+ This script selects the most variable metabolites by calculating the variance or the interquartile range (IQR) of each metabolite across the given samples.
3
+
4
+ Various JSON parameters:
5
+ samples: Enter the sample ID(s) separated by comma
6
+ input_file: Path to input file(txt file instead of *.gz file)
7
+ num_metabolites: The top num_metabolites that need to be reported in the output(optional. 20 by default).
8
+ param: var/iqr. This parameter decides whether metabolites are ranked by variance or by interquartile range (IQR). There is an article which states that it's better to use the interquartile range than the variance when selecting genes for clustering: https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
9
+
10
+ Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"param":"var"}' && time echo $json | target/release/computeTopTerms
11
+ */
12
+
13
+ #![allow(non_snake_case)]
14
+ use serde_json::{self,Value};
15
+ use serde::{Serialize,Deserialize};
16
+ use std::io::{self, BufReader, BufRead};
17
+ use std::fs::File;
18
+ use nalgebra::base::dimension::Dyn;
19
+ use nalgebra::base::Matrix;
20
+ use nalgebra::base::VecStorage;
21
+ use nalgebra::DMatrix;
22
+ use std::str::FromStr;
23
+ use std::cmp::Ordering;
24
+ use statrs::statistics::Data;
25
+ use statrs::statistics::OrderStatistics;
26
+ use statrs::statistics::Statistics;
27
+
28
+ fn input_data(
29
+ filename: &String,
30
+ sample_list: &Vec<&str>,
31
+ ) -> (
32
+ Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
33
+ Vec<String>,
34
+ ) {
35
+ let mut num_lines: usize = 0;
36
+ let mut metabolites: Vec<String> = Vec::with_capacity(500);
37
+ let file = File::open(filename).expect("Reading metabolite intensity file error!");
38
+ let reader = BufReader::new(file);
39
+ let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
40
+ let mut column_numbers: Vec<usize> = Vec::with_capacity(300);
41
+ for line in reader.lines() {
42
+ let line_str = line.expect("line reading error");
43
+ let columns: Vec<&str> = line_str.split("\t").collect();
44
+ // Finding column numbers corresponding to each sample given in the input list
45
+ if columns[0] == "#Metabolites" {
46
+ for sam in sample_list {
47
+ if let Some(index) = columns.iter().position(|s| s == sam) {
48
+ column_numbers.push(index)
49
+ } else {
50
+ panic!("Sample {} not found:", sam);
51
+ }
52
+ }
53
+ } else {
54
+ num_lines += 1;
55
+ metabolites.push(columns[0].to_string());
56
+ for i in &column_numbers {
57
+ let intensity = columns[*i];
58
+ let intensity_num = FromStr::from_str(intensity);
59
+ match intensity_num {
60
+ Ok(n) => {
61
+ input_vector.push(n);
62
+ }
63
+ Err(_) => {
64
+ panic!(
65
+ "Number {} in line {} and column {} is not a decimal number",
66
+ intensity,
67
+ num_lines + 1,
68
+ i + 1
69
+ )
70
+ }
71
+ }
72
+ }
73
+ }
74
+ };
75
+ let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
76
+ (dm, metabolites)
77
+ }
78
+
79
+ #[derive(Debug, Serialize, Deserialize)]
80
+ struct MetaboliteInfo {
81
+ metabolite: String,
82
+ param: f64,
83
+ }
84
+ fn calculate_variance(
85
+ input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
86
+ metabolites: Vec<String>,
87
+ param: String,
88
+ ) -> Vec<MetaboliteInfo> {
89
+ let mut metabolite_infos = Vec::<MetaboliteInfo>::new();
90
+ for row in 0..input_matrix.nrows() {
91
+ let mut metabolite_counts: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
92
+ for col in 0..input_matrix.ncols() {
93
+ metabolite_counts.push(input_matrix[(row, col)]);
94
+ }
95
+ if param == "var" {
96
+ // Calculating variance
97
+ metabolite_infos.push(MetaboliteInfo {
98
+ metabolite: metabolites[row].clone(),
99
+ param: metabolite_counts.clone().variance(),
100
+ });
101
+ } else {
102
+ // Calculating interquartile region
103
+ let metabolite_counts_data = Data::new(metabolite_counts);
104
+ metabolite_infos.push(MetaboliteInfo {
105
+ metabolite: metabolites[row].clone(),
106
+ param: metabolite_counts_data.clone().interquartile_range(),
107
+ });
108
+
109
+ }
110
+ }
111
+ metabolite_infos
112
+ .as_mut_slice()
113
+ .sort_by(|a, b| (a.param).partial_cmp(&b.param).unwrap_or(Ordering::Equal));
114
+ //println!("{:?}",metabolite_infos);
115
+ metabolite_infos
116
+ }
117
+
118
+ fn main() {
119
+ let mut input = String::new();
120
+ io::stdin().read_line(&mut input).expect("Piping error");
121
+ let input_json = serde_json::from_str::<Value>(&input).expect("Error reading input and serializing to JSON");
122
+ let sample_string = &input_json.get("samples").expect("samples is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
123
+ let file_name = &input_json.get("input_file").expect("input_file is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
124
+ let param = &input_json.get("param").expect("param is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
125
+ if param != "var" && param != "iqr" {
126
+ panic!("Unknown method:{}", param); // Check if any unknown method has been provided
127
+ };
128
+ let num_metabolites = match input_json.get("num_metabolites") {
129
+ Some(value) => {
130
+ let string_value = value.as_str().expect("Invalid value type for 'num_metabolites'");
131
+ string_value.parse::<usize>().expect("Invalid number of metabolites")
132
+ }
133
+ None => 20
134
+ };
135
+ let samples_list: Vec<&str> = sample_string.split(",").collect();
136
+ let (input_matrix, metabolites) = input_data(&file_name, &samples_list);
137
+ let metabolite_infos = calculate_variance(
138
+ input_matrix,
139
+ metabolites,
140
+ param.to_string(),
141
+ );
142
+ let mut output_string = "[".to_string();
143
+ for j in 0..num_metabolites {
144
+ let i = metabolite_infos.len() - j - 1;
145
+ output_string += &serde_json::to_string(&metabolite_infos[i]).unwrap();
146
+ if i > metabolite_infos.len() - num_metabolites {
147
+ output_string += &",".to_string();
148
+ }
149
+ }
150
+ output_string += &"]".to_string();
151
+ println!("output_json:{}", output_string);
152
+ }
package/src/genesetORA.rs CHANGED
@@ -112,12 +112,12 @@ fn main() -> Result<()> {
112
112
  + &genesetgroup
113
113
  + "'"),
114
114
  );
115
+ let mut iter = 0;
115
116
  match stmt_result {
116
117
  Ok(mut stmt) => {
117
118
  #[allow(non_snake_case)]
118
119
  let GO_iter =
119
120
  stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
120
- let mut iter = 0;
121
121
  #[allow(non_snake_case)]
122
122
  for GO_term in GO_iter {
123
123
  iter += 1;
@@ -178,14 +178,15 @@ fn main() -> Result<()> {
178
178
  }
179
179
  }
180
180
  }
181
- println!("Number of pathway entries:{}", iter);
182
181
  }
183
182
  Err(_) => panic!("sqlite database file not found"),
184
183
  }
185
- println!(
186
- "pathway_p_values:{}",
187
- adjust_p_values(pathway_p_values, num_items_output)
188
- );
184
+ let output_string = "{\"num_pathways\":".to_string()
185
+ + &iter.to_string()
186
+ + &",\"pathways\":"
187
+ + &adjust_p_values(pathway_p_values, num_items_output)
188
+ + &"}";
189
+ println!("pathway_p_values:{}", output_string);
189
190
  println!(
190
191
  "Time for calculating gene overrepresentation:{:?}",
191
192
  run_time.elapsed()
@@ -199,7 +200,10 @@ fn main() -> Result<()> {
199
200
  Ok(())
200
201
  }
201
202
 
202
- fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>, num_items_output: usize) -> String {
203
+ fn adjust_p_values(
204
+ mut original_p_values: Vec<pathway_p_value>,
205
+ mut num_items_output: usize,
206
+ ) -> String {
203
207
  // Sorting p-values in ascending order
204
208
  original_p_values.as_mut_slice().sort_by(|a, b| {
205
209
  (a.p_value_original)
@@ -243,6 +247,10 @@ fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>, num_items_output
243
247
  .unwrap_or(Ordering::Equal)
244
248
  });
245
249
 
250
+ if num_items_output > adjusted_p_values.len() {
251
+ num_items_output = adjusted_p_values.len()
252
+ }
253
+
246
254
  let mut output_string = "[".to_string();
247
255
  for i in 0..num_items_output {
248
256
  let j = adjusted_p_values.len() - i - 1;