@sjcrh/proteinpaint-rust 2.60.0 → 2.61.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -86,3 +86,7 @@ path="src/DEanalysis.rs"
86
86
  [[bin]]
87
87
  name="genesetORA"
88
88
  path="src/genesetORA.rs"
89
+
90
+ [[bin]]
91
+ name="computeTopTerms"
92
+ path="src/computeTopTerms.rs"
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.60.0",
2
+ "version": "2.61.1",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
@@ -38,5 +38,5 @@
38
38
  "devDependencies": {
39
39
  "tape": "^5.2.2"
40
40
  },
41
- "pp_release_tag": "v2.60.0"
41
+ "pp_release_tag": "v2.61.1"
42
42
  }
@@ -0,0 +1,152 @@
1
+ /*
2
+ This script selects the most variable metabolites by calculating the variance or interquartile range (IQR) of each metabolite across the requested samples.
3
+
4
+ Various JSON parameters:
5
+ samples: Enter the sample ID(s) separated by comma
6
+ input_file: Path to input file(txt file instead of *.gz file)
7
+ num_metabolites: The top num_metabolites that need to be reported in the output(optional. 20 by default).
8
+ param: var/iqr. This parameter decides whether metabolites are ranked by variance or by interquartile range (IQR). There is an article which states that it's better to use the interquartile range than the variance when selecting genes for clustering: https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
9
+
10
+ Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"param":"var"}' && time echo $json | target/release/computeTopTerms
11
+ */
12
+
13
+ #![allow(non_snake_case)]
14
+ use serde_json::{self,Value};
15
+ use serde::{Serialize,Deserialize};
16
+ use std::io::{self, BufReader, BufRead};
17
+ use std::fs::File;
18
+ use nalgebra::base::dimension::Dyn;
19
+ use nalgebra::base::Matrix;
20
+ use nalgebra::base::VecStorage;
21
+ use nalgebra::DMatrix;
22
+ use std::str::FromStr;
23
+ use std::cmp::Ordering;
24
+ use statrs::statistics::Data;
25
+ use statrs::statistics::OrderStatistics;
26
+ use statrs::statistics::Statistics;
27
+
28
+ fn input_data(
29
+ filename: &String,
30
+ sample_list: &Vec<&str>,
31
+ ) -> (
32
+ Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
33
+ Vec<String>,
34
+ ) {
35
+ let mut num_lines: usize = 0;
36
+ let mut metabolites: Vec<String> = Vec::with_capacity(500);
37
+ let file = File::open(filename).expect("Reading metabolite intensity file error!");
38
+ let reader = BufReader::new(file);
39
+ let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
40
+ let mut column_numbers: Vec<usize> = Vec::with_capacity(300);
41
+ for line in reader.lines() {
42
+ let line_str = line.expect("line reading error");
43
+ let columns: Vec<&str> = line_str.split("\t").collect();
44
+ // Finding column numbers corresponding to each sample given in the input list
45
+ if columns[0] == "#Metabolites" {
46
+ for sam in sample_list {
47
+ if let Some(index) = columns.iter().position(|s| s == sam) {
48
+ column_numbers.push(index)
49
+ } else {
50
+ panic!("Sample {} not found:", sam);
51
+ }
52
+ }
53
+ } else {
54
+ num_lines += 1;
55
+ metabolites.push(columns[0].to_string());
56
+ for i in &column_numbers {
57
+ let intensity = columns[*i];
58
+ let intensity_num = FromStr::from_str(intensity);
59
+ match intensity_num {
60
+ Ok(n) => {
61
+ input_vector.push(n);
62
+ }
63
+ Err(_) => {
64
+ panic!(
65
+ "Number {} in line {} and column {} is not a decimal number",
66
+ intensity,
67
+ num_lines + 1,
68
+ i + 1
69
+ )
70
+ }
71
+ }
72
+ }
73
+ }
74
+ };
75
+ let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
76
+ (dm, metabolites)
77
+ }
78
+
79
+ #[derive(Debug, Serialize, Deserialize)]
80
+ struct MetaboliteInfo {
81
+ metabolite: String,
82
+ param: f64,
83
+ }
84
+ fn calculate_variance(
85
+ input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
86
+ metabolites: Vec<String>,
87
+ param: String,
88
+ ) -> Vec<MetaboliteInfo> {
89
+ let mut metabolite_infos = Vec::<MetaboliteInfo>::new();
90
+ for row in 0..input_matrix.nrows() {
91
+ let mut metabolite_counts: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
92
+ for col in 0..input_matrix.ncols() {
93
+ metabolite_counts.push(input_matrix[(row, col)]);
94
+ }
95
+ if param == "var" {
96
+ // Calculating variance
97
+ metabolite_infos.push(MetaboliteInfo {
98
+ metabolite: metabolites[row].clone(),
99
+ param: metabolite_counts.clone().variance(),
100
+ });
101
+ } else {
102
+ // Calculating interquartile region
103
+ let metabolite_counts_data = Data::new(metabolite_counts);
104
+ metabolite_infos.push(MetaboliteInfo {
105
+ metabolite: metabolites[row].clone(),
106
+ param: metabolite_counts_data.clone().interquartile_range(),
107
+ });
108
+
109
+ }
110
+ }
111
+ metabolite_infos
112
+ .as_mut_slice()
113
+ .sort_by(|a, b| (a.param).partial_cmp(&b.param).unwrap_or(Ordering::Equal));
114
+ //println!("{:?}",metabolite_infos);
115
+ metabolite_infos
116
+ }
117
+
118
+ fn main() {
119
+ let mut input = String::new();
120
+ io::stdin().read_line(&mut input).expect("Piping error");
121
+ let input_json = serde_json::from_str::<Value>(&input).expect("Error reading input and serializing to JSON");
122
+ let sample_string = &input_json.get("samples").expect("samples is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
123
+ let file_name = &input_json.get("input_file").expect("input_file is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
124
+ let param = &input_json.get("param").expect("param is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
125
+ if param != "var" && param != "iqr" {
126
+ panic!("Unknown method:{}", param); // Check if any unknown method has been provided
127
+ };
128
+ let num_metabolites = match input_json.get("num_metabolites") {
129
+ Some(value) => {
130
+ let string_value = value.as_str().expect("Invalid value type for 'num_metabolites'");
131
+ string_value.parse::<usize>().expect("Invalid number of metabolites")
132
+ }
133
+ None => 20
134
+ };
135
+ let samples_list: Vec<&str> = sample_string.split(",").collect();
136
+ let (input_matrix, metabolites) = input_data(&file_name, &samples_list);
137
+ let metabolite_infos = calculate_variance(
138
+ input_matrix,
139
+ metabolites,
140
+ param.to_string(),
141
+ );
142
+ let mut output_string = "[".to_string();
143
+ for j in 0..num_metabolites {
144
+ let i = metabolite_infos.len() - j - 1;
145
+ output_string += &serde_json::to_string(&metabolite_infos[i]).unwrap();
146
+ if i > metabolite_infos.len() - num_metabolites {
147
+ output_string += &",".to_string();
148
+ }
149
+ }
150
+ output_string += &"]".to_string();
151
+ println!("output_json:{}", output_string);
152
+ }