@sjcrh/proteinpaint-rust 2.60.0 → 2.61.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +4 -0
- package/package.json +2 -2
- package/src/computeTopTerms.rs +152 -0
package/Cargo.toml
CHANGED
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.
|
|
2
|
+
"version": "2.61.1",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"description": "Rust-based utilities for proteinpaint",
|
|
5
5
|
"main": "index.js",
|
|
@@ -38,5 +38,5 @@
|
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"tape": "^5.2.2"
|
|
40
40
|
},
|
|
41
|
-
"pp_release_tag": "v2.
|
|
41
|
+
"pp_release_tag": "v2.61.1"
|
|
42
42
|
}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/*
|
|
2
|
+
This script selects the most variable metabolites by calculating the variance or the interquartile range (IQR) of each metabolite across the selected samples.
|
|
3
|
+
|
|
4
|
+
Various JSON parameters:
|
|
5
|
+
samples: Enter the sample ID(s) separated by comma
|
|
6
|
+
input_file: Path to input file(txt file instead of *.gz file)
|
|
7
|
+
num_metabolites: The top num_metabolites that need to be reported in the output(optional. 20 by default).
|
|
8
|
+
param: var/iqr. This parameter decides whether metabolites are ranked by variance (var) or by interquartile range (iqr). There is an article which states that it is better to use the interquartile range than the variance for selecting genes for clustering: https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
|
|
9
|
+
|
|
10
|
+
Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"param":"var"}' && time echo $json | target/release/computeTopTerms
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
#![allow(non_snake_case)]
|
|
14
|
+
use serde_json::{self,Value};
|
|
15
|
+
use serde::{Serialize,Deserialize};
|
|
16
|
+
use std::io::{self, BufReader, BufRead};
|
|
17
|
+
use std::fs::File;
|
|
18
|
+
use nalgebra::base::dimension::Dyn;
|
|
19
|
+
use nalgebra::base::Matrix;
|
|
20
|
+
use nalgebra::base::VecStorage;
|
|
21
|
+
use nalgebra::DMatrix;
|
|
22
|
+
use std::str::FromStr;
|
|
23
|
+
use std::cmp::Ordering;
|
|
24
|
+
use statrs::statistics::Data;
|
|
25
|
+
use statrs::statistics::OrderStatistics;
|
|
26
|
+
use statrs::statistics::Statistics;
|
|
27
|
+
|
|
28
|
+
fn input_data(
|
|
29
|
+
filename: &String,
|
|
30
|
+
sample_list: &Vec<&str>,
|
|
31
|
+
) -> (
|
|
32
|
+
Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
33
|
+
Vec<String>,
|
|
34
|
+
) {
|
|
35
|
+
let mut num_lines: usize = 0;
|
|
36
|
+
let mut metabolites: Vec<String> = Vec::with_capacity(500);
|
|
37
|
+
let file = File::open(filename).expect("Reading metabolite intensity file error!");
|
|
38
|
+
let reader = BufReader::new(file);
|
|
39
|
+
let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
|
|
40
|
+
let mut column_numbers: Vec<usize> = Vec::with_capacity(300);
|
|
41
|
+
for line in reader.lines() {
|
|
42
|
+
let line_str = line.expect("line reading error");
|
|
43
|
+
let columns: Vec<&str> = line_str.split("\t").collect();
|
|
44
|
+
// Finding column numbers corresponding to each sample given in the input list
|
|
45
|
+
if columns[0] == "#Metabolites" {
|
|
46
|
+
for sam in sample_list {
|
|
47
|
+
if let Some(index) = columns.iter().position(|s| s == sam) {
|
|
48
|
+
column_numbers.push(index)
|
|
49
|
+
} else {
|
|
50
|
+
panic!("Sample {} not found:", sam);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
} else {
|
|
54
|
+
num_lines += 1;
|
|
55
|
+
metabolites.push(columns[0].to_string());
|
|
56
|
+
for i in &column_numbers {
|
|
57
|
+
let intensity = columns[*i];
|
|
58
|
+
let intensity_num = FromStr::from_str(intensity);
|
|
59
|
+
match intensity_num {
|
|
60
|
+
Ok(n) => {
|
|
61
|
+
input_vector.push(n);
|
|
62
|
+
}
|
|
63
|
+
Err(_) => {
|
|
64
|
+
panic!(
|
|
65
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
66
|
+
intensity,
|
|
67
|
+
num_lines + 1,
|
|
68
|
+
i + 1
|
|
69
|
+
)
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
};
|
|
75
|
+
let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
|
|
76
|
+
(dm, metabolites)
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
#[derive(Debug, Serialize, Deserialize)]
|
|
80
|
+
struct MetaboliteInfo {
|
|
81
|
+
metabolite: String,
|
|
82
|
+
param: f64,
|
|
83
|
+
}
|
|
84
|
+
fn calculate_variance(
|
|
85
|
+
input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
86
|
+
metabolites: Vec<String>,
|
|
87
|
+
param: String,
|
|
88
|
+
) -> Vec<MetaboliteInfo> {
|
|
89
|
+
let mut metabolite_infos = Vec::<MetaboliteInfo>::new();
|
|
90
|
+
for row in 0..input_matrix.nrows() {
|
|
91
|
+
let mut metabolite_counts: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
|
|
92
|
+
for col in 0..input_matrix.ncols() {
|
|
93
|
+
metabolite_counts.push(input_matrix[(row, col)]);
|
|
94
|
+
}
|
|
95
|
+
if param == "var" {
|
|
96
|
+
// Calculating variance
|
|
97
|
+
metabolite_infos.push(MetaboliteInfo {
|
|
98
|
+
metabolite: metabolites[row].clone(),
|
|
99
|
+
param: metabolite_counts.clone().variance(),
|
|
100
|
+
});
|
|
101
|
+
} else {
|
|
102
|
+
// Calculating interquartile region
|
|
103
|
+
let metabolite_counts_data = Data::new(metabolite_counts);
|
|
104
|
+
metabolite_infos.push(MetaboliteInfo {
|
|
105
|
+
metabolite: metabolites[row].clone(),
|
|
106
|
+
param: metabolite_counts_data.clone().interquartile_range(),
|
|
107
|
+
});
|
|
108
|
+
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
metabolite_infos
|
|
112
|
+
.as_mut_slice()
|
|
113
|
+
.sort_by(|a, b| (a.param).partial_cmp(&b.param).unwrap_or(Ordering::Equal));
|
|
114
|
+
//println!("{:?}",metabolite_infos);
|
|
115
|
+
metabolite_infos
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
fn main() {
|
|
119
|
+
let mut input = String::new();
|
|
120
|
+
io::stdin().read_line(&mut input).expect("Piping error");
|
|
121
|
+
let input_json = serde_json::from_str::<Value>(&input).expect("Error reading input and serializing to JSON");
|
|
122
|
+
let sample_string = &input_json.get("samples").expect("samples is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
|
|
123
|
+
let file_name = &input_json.get("input_file").expect("input_file is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
|
|
124
|
+
let param = &input_json.get("param").expect("param is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
|
|
125
|
+
if param != "var" && param != "iqr" {
|
|
126
|
+
panic!("Unknown method:{}", param); // Check if any unknown method has been provided
|
|
127
|
+
};
|
|
128
|
+
let num_metabolites = match input_json.get("num_metabolites") {
|
|
129
|
+
Some(value) => {
|
|
130
|
+
let string_value = value.as_str().expect("Invalid value type for 'num_metabolites'");
|
|
131
|
+
string_value.parse::<usize>().expect("Invalid number of metabolites")
|
|
132
|
+
}
|
|
133
|
+
None => 20
|
|
134
|
+
};
|
|
135
|
+
let samples_list: Vec<&str> = sample_string.split(",").collect();
|
|
136
|
+
let (input_matrix, metabolites) = input_data(&file_name, &samples_list);
|
|
137
|
+
let metabolite_infos = calculate_variance(
|
|
138
|
+
input_matrix,
|
|
139
|
+
metabolites,
|
|
140
|
+
param.to_string(),
|
|
141
|
+
);
|
|
142
|
+
let mut output_string = "[".to_string();
|
|
143
|
+
for j in 0..num_metabolites {
|
|
144
|
+
let i = metabolite_infos.len() - j - 1;
|
|
145
|
+
output_string += &serde_json::to_string(&metabolite_infos[i]).unwrap();
|
|
146
|
+
if i > metabolite_infos.len() - num_metabolites {
|
|
147
|
+
output_string += &",".to_string();
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
output_string += &"]".to_string();
|
|
151
|
+
println!("output_json:{}", output_string);
|
|
152
|
+
}
|