@sjcrh/proteinpaint-rust 2.59.0 → 2.61.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +4 -0
- package/package.json +2 -2
- package/src/DEanalysis.rs +27 -6
- package/src/computeTopTerms.rs +152 -0
- package/src/genesetORA.rs +15 -7
package/Cargo.toml
CHANGED
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.
|
|
2
|
+
"version": "2.61.1",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"description": "Rust-based utilities for proteinpaint",
|
|
5
5
|
"main": "index.js",
|
|
@@ -38,5 +38,5 @@
|
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"tape": "^5.2.2"
|
|
40
40
|
},
|
|
41
|
-
"pp_release_tag": "v2.
|
|
41
|
+
"pp_release_tag": "v2.61.1"
|
|
42
42
|
}
|
package/src/DEanalysis.rs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
// cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
|
|
1
|
+
// cd .. && cargo build --release && json='{"min_count":10,"min_total_count":15,"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
|
|
2
2
|
// cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
|
|
3
3
|
#![allow(non_snake_case)]
|
|
4
4
|
use json;
|
|
@@ -368,6 +368,22 @@ fn main() {
|
|
|
368
368
|
match input_json {
|
|
369
369
|
Ok(json_string) => {
|
|
370
370
|
let now = Instant::now();
|
|
371
|
+
let min_count_option = json_string["min_count"].as_f64().to_owned();
|
|
372
|
+
let min_total_count_option = json_string["min_total_count"].as_f64().to_owned();
|
|
373
|
+
let min_count;
|
|
374
|
+
match min_count_option {
|
|
375
|
+
Some(x) => min_count = x,
|
|
376
|
+
None => {
|
|
377
|
+
panic!("min_count is missing a value")
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
let min_total_count;
|
|
381
|
+
match min_total_count_option {
|
|
382
|
+
Some(x) => min_total_count = x,
|
|
383
|
+
None => {
|
|
384
|
+
panic!("min_total_count is missing a value")
|
|
385
|
+
}
|
|
386
|
+
}
|
|
371
387
|
let case_string = &json_string["case"].to_owned().as_str().unwrap().to_string();
|
|
372
388
|
let control_string = &json_string["control"]
|
|
373
389
|
.to_owned()
|
|
@@ -388,6 +404,8 @@ fn main() {
|
|
|
388
404
|
let filtering_time = Instant::now();
|
|
389
405
|
let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
|
|
390
406
|
filter_by_expr(
|
|
407
|
+
min_count,
|
|
408
|
+
min_total_count,
|
|
391
409
|
&input_matrix,
|
|
392
410
|
case_indexes.len(),
|
|
393
411
|
control_indexes.len(),
|
|
@@ -671,6 +689,7 @@ fn adjust_p_values_bonferroni(original_p_values: Vec<PValueIndexes>) -> Vec<Adju
|
|
|
671
689
|
adjusted_p_values
|
|
672
690
|
}
|
|
673
691
|
|
|
692
|
+
// Original TMM normalization source code in edgeR: https://rdrr.io/bioc/edgeR/src/R/calcNormFactors.R
|
|
674
693
|
fn tmm_normalization(
|
|
675
694
|
input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
676
695
|
lib_sizes: &Vec<f64>,
|
|
@@ -947,7 +966,10 @@ fn calc_quantile(mut input: Vec<f64>, p: f64) -> f64 {
|
|
|
947
966
|
qs_final
|
|
948
967
|
}
|
|
949
968
|
|
|
969
|
+
// Original filterByExpr source code in edgeR: https://rdrr.io/bioc/edgeR/src/R/filterByExpr.R
|
|
950
970
|
fn filter_by_expr(
|
|
971
|
+
min_count: f64,
|
|
972
|
+
min_total_count: f64,
|
|
951
973
|
raw_data: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
952
974
|
num_diseased: usize,
|
|
953
975
|
num_control: usize,
|
|
@@ -960,9 +982,8 @@ fn filter_by_expr(
|
|
|
960
982
|
Vec<String>,
|
|
961
983
|
) {
|
|
962
984
|
// Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>
|
|
963
|
-
|
|
964
|
-
const MIN_COUNT: f64 = 10.0; // Value of constant from R implementation
|
|
965
|
-
const MIN_TOTAL_COUNT: f64 = 15.0; // Value of constant from R implementation
|
|
985
|
+
//const min_count: f64 = 10.0; // Value of constant from R implementation
|
|
986
|
+
//const min_total_count: f64 = 15.0; // Value of constant from R implementation
|
|
966
987
|
const LARGE_N: f64 = 10.0; // Value of constant from R implementation
|
|
967
988
|
const MIN_PROP: f64 = 0.7; // Value of constant from R implementation
|
|
968
989
|
|
|
@@ -989,7 +1010,7 @@ fn filter_by_expr(
|
|
|
989
1010
|
//println!("lib_sizes:{:?}", lib_sizes);
|
|
990
1011
|
//println!("min_sample_size:{}", min_sample_size);
|
|
991
1012
|
let median_lib_size = Data::new(lib_sizes.clone()).median();
|
|
992
|
-
let cpm_cutoff = (MIN_COUNT / median_lib_size) * 1000000.0;
|
|
1013
|
+
let cpm_cutoff = (min_count / median_lib_size) * 1000000.0;
|
|
993
1014
|
//println!("cpm_cutoff:{}", cpm_cutoff);
|
|
994
1015
|
let cpm_matrix = cpm(&raw_data);
|
|
995
1016
|
const TOL: f64 = 1e-14; // Value of constant from R implementation
|
|
@@ -1018,7 +1039,7 @@ fn filter_by_expr(
|
|
|
1018
1039
|
//}
|
|
1019
1040
|
|
|
1020
1041
|
let mut keep_total_bool = false;
|
|
1021
|
-
if row_sums[(row, 0)] as f64 >= MIN_TOTAL_COUNT - TOL {
|
|
1042
|
+
if row_sums[(row, 0)] as f64 >= min_total_count - TOL {
|
|
1022
1043
|
keep_total_bool = true;
|
|
1023
1044
|
//keep_total.push(keep_total_bool);
|
|
1024
1045
|
//positive_total += 1;
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/*
|
|
2
|
+
This script selects the top most variant metabolite by calculating the variance/interquartile region for each metabolite.
|
|
3
|
+
|
|
4
|
+
Various JSON parameters:
|
|
5
|
+
samples: Enter the sample ID(s) separated by comma
|
|
6
|
+
input_file: Path to input file(txt file instead of *.gz file)
|
|
7
|
+
num_metabolites: The top num_metabolites that need to be reported in the output(optional. 20 by default).
|
|
8
|
+
param: var/iqr . This parameter decides whether to sort metabolites using variance or interquartile region. There is an article which states that its better to use interquartile region than variance for selecting genes for clustering https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
|
|
9
|
+
|
|
10
|
+
Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"param":"var"}' && time echo $json | target/release/computeTopTerms
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
#![allow(non_snake_case)]
|
|
14
|
+
use serde_json::{self,Value};
|
|
15
|
+
use serde::{Serialize,Deserialize};
|
|
16
|
+
use std::io::{self, BufReader, BufRead};
|
|
17
|
+
use std::fs::File;
|
|
18
|
+
use nalgebra::base::dimension::Dyn;
|
|
19
|
+
use nalgebra::base::Matrix;
|
|
20
|
+
use nalgebra::base::VecStorage;
|
|
21
|
+
use nalgebra::DMatrix;
|
|
22
|
+
use std::str::FromStr;
|
|
23
|
+
use std::cmp::Ordering;
|
|
24
|
+
use statrs::statistics::Data;
|
|
25
|
+
use statrs::statistics::OrderStatistics;
|
|
26
|
+
use statrs::statistics::Statistics;
|
|
27
|
+
|
|
28
|
+
fn input_data(
|
|
29
|
+
filename: &String,
|
|
30
|
+
sample_list: &Vec<&str>,
|
|
31
|
+
) -> (
|
|
32
|
+
Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
33
|
+
Vec<String>,
|
|
34
|
+
) {
|
|
35
|
+
let mut num_lines: usize = 0;
|
|
36
|
+
let mut metabolites: Vec<String> = Vec::with_capacity(500);
|
|
37
|
+
let file = File::open(filename).expect("Reading metabolite intensity file error!");
|
|
38
|
+
let reader = BufReader::new(file);
|
|
39
|
+
let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
|
|
40
|
+
let mut column_numbers: Vec<usize> = Vec::with_capacity(300);
|
|
41
|
+
for line in reader.lines() {
|
|
42
|
+
let line_str = line.expect("line reading error");
|
|
43
|
+
let columns: Vec<&str> = line_str.split("\t").collect();
|
|
44
|
+
// Finding column numbers corresponding to each sample given in the input list
|
|
45
|
+
if columns[0] == "#Metabolites" {
|
|
46
|
+
for sam in sample_list {
|
|
47
|
+
if let Some(index) = columns.iter().position(|s| s == sam) {
|
|
48
|
+
column_numbers.push(index)
|
|
49
|
+
} else {
|
|
50
|
+
panic!("Sample {} not found:", sam);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
} else {
|
|
54
|
+
num_lines += 1;
|
|
55
|
+
metabolites.push(columns[0].to_string());
|
|
56
|
+
for i in &column_numbers {
|
|
57
|
+
let intensity = columns[*i];
|
|
58
|
+
let intensity_num = FromStr::from_str(intensity);
|
|
59
|
+
match intensity_num {
|
|
60
|
+
Ok(n) => {
|
|
61
|
+
input_vector.push(n);
|
|
62
|
+
}
|
|
63
|
+
Err(_) => {
|
|
64
|
+
panic!(
|
|
65
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
66
|
+
intensity,
|
|
67
|
+
num_lines + 1,
|
|
68
|
+
i + 1
|
|
69
|
+
)
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
};
|
|
75
|
+
let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
|
|
76
|
+
(dm, metabolites)
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
#[derive(Debug, Serialize, Deserialize)]
|
|
80
|
+
struct MetaboliteInfo {
|
|
81
|
+
metabolite: String,
|
|
82
|
+
param: f64,
|
|
83
|
+
}
|
|
84
|
+
fn calculate_variance(
|
|
85
|
+
input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
86
|
+
metabolites: Vec<String>,
|
|
87
|
+
param: String,
|
|
88
|
+
) -> Vec<MetaboliteInfo> {
|
|
89
|
+
let mut metabolite_infos = Vec::<MetaboliteInfo>::new();
|
|
90
|
+
for row in 0..input_matrix.nrows() {
|
|
91
|
+
let mut metabolite_counts: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
|
|
92
|
+
for col in 0..input_matrix.ncols() {
|
|
93
|
+
metabolite_counts.push(input_matrix[(row, col)]);
|
|
94
|
+
}
|
|
95
|
+
if param == "var" {
|
|
96
|
+
// Calculating variance
|
|
97
|
+
metabolite_infos.push(MetaboliteInfo {
|
|
98
|
+
metabolite: metabolites[row].clone(),
|
|
99
|
+
param: metabolite_counts.clone().variance(),
|
|
100
|
+
});
|
|
101
|
+
} else {
|
|
102
|
+
// Calculating interquartile region
|
|
103
|
+
let metabolite_counts_data = Data::new(metabolite_counts);
|
|
104
|
+
metabolite_infos.push(MetaboliteInfo {
|
|
105
|
+
metabolite: metabolites[row].clone(),
|
|
106
|
+
param: metabolite_counts_data.clone().interquartile_range(),
|
|
107
|
+
});
|
|
108
|
+
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
metabolite_infos
|
|
112
|
+
.as_mut_slice()
|
|
113
|
+
.sort_by(|a, b| (a.param).partial_cmp(&b.param).unwrap_or(Ordering::Equal));
|
|
114
|
+
//println!("{:?}",metabolite_infos);
|
|
115
|
+
metabolite_infos
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
fn main() {
|
|
119
|
+
let mut input = String::new();
|
|
120
|
+
io::stdin().read_line(&mut input).expect("Piping error");
|
|
121
|
+
let input_json = serde_json::from_str::<Value>(&input).expect("Error reading input and serializing to JSON");
|
|
122
|
+
let sample_string = &input_json.get("samples").expect("samples is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
|
|
123
|
+
let file_name = &input_json.get("input_file").expect("input_file is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
|
|
124
|
+
let param = &input_json.get("param").expect("param is missed from input JSON").to_owned().to_string().trim_matches('"').to_string();
|
|
125
|
+
if param != "var" && param != "iqr" {
|
|
126
|
+
panic!("Unknown method:{}", param); // Check if any unknown method has been provided
|
|
127
|
+
};
|
|
128
|
+
let num_metabolites = match input_json.get("num_metabolites") {
|
|
129
|
+
Some(value) => {
|
|
130
|
+
let string_value = value.as_str().expect("Invalid value type for 'num_metabolites'");
|
|
131
|
+
string_value.parse::<usize>().expect("Invalid number of metabolites")
|
|
132
|
+
}
|
|
133
|
+
None => 20
|
|
134
|
+
};
|
|
135
|
+
let samples_list: Vec<&str> = sample_string.split(",").collect();
|
|
136
|
+
let (input_matrix, metabolites) = input_data(&file_name, &samples_list);
|
|
137
|
+
let metabolite_infos = calculate_variance(
|
|
138
|
+
input_matrix,
|
|
139
|
+
metabolites,
|
|
140
|
+
param.to_string(),
|
|
141
|
+
);
|
|
142
|
+
let mut output_string = "[".to_string();
|
|
143
|
+
for j in 0..num_metabolites {
|
|
144
|
+
let i = metabolite_infos.len() - j - 1;
|
|
145
|
+
output_string += &serde_json::to_string(&metabolite_infos[i]).unwrap();
|
|
146
|
+
if i > metabolite_infos.len() - num_metabolites {
|
|
147
|
+
output_string += &",".to_string();
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
output_string += &"]".to_string();
|
|
151
|
+
println!("output_json:{}", output_string);
|
|
152
|
+
}
|
package/src/genesetORA.rs
CHANGED
|
@@ -112,12 +112,12 @@ fn main() -> Result<()> {
|
|
|
112
112
|
+ &genesetgroup
|
|
113
113
|
+ "'"),
|
|
114
114
|
);
|
|
115
|
+
let mut iter = 0;
|
|
115
116
|
match stmt_result {
|
|
116
117
|
Ok(mut stmt) => {
|
|
117
118
|
#[allow(non_snake_case)]
|
|
118
119
|
let GO_iter =
|
|
119
120
|
stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
|
|
120
|
-
let mut iter = 0;
|
|
121
121
|
#[allow(non_snake_case)]
|
|
122
122
|
for GO_term in GO_iter {
|
|
123
123
|
iter += 1;
|
|
@@ -178,14 +178,15 @@ fn main() -> Result<()> {
|
|
|
178
178
|
}
|
|
179
179
|
}
|
|
180
180
|
}
|
|
181
|
-
println!("Number of pathway entries:{}", iter);
|
|
182
181
|
}
|
|
183
182
|
Err(_) => panic!("sqlite database file not found"),
|
|
184
183
|
}
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
184
|
+
let output_string = "{\"num_pathways\":".to_string()
|
|
185
|
+
+ &iter.to_string()
|
|
186
|
+
+ &",\"pathways\":"
|
|
187
|
+
+ &adjust_p_values(pathway_p_values, num_items_output)
|
|
188
|
+
+ &"}";
|
|
189
|
+
println!("pathway_p_values:{}", output_string);
|
|
189
190
|
println!(
|
|
190
191
|
"Time for calculating gene overrepresentation:{:?}",
|
|
191
192
|
run_time.elapsed()
|
|
@@ -199,7 +200,10 @@ fn main() -> Result<()> {
|
|
|
199
200
|
Ok(())
|
|
200
201
|
}
|
|
201
202
|
|
|
202
|
-
fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>, num_items_output: usize) -> String {
|
|
203
|
+
fn adjust_p_values(
|
|
204
|
+
mut original_p_values: Vec<pathway_p_value>,
|
|
205
|
+
mut num_items_output: usize,
|
|
206
|
+
) -> String {
|
|
203
207
|
// Sorting p-values in ascending order
|
|
204
208
|
original_p_values.as_mut_slice().sort_by(|a, b| {
|
|
205
209
|
(a.p_value_original)
|
|
@@ -243,6 +247,10 @@ fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>, num_items_output
|
|
|
243
247
|
.unwrap_or(Ordering::Equal)
|
|
244
248
|
});
|
|
245
249
|
|
|
250
|
+
if num_items_output > adjusted_p_values.len() {
|
|
251
|
+
num_items_output = adjusted_p_values.len()
|
|
252
|
+
}
|
|
253
|
+
|
|
246
254
|
let mut output_string = "[".to_string();
|
|
247
255
|
for i in 0..num_items_output {
|
|
248
256
|
let j = adjusted_p_values.len() - i - 1;
|