@sjcrh/proteinpaint-rust 2.126.0 → 2.126.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +0 -4
- package/index.js +1 -1
- package/package.json +2 -2
- package/src/cerno.rs +0 -341
package/Cargo.toml
CHANGED
package/index.js
CHANGED
|
@@ -5,7 +5,7 @@ import { spawn, exec } from 'child_process'
|
|
|
5
5
|
import { Readable, Transform } from 'stream'
|
|
6
6
|
import { promisify } from 'util'
|
|
7
7
|
|
|
8
|
-
const __dirname = import.meta.dirname
|
|
8
|
+
const __dirname = import.meta.dirname // set __dirname for consistency with cjs code
|
|
9
9
|
|
|
10
10
|
const execPromise = promisify(exec)
|
|
11
11
|
|
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.126.0",
|
|
2
|
+
"version": "2.126.2",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Rust-based utilities for proteinpaint",
|
|
@@ -39,5 +39,5 @@
|
|
|
39
39
|
"devDependencies": {
|
|
40
40
|
"tape": "^5.2.2"
|
|
41
41
|
},
|
|
42
|
-
"pp_release_tag": "v2.126.0"
|
|
42
|
+
"pp_release_tag": "v2.126.2"
|
|
43
43
|
}
|
package/src/cerno.rs
DELETED
|
@@ -1,341 +0,0 @@
|
|
|
1
|
-
// Syntax: cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/cerno
|
|
2
|
-
#![allow(non_snake_case)]
|
|
3
|
-
use json::JsonValue;
|
|
4
|
-
use r_mathlib::chi_squared_cdf;
|
|
5
|
-
use rusqlite::{Connection, Result};
|
|
6
|
-
use serde::{Deserialize, Serialize};
|
|
7
|
-
use serde_json;
|
|
8
|
-
use std::cmp::Ordering;
|
|
9
|
-
use std::collections::HashSet;
|
|
10
|
-
use std::io;
|
|
11
|
-
|
|
12
|
-
/// A single gene-set term read from the `terms` table of the msigdb database.
#[allow(non_camel_case_types, non_snake_case)]
#[derive(Debug)]
struct GO_pathway {
    /// Value of the `id` column of the term; used to look up its genes in `term2genes`.
    GO_id: String,
}
|
|
18
|
-
|
|
19
|
-
/// One query gene together with its fold change and its position in the sorted gene list.
///
/// The field order is significant: the derived `PartialOrd` compares fields top to bottom.
#[allow(non_camel_case_types, non_snake_case)]
#[derive(Debug, Clone, PartialEq, PartialOrd)]
struct gene_order {
    /// Gene symbol as supplied in the `genes` array of the input JSON.
    gene_name: String,
    /// Fold change supplied at the same index of the `fold_change` array.
    fold_change: f64,
    /// Position after sorting by descending fold change; `None` until it has been assigned.
    rank: Option<usize>,
}
|
|
27
|
-
|
|
28
|
-
#[allow(non_camel_case_types)]
|
|
29
|
-
#[allow(non_snake_case)]
|
|
30
|
-
#[derive(Debug, Serialize, Deserialize)]
|
|
31
|
-
//#[allow(dead_code)]
|
|
32
|
-
struct pathway_p_value {
|
|
33
|
-
pathway_name: String,
|
|
34
|
-
p_value_original: f64,
|
|
35
|
-
p_value_adjusted: Option<f64>,
|
|
36
|
-
gene_set_hits: String,
|
|
37
|
-
auc: f64,
|
|
38
|
-
es: f64,
|
|
39
|
-
gene_set_size: usize,
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
#[allow(non_camel_case_types)]
|
|
43
|
-
#[allow(non_snake_case)]
|
|
44
|
-
#[derive(Debug, Serialize, Deserialize)]
|
|
45
|
-
//#[allow(dead_code)]
|
|
46
|
-
struct output_struct {
|
|
47
|
-
pval: f64,
|
|
48
|
-
fdr: f64,
|
|
49
|
-
leading_edge: String,
|
|
50
|
-
auc: f64,
|
|
51
|
-
es: f64,
|
|
52
|
-
geneset_size: usize,
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
fn main() -> Result<()> {
|
|
56
|
-
let mut input = String::new();
|
|
57
|
-
match io::stdin().read_line(&mut input) {
|
|
58
|
-
// Accepting the piped input from nodejs (or command line from testing)
|
|
59
|
-
Ok(_n) => {
|
|
60
|
-
let input_json = json::parse(&input);
|
|
61
|
-
match input_json {
|
|
62
|
-
Ok(json_string) => {
|
|
63
|
-
let msigdb_input: &JsonValue = &json_string["db"];
|
|
64
|
-
let msigdb;
|
|
65
|
-
match msigdb_input.as_str() {
|
|
66
|
-
Some(db_string) => msigdb = db_string.to_string(),
|
|
67
|
-
None => panic!("msigdb file path is missing"),
|
|
68
|
-
}
|
|
69
|
-
let genesetgroup;
|
|
70
|
-
let genesetgroup_input: &JsonValue = &json_string["geneset_group"];
|
|
71
|
-
match genesetgroup_input.as_str() {
|
|
72
|
-
Some(genesetgroup_string) => genesetgroup = genesetgroup_string.to_string(),
|
|
73
|
-
None => panic!("genesetgroup is missing"),
|
|
74
|
-
}
|
|
75
|
-
let sample_genes_input: &JsonValue = &json_string["genes"];
|
|
76
|
-
let mut sample_genes = Vec::<&str>::new();
|
|
77
|
-
for iter in 0..sample_genes_input.len() {
|
|
78
|
-
let item = sample_genes_input[iter].as_str().unwrap();
|
|
79
|
-
sample_genes.push(item);
|
|
80
|
-
}
|
|
81
|
-
//println!("sample_genes:{:?}", sample_genes);
|
|
82
|
-
|
|
83
|
-
let fold_change_input: &JsonValue = &json_string["fold_change"];
|
|
84
|
-
let mut fold_change_f64 = Vec::<f64>::new();
|
|
85
|
-
for iter in 0..fold_change_input.len() {
|
|
86
|
-
let item = fold_change_input[iter].as_f64().unwrap();
|
|
87
|
-
fold_change_f64.push(item);
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
if sample_genes.len() == 0 {
|
|
91
|
-
panic!("No sample genes provided");
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
if sample_genes.len() != fold_change_f64.len() {
|
|
95
|
-
panic!("Length of genes array and fold change array are not equal");
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
let mut genes_vector: Vec<gene_order> = Vec::with_capacity(sample_genes.len());
|
|
99
|
-
for i in 0..sample_genes.len() {
|
|
100
|
-
let item: gene_order = gene_order {
|
|
101
|
-
gene_name: sample_genes[i].to_string(),
|
|
102
|
-
fold_change: fold_change_f64[i],
|
|
103
|
-
rank: None, // Will be calculated later
|
|
104
|
-
};
|
|
105
|
-
genes_vector.push(item)
|
|
106
|
-
}
|
|
107
|
-
let mut pathway_p_values: Vec<pathway_p_value> = Vec::with_capacity(10000);
|
|
108
|
-
|
|
109
|
-
let genedb_input: &JsonValue = &json_string["genedb"];
|
|
110
|
-
let genedb;
|
|
111
|
-
match genedb_input.as_str() {
|
|
112
|
-
Some(gene_db_string) => genedb = gene_db_string.to_string(),
|
|
113
|
-
None => panic!("genedb file path is missing"),
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
let filter_non_coding_genes_input: &JsonValue = &json_string["filter_non_coding_genes"];
|
|
117
|
-
let filter_non_coding_genes: bool = filter_non_coding_genes_input.as_bool().unwrap();
|
|
118
|
-
|
|
119
|
-
let genedbconn = Connection::open(genedb)?;
|
|
120
|
-
let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
|
|
121
|
-
let mut sample_coding_genes: Vec<gene_order> = Vec::with_capacity(24000);
|
|
122
|
-
match genedb_result {
|
|
123
|
-
Ok(mut x) => {
|
|
124
|
-
let mut genes = x.query([])?;
|
|
125
|
-
while let Some(coding_gene) = genes.next()? {
|
|
126
|
-
//println!("coding_gene:{:?}", coding_gene);
|
|
127
|
-
for sample_gene in &genes_vector {
|
|
128
|
-
let code_gene: String = coding_gene.get(0).unwrap();
|
|
129
|
-
if filter_non_coding_genes == true && code_gene == *sample_gene.gene_name {
|
|
130
|
-
sample_coding_genes.push(sample_gene.clone());
|
|
131
|
-
} else if filter_non_coding_genes == false {
|
|
132
|
-
sample_coding_genes.push(sample_gene.clone());
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
}
|
|
136
|
-
}
|
|
137
|
-
Err(_) => {}
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
if sample_coding_genes.len() == 0 {
|
|
141
|
-
panic!("All query genes are non-coding");
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
// Sort sample_coding_gene in descending order
|
|
145
|
-
sample_coding_genes
|
|
146
|
-
.as_mut_slice()
|
|
147
|
-
.sort_by(|a, b| (b.fold_change).partial_cmp(&a.fold_change).unwrap_or(Ordering::Equal));
|
|
148
|
-
|
|
149
|
-
// Assign ranks to each gene
|
|
150
|
-
for i in 0..sample_coding_genes.len() {
|
|
151
|
-
sample_coding_genes[i].rank = Some(i)
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
//println!("sample_genes:{:?}", sample_genes);
|
|
155
|
-
//println!("background_genes:{:?}", background_genes);
|
|
156
|
-
|
|
157
|
-
let msigdbconn = Connection::open(msigdb)?;
|
|
158
|
-
let stmt_result = msigdbconn
|
|
159
|
-
.prepare(&("select id from terms where parent_id='".to_owned() + &genesetgroup + "'"));
|
|
160
|
-
match stmt_result {
|
|
161
|
-
Ok(mut stmt) => {
|
|
162
|
-
#[allow(non_snake_case)]
|
|
163
|
-
let GO_iter = stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
|
|
164
|
-
#[allow(non_snake_case)]
|
|
165
|
-
for GO_term in GO_iter {
|
|
166
|
-
match GO_term {
|
|
167
|
-
Ok(n) => {
|
|
168
|
-
//println!("GO term {:?}", n);
|
|
169
|
-
let sql_statement =
|
|
170
|
-
"select genes from term2genes where id='".to_owned() + &n.GO_id + &"'";
|
|
171
|
-
//println!("sql_statement:{}", sql_statement);
|
|
172
|
-
let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
|
|
173
|
-
//println!("gene_stmt:{:?}", gene_stmt);
|
|
174
|
-
|
|
175
|
-
let mut rows = gene_stmt.query([])?;
|
|
176
|
-
let mut names = HashSet::<String>::new();
|
|
177
|
-
while let Some(row) = rows.next()? {
|
|
178
|
-
let a: String = row.get(0)?;
|
|
179
|
-
let input_gene_json = json::parse(&a);
|
|
180
|
-
match input_gene_json {
|
|
181
|
-
Ok(json_genes) => {
|
|
182
|
-
for json_iter in 0..json_genes.len() {
|
|
183
|
-
names.insert(json_genes[json_iter]["symbol"].to_string());
|
|
184
|
-
}
|
|
185
|
-
}
|
|
186
|
-
Err(_) => {
|
|
187
|
-
panic!("Symbol, ensg, enstCanonical structure is missing!")
|
|
188
|
-
}
|
|
189
|
-
}
|
|
190
|
-
}
|
|
191
|
-
let gene_set_size = names.len();
|
|
192
|
-
let (p_value, auc, es, matches, gene_set_hits) =
|
|
193
|
-
cerno(&sample_coding_genes, names);
|
|
194
|
-
|
|
195
|
-
if matches >= 1.0
|
|
196
|
-
&& p_value.is_nan() == false
|
|
197
|
-
&& es.is_nan() == false
|
|
198
|
-
&& es != f64::INFINITY
|
|
199
|
-
&& auc != f64::INFINITY
|
|
200
|
-
&& auc.is_nan() == false
|
|
201
|
-
{
|
|
202
|
-
pathway_p_values.push(pathway_p_value {
|
|
203
|
-
pathway_name: n.GO_id,
|
|
204
|
-
p_value_original: p_value,
|
|
205
|
-
p_value_adjusted: None,
|
|
206
|
-
auc: auc,
|
|
207
|
-
es: es,
|
|
208
|
-
gene_set_hits: gene_set_hits,
|
|
209
|
-
gene_set_size: gene_set_size,
|
|
210
|
-
})
|
|
211
|
-
}
|
|
212
|
-
}
|
|
213
|
-
Err(_) => {
|
|
214
|
-
println!("GO term not found!")
|
|
215
|
-
}
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
}
|
|
219
|
-
Err(_) => panic!("sqlite database file not found"),
|
|
220
|
-
}
|
|
221
|
-
let output_string =
|
|
222
|
-
"result: {".to_string() + &"\"data\":" + &adjust_p_values(pathway_p_values) + &"}";
|
|
223
|
-
println!("{}", output_string);
|
|
224
|
-
}
|
|
225
|
-
Err(error) => println!("Incorrect json:{}", error),
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
Err(error) => println!("Piping error: {}", error),
|
|
229
|
-
}
|
|
230
|
-
Ok(())
|
|
231
|
-
}
|
|
232
|
-
|
|
233
|
-
fn cerno(sample_coding_genes: &Vec<gene_order>, genes_in_pathway: HashSet<String>) -> (f64, f64, f64, f64, String) {
|
|
234
|
-
// Filter the sample_coding_genes vector to only include those whose gene_names are in the HashSet genes_in_pathway
|
|
235
|
-
let gene_intersections: Vec<&gene_order> = sample_coding_genes
|
|
236
|
-
.iter()
|
|
237
|
-
.filter(|sample_coding_genes| genes_in_pathway.contains(&sample_coding_genes.gene_name)) // Check if name is in the HashSet genes_in_pathway
|
|
238
|
-
.collect(); // Collect the results into a new vector
|
|
239
|
-
|
|
240
|
-
let N1 = gene_intersections.len() as f64;
|
|
241
|
-
let N = sample_coding_genes.len() as f64;
|
|
242
|
-
let mut gene_set_hits: String = "".to_string();
|
|
243
|
-
for gene in &gene_intersections {
|
|
244
|
-
gene_set_hits += &(gene.gene_name.to_string() + &",");
|
|
245
|
-
}
|
|
246
|
-
if gene_intersections.len() > 0 {
|
|
247
|
-
// Remove the last "," in string
|
|
248
|
-
gene_set_hits.pop();
|
|
249
|
-
}
|
|
250
|
-
|
|
251
|
-
let ranks: Vec<usize> = gene_intersections // x <- l %in% mset$gs2gv[[m]] ; ranks <- c(1:N)[x]
|
|
252
|
-
.iter()
|
|
253
|
-
.map(|x| x.rank.unwrap())
|
|
254
|
-
.collect::<Vec<usize>>();
|
|
255
|
-
|
|
256
|
-
let cerno: f64 = ranks // -2 * sum( log(ranks/N) )
|
|
257
|
-
.iter()
|
|
258
|
-
.map(|x| ((*x as f64) / N).ln())
|
|
259
|
-
.collect::<Vec<f64>>()
|
|
260
|
-
.iter()
|
|
261
|
-
.sum::<f64>()
|
|
262
|
-
* (-2.0);
|
|
263
|
-
|
|
264
|
-
let cES: f64 = cerno / (2.0 * (N1 as f64)); // cES <- cerno/(2*N1)
|
|
265
|
-
let N2 = N - N1; // N2 = N - N1
|
|
266
|
-
let R1 = ranks.iter().sum::<usize>() as f64; // R1 <- sum(ranks)
|
|
267
|
-
let U = N1 * N2 + N1 * (N1 + 1.0) / 2.0 - R1; // U <- N1*N2+N1*(N1+1)/2-R1
|
|
268
|
-
let AUC = U / (N1 * N2); // AUC <- U/(N1*N2)
|
|
269
|
-
let p_value = chi_squared_cdf(cerno, 2.0 * N1, false, false); // pchisq(ret$cerno, 2*N1, lower.tail=FALSE)
|
|
270
|
-
(p_value, AUC, cES, N1, gene_set_hits)
|
|
271
|
-
}
|
|
272
|
-
|
|
273
|
-
fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>) -> String {
|
|
274
|
-
// Sorting p-values in ascending order
|
|
275
|
-
original_p_values.as_mut_slice().sort_by(|a, b| {
|
|
276
|
-
(a.p_value_original)
|
|
277
|
-
.partial_cmp(&b.p_value_original)
|
|
278
|
-
.unwrap_or(Ordering::Equal)
|
|
279
|
-
});
|
|
280
|
-
|
|
281
|
-
let mut adjusted_p_values: Vec<pathway_p_value> = Vec::with_capacity(original_p_values.len());
|
|
282
|
-
let mut old_p_value: f64 = 0.0;
|
|
283
|
-
let mut rank: f64 = original_p_values.len() as f64;
|
|
284
|
-
for j in 0..original_p_values.len() {
|
|
285
|
-
let i = original_p_values.len() - j - 1;
|
|
286
|
-
|
|
287
|
-
//println!("p_val:{}", p_val);
|
|
288
|
-
let mut adjusted_p_val: f64 = original_p_values[i].p_value_original * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
|
|
289
|
-
if adjusted_p_val > 1.0 {
|
|
290
|
-
// p_value should NEVER be greater than 1
|
|
291
|
-
adjusted_p_val = 1.0;
|
|
292
|
-
}
|
|
293
|
-
//println!("Original p_value:{}", original_p_values[i].p_value);
|
|
294
|
-
//println!("Raw adjusted p_value:{}", adjusted_p_value);
|
|
295
|
-
if i != original_p_values.len() - 1 {
|
|
296
|
-
if adjusted_p_val > old_p_value {
|
|
297
|
-
adjusted_p_val = old_p_value;
|
|
298
|
-
}
|
|
299
|
-
}
|
|
300
|
-
old_p_value = adjusted_p_val;
|
|
301
|
-
//println!("adjusted_p_value:{}", adjusted_p_val);
|
|
302
|
-
rank -= 1.0;
|
|
303
|
-
|
|
304
|
-
adjusted_p_values.push(pathway_p_value {
|
|
305
|
-
pathway_name: original_p_values[i].pathway_name.clone(),
|
|
306
|
-
p_value_original: original_p_values[i].p_value_original,
|
|
307
|
-
p_value_adjusted: Some(adjusted_p_val),
|
|
308
|
-
auc: original_p_values[i].auc,
|
|
309
|
-
es: original_p_values[i].es,
|
|
310
|
-
gene_set_hits: original_p_values[i].gene_set_hits.clone(),
|
|
311
|
-
gene_set_size: original_p_values[i].gene_set_size,
|
|
312
|
-
});
|
|
313
|
-
}
|
|
314
|
-
adjusted_p_values.as_mut_slice().sort_by(|a, b| {
|
|
315
|
-
(a.p_value_adjusted.unwrap())
|
|
316
|
-
.partial_cmp(&b.p_value_adjusted.unwrap())
|
|
317
|
-
.unwrap_or(Ordering::Equal)
|
|
318
|
-
});
|
|
319
|
-
|
|
320
|
-
let mut output_string = "{".to_string();
|
|
321
|
-
for i in 0..adjusted_p_values.len() {
|
|
322
|
-
let item = output_struct {
|
|
323
|
-
pval: adjusted_p_values[i].p_value_original,
|
|
324
|
-
fdr: adjusted_p_values[i].p_value_adjusted.unwrap(),
|
|
325
|
-
leading_edge: adjusted_p_values[i].gene_set_hits.clone(),
|
|
326
|
-
geneset_size: adjusted_p_values[i].gene_set_size,
|
|
327
|
-
es: adjusted_p_values[i].es,
|
|
328
|
-
auc: adjusted_p_values[i].auc,
|
|
329
|
-
};
|
|
330
|
-
output_string += &format!(
|
|
331
|
-
"\"{}\":{}",
|
|
332
|
-
adjusted_p_values[i].pathway_name.clone(),
|
|
333
|
-
serde_json::to_string(&item).unwrap()
|
|
334
|
-
);
|
|
335
|
-
if i < adjusted_p_values.len() - 1 {
|
|
336
|
-
output_string += &",".to_string();
|
|
337
|
-
}
|
|
338
|
-
}
|
|
339
|
-
output_string += &"}".to_string();
|
|
340
|
-
output_string
|
|
341
|
-
}
|