@sjcrh/proteinpaint-rust 2.128.3-bd2a3a1c9.0 → 2.129.1-80343740e.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +3 -1
- package/package.json +1 -1
- package/src/cerno.rs +170 -73
package/Cargo.toml
CHANGED
|
@@ -13,7 +13,7 @@ kodama = "0.3"
|
|
|
13
13
|
rayon = "1.7.0"
|
|
14
14
|
bgzip = "0.3.1"
|
|
15
15
|
petgraph = "0.6.3"
|
|
16
|
-
rusqlite="0.
|
|
16
|
+
rusqlite="0.35"
|
|
17
17
|
ndarray = "0.16.1"
|
|
18
18
|
hdf5 = { package = "hdf5-metno", version = "0.9.0" }
|
|
19
19
|
nalgebra = {version = "0.32.2", features = ["serde-serialize"]}
|
|
@@ -35,6 +35,8 @@ flate2 = "1"
|
|
|
35
35
|
futures = "0.3"
|
|
36
36
|
num_cpus = "1.16.0"
|
|
37
37
|
memchr = "2"
|
|
38
|
+
r2d2_sqlite = "0.28.0"
|
|
39
|
+
r2d2 = "0.8.10"
|
|
38
40
|
|
|
39
41
|
[profile.release]
|
|
40
42
|
lto = "fat"
|
package/package.json
CHANGED
package/src/cerno.rs
CHANGED
|
@@ -2,16 +2,20 @@
|
|
|
2
2
|
#![allow(non_snake_case)]
|
|
3
3
|
use json::JsonValue;
|
|
4
4
|
use r_mathlib::chi_squared_cdf;
|
|
5
|
+
use r2d2;
|
|
6
|
+
use r2d2_sqlite::SqliteConnectionManager;
|
|
5
7
|
use rusqlite::{Connection, Result};
|
|
6
8
|
use serde::{Deserialize, Serialize};
|
|
7
9
|
use serde_json;
|
|
8
10
|
use std::cmp::Ordering;
|
|
9
11
|
use std::collections::HashSet;
|
|
10
12
|
use std::io;
|
|
13
|
+
use std::sync::{Arc, Mutex}; // Multithreading library
|
|
14
|
+
use std::thread;
|
|
11
15
|
|
|
12
16
|
#[allow(non_camel_case_types)]
|
|
13
17
|
#[allow(non_snake_case)]
|
|
14
|
-
#[derive(Debug)]
|
|
18
|
+
#[derive(Debug, Clone)]
|
|
15
19
|
struct GO_pathway {
|
|
16
20
|
GO_id: String,
|
|
17
21
|
}
|
|
@@ -21,7 +25,7 @@ struct GO_pathway {
|
|
|
21
25
|
#[derive(Debug, Clone, PartialEq, PartialOrd)]
|
|
22
26
|
struct gene_order {
|
|
23
27
|
gene_name: String,
|
|
24
|
-
fold_change:
|
|
28
|
+
fold_change: f32,
|
|
25
29
|
rank: Option<usize>,
|
|
26
30
|
}
|
|
27
31
|
|
|
@@ -31,11 +35,11 @@ struct gene_order {
|
|
|
31
35
|
//#[allow(dead_code)]
|
|
32
36
|
struct pathway_p_value {
|
|
33
37
|
pathway_name: String,
|
|
34
|
-
p_value_original:
|
|
35
|
-
p_value_adjusted: Option<
|
|
38
|
+
p_value_original: f32,
|
|
39
|
+
p_value_adjusted: Option<f32>,
|
|
36
40
|
gene_set_hits: String,
|
|
37
|
-
auc:
|
|
38
|
-
es:
|
|
41
|
+
auc: f32,
|
|
42
|
+
es: f32,
|
|
39
43
|
gene_set_size: usize,
|
|
40
44
|
}
|
|
41
45
|
|
|
@@ -44,13 +48,16 @@ struct pathway_p_value {
|
|
|
44
48
|
#[derive(Debug, Serialize, Deserialize)]
|
|
45
49
|
//#[allow(dead_code)]
|
|
46
50
|
struct output_struct {
|
|
47
|
-
pval:
|
|
48
|
-
fdr:
|
|
51
|
+
pval: f32,
|
|
52
|
+
fdr: f32,
|
|
49
53
|
leading_edge: String,
|
|
50
|
-
auc:
|
|
51
|
-
es:
|
|
54
|
+
auc: f32,
|
|
55
|
+
es: f32,
|
|
52
56
|
geneset_size: usize,
|
|
53
57
|
}
|
|
58
|
+
const PAR_CUTOFF: usize = 1000; // Cutoff for triggering multithreading processing of data
|
|
59
|
+
#[allow(non_upper_case_globals)]
|
|
60
|
+
const max_threads: usize = 3; // Max number of threads in case the parallel processing of reads is invoked
|
|
54
61
|
|
|
55
62
|
fn main() -> Result<()> {
|
|
56
63
|
let mut input = String::new();
|
|
@@ -81,17 +88,17 @@ fn main() -> Result<()> {
|
|
|
81
88
|
//println!("sample_genes:{:?}", sample_genes);
|
|
82
89
|
|
|
83
90
|
let fold_change_input: &JsonValue = &json_string["fold_change"];
|
|
84
|
-
let mut
|
|
91
|
+
let mut fold_change_f32 = Vec::<f32>::new();
|
|
85
92
|
for iter in 0..fold_change_input.len() {
|
|
86
|
-
let item = fold_change_input[iter].
|
|
87
|
-
|
|
93
|
+
let item = fold_change_input[iter].as_f32().unwrap();
|
|
94
|
+
fold_change_f32.push(item);
|
|
88
95
|
}
|
|
89
96
|
|
|
90
97
|
if sample_genes.len() == 0 {
|
|
91
98
|
panic!("No sample genes provided");
|
|
92
99
|
}
|
|
93
100
|
|
|
94
|
-
if sample_genes.len() !=
|
|
101
|
+
if sample_genes.len() != fold_change_f32.len() {
|
|
95
102
|
panic!("Length of genes array and fold change array are not equal");
|
|
96
103
|
}
|
|
97
104
|
|
|
@@ -99,7 +106,7 @@ fn main() -> Result<()> {
|
|
|
99
106
|
for i in 0..sample_genes.len() {
|
|
100
107
|
let item: gene_order = gene_order {
|
|
101
108
|
gene_name: sample_genes[i].to_string(),
|
|
102
|
-
fold_change:
|
|
109
|
+
fold_change: fold_change_f32[i],
|
|
103
110
|
rank: None, // Will be calculated later
|
|
104
111
|
};
|
|
105
112
|
genes_vector.push(item)
|
|
@@ -154,7 +161,7 @@ fn main() -> Result<()> {
|
|
|
154
161
|
//println!("sample_genes:{:?}", sample_genes);
|
|
155
162
|
//println!("background_genes:{:?}", background_genes);
|
|
156
163
|
|
|
157
|
-
let msigdbconn = Connection::open(msigdb)?;
|
|
164
|
+
let msigdbconn = Connection::open(&msigdb)?;
|
|
158
165
|
let stmt_result = msigdbconn
|
|
159
166
|
.prepare(&("select id from terms where parent_id='".to_owned() + &genesetgroup + "'"));
|
|
160
167
|
match stmt_result {
|
|
@@ -162,58 +169,148 @@ fn main() -> Result<()> {
|
|
|
162
169
|
#[allow(non_snake_case)]
|
|
163
170
|
let GO_iter = stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
|
|
164
171
|
#[allow(non_snake_case)]
|
|
172
|
+
let mut genesets = Vec::<String>::new();
|
|
165
173
|
for GO_term in GO_iter {
|
|
166
174
|
match GO_term {
|
|
167
175
|
Ok(n) => {
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
176
|
+
genesets.push(n.GO_id);
|
|
177
|
+
}
|
|
178
|
+
Err(_) => {
|
|
179
|
+
println!("GO term not found!")
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
if genesets.len() < PAR_CUTOFF {
|
|
185
|
+
for gs in genesets {
|
|
186
|
+
let sql_statement =
|
|
187
|
+
"select genes from term2genes where id='".to_owned() + &gs + &"'";
|
|
188
|
+
//println!("sql_statement:{}", sql_statement);
|
|
189
|
+
let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
|
|
190
|
+
//println!("gene_stmt:{:?}", gene_stmt);
|
|
191
|
+
|
|
192
|
+
let mut rows = gene_stmt.query([])?;
|
|
193
|
+
let mut names = HashSet::<String>::new();
|
|
194
|
+
while let Some(row) = rows.next()? {
|
|
195
|
+
let a: String = row.get(0)?;
|
|
196
|
+
let input_gene_json = json::parse(&a);
|
|
197
|
+
match input_gene_json {
|
|
198
|
+
Ok(json_genes) => {
|
|
199
|
+
for json_iter in 0..json_genes.len() {
|
|
200
|
+
names.insert(json_genes[json_iter]["symbol"].to_string());
|
|
188
201
|
}
|
|
189
202
|
}
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
cerno(&sample_coding_genes, names);
|
|
194
|
-
|
|
195
|
-
if matches >= 1.0
|
|
196
|
-
&& p_value.is_nan() == false
|
|
197
|
-
&& es.is_nan() == false
|
|
198
|
-
&& es != f64::INFINITY
|
|
199
|
-
&& auc != f64::INFINITY
|
|
200
|
-
&& auc.is_nan() == false
|
|
201
|
-
{
|
|
202
|
-
pathway_p_values.push(pathway_p_value {
|
|
203
|
-
pathway_name: n.GO_id,
|
|
204
|
-
p_value_original: p_value,
|
|
205
|
-
p_value_adjusted: None,
|
|
206
|
-
auc: auc,
|
|
207
|
-
es: es,
|
|
208
|
-
gene_set_hits: gene_set_hits,
|
|
209
|
-
gene_set_size: gene_set_size,
|
|
210
|
-
})
|
|
203
|
+
Err(_) => {
|
|
204
|
+
panic!("Symbol, ensg, enstCanonical structure is missing!")
|
|
205
|
+
}
|
|
211
206
|
}
|
|
212
207
|
}
|
|
213
|
-
|
|
214
|
-
|
|
208
|
+
let gene_set_size = names.len();
|
|
209
|
+
let (p_value, auc, es, matches, gene_set_hits) = cerno(&sample_coding_genes, names);
|
|
210
|
+
|
|
211
|
+
if matches >= 1.0
|
|
212
|
+
&& p_value.is_nan() == false
|
|
213
|
+
&& es.is_nan() == false
|
|
214
|
+
&& es != f32::INFINITY
|
|
215
|
+
&& auc != f32::INFINITY
|
|
216
|
+
&& auc.is_nan() == false
|
|
217
|
+
{
|
|
218
|
+
pathway_p_values.push(pathway_p_value {
|
|
219
|
+
pathway_name: gs,
|
|
220
|
+
p_value_original: p_value,
|
|
221
|
+
p_value_adjusted: None,
|
|
222
|
+
auc: auc,
|
|
223
|
+
es: es,
|
|
224
|
+
gene_set_hits: gene_set_hits,
|
|
225
|
+
gene_set_size: gene_set_size,
|
|
226
|
+
})
|
|
215
227
|
}
|
|
216
228
|
}
|
|
229
|
+
} else {
|
|
230
|
+
// Multithreaded implementation
|
|
231
|
+
let manager = SqliteConnectionManager::file(&msigdb); // This enables sqlite query from multiple threads simultaneously
|
|
232
|
+
let pool = r2d2::Pool::new(manager).unwrap(); // This enables sqlite query from multiple threads simultaneously
|
|
233
|
+
let genesets = Arc::new(genesets);
|
|
234
|
+
let pool_arc = Arc::new(pool);
|
|
235
|
+
let sample_coding_genes = Arc::new(sample_coding_genes);
|
|
236
|
+
let pathway_p_values_temp =
|
|
237
|
+
Arc::new(Mutex::new(Vec::<pathway_p_value>::with_capacity(genesets.len())));
|
|
238
|
+
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
239
|
+
for thread_num in 0..max_threads {
|
|
240
|
+
let genesets = Arc::clone(&genesets);
|
|
241
|
+
let pool_arc = Arc::clone(&pool_arc);
|
|
242
|
+
let sample_coding_genes = Arc::clone(&sample_coding_genes);
|
|
243
|
+
let pathway_p_values_temp = Arc::clone(&pathway_p_values_temp);
|
|
244
|
+
let handle = thread::spawn(move || {
|
|
245
|
+
let mut pathway_p_values_thread: Vec<pathway_p_value> =
|
|
246
|
+
Vec::with_capacity(10000);
|
|
247
|
+
for iter in 0..genesets.len() {
|
|
248
|
+
let remainder: usize = iter % max_threads;
|
|
249
|
+
if remainder == thread_num {
|
|
250
|
+
let sql_statement = "select genes from term2genes where id='"
|
|
251
|
+
.to_owned()
|
|
252
|
+
+ &genesets[iter]
|
|
253
|
+
+ &"'";
|
|
254
|
+
//println!("sql_statement:{}", sql_statement);
|
|
255
|
+
let conn = pool_arc.get().unwrap();
|
|
256
|
+
let mut gene_stmt = conn.prepare(&sql_statement).unwrap();
|
|
257
|
+
//println!("gene_stmt:{:?}", gene_stmt);
|
|
258
|
+
|
|
259
|
+
let mut rows = gene_stmt.query([]).unwrap();
|
|
260
|
+
let mut names = HashSet::<String>::new();
|
|
261
|
+
while let Some(row) = rows.next().unwrap() {
|
|
262
|
+
let a: String = row.get(0).unwrap();
|
|
263
|
+
let input_gene_json = json::parse(&a);
|
|
264
|
+
match input_gene_json {
|
|
265
|
+
Ok(json_genes) => {
|
|
266
|
+
for json_iter in 0..json_genes.len() {
|
|
267
|
+
names.insert(
|
|
268
|
+
json_genes[json_iter]["symbol"].to_string(),
|
|
269
|
+
);
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
Err(_) => {
|
|
273
|
+
panic!("Symbol, ensg, enstCanonical structure is missing!")
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
let gene_set_size = names.len();
|
|
278
|
+
let (p_value, auc, es, matches, gene_set_hits) =
|
|
279
|
+
cerno(&sample_coding_genes, names);
|
|
280
|
+
|
|
281
|
+
if matches >= 1.0
|
|
282
|
+
&& p_value.is_nan() == false
|
|
283
|
+
&& es.is_nan() == false
|
|
284
|
+
&& es != f32::INFINITY
|
|
285
|
+
&& auc != f32::INFINITY
|
|
286
|
+
&& auc.is_nan() == false
|
|
287
|
+
{
|
|
288
|
+
pathway_p_values_thread.push(pathway_p_value {
|
|
289
|
+
pathway_name: genesets[iter].clone(),
|
|
290
|
+
p_value_original: p_value,
|
|
291
|
+
p_value_adjusted: None,
|
|
292
|
+
auc: auc,
|
|
293
|
+
es: es,
|
|
294
|
+
gene_set_hits: gene_set_hits,
|
|
295
|
+
gene_set_size: gene_set_size,
|
|
296
|
+
})
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
pathway_p_values_temp
|
|
301
|
+
.lock()
|
|
302
|
+
.unwrap()
|
|
303
|
+
.append(&mut pathway_p_values_thread);
|
|
304
|
+
drop(pathway_p_values_temp);
|
|
305
|
+
});
|
|
306
|
+
handles.push(handle); // The handle (which contains the thread) is stored in the handles vector
|
|
307
|
+
}
|
|
308
|
+
for handle in handles {
|
|
309
|
+
// Wait for all threads to finish before proceeding further
|
|
310
|
+
handle.join().unwrap();
|
|
311
|
+
}
|
|
312
|
+
// Combining data from all different threads
|
|
313
|
+
pathway_p_values.append(&mut *pathway_p_values_temp.lock().unwrap());
|
|
217
314
|
}
|
|
218
315
|
}
|
|
219
316
|
Err(_) => panic!("sqlite database file not found"),
|
|
@@ -229,15 +326,15 @@ fn main() -> Result<()> {
|
|
|
229
326
|
Ok(())
|
|
230
327
|
}
|
|
231
328
|
|
|
232
|
-
fn cerno(sample_coding_genes: &Vec<gene_order>, genes_in_pathway: HashSet<String>) -> (
|
|
329
|
+
fn cerno(sample_coding_genes: &Vec<gene_order>, genes_in_pathway: HashSet<String>) -> (f32, f32, f32, f32, String) {
|
|
233
330
|
// Filter the sample_coding_genes vector to only include those whose gene_names are in the HashSet genes_in_pathway
|
|
234
331
|
let gene_intersections: Vec<&gene_order> = sample_coding_genes
|
|
235
332
|
.iter()
|
|
236
333
|
.filter(|sample_coding_genes| genes_in_pathway.contains(&sample_coding_genes.gene_name)) // Check if name is in the HashSet genes_in_pathway
|
|
237
334
|
.collect(); // Collect the results into a new vector
|
|
238
335
|
|
|
239
|
-
let N1 = gene_intersections.len() as
|
|
240
|
-
let N = sample_coding_genes.len() as
|
|
336
|
+
let N1 = gene_intersections.len() as f32;
|
|
337
|
+
let N = sample_coding_genes.len() as f32;
|
|
241
338
|
let mut gene_set_hits: String = "".to_string();
|
|
242
339
|
for gene in &gene_intersections {
|
|
243
340
|
gene_set_hits += &(gene.gene_name.to_string() + &",");
|
|
@@ -252,21 +349,21 @@ fn cerno(sample_coding_genes: &Vec<gene_order>, genes_in_pathway: HashSet<String
|
|
|
252
349
|
.map(|x| x.rank.unwrap())
|
|
253
350
|
.collect::<Vec<usize>>();
|
|
254
351
|
|
|
255
|
-
let cerno:
|
|
352
|
+
let cerno: f32 = ranks // -2 * sum( log(ranks/N) )
|
|
256
353
|
.iter()
|
|
257
|
-
.map(|x| ((*x as
|
|
258
|
-
.collect::<Vec<
|
|
354
|
+
.map(|x| ((*x as f32) / N).ln())
|
|
355
|
+
.collect::<Vec<f32>>()
|
|
259
356
|
.iter()
|
|
260
|
-
.sum::<
|
|
357
|
+
.sum::<f32>()
|
|
261
358
|
* (-2.0);
|
|
262
359
|
|
|
263
|
-
let cES:
|
|
360
|
+
let cES: f32 = cerno / (2.0 * (N1 as f32)); // cES <- cerno/(2*N1)
|
|
264
361
|
let N2 = N - N1; // N2 = N - N1
|
|
265
|
-
let R1 = ranks.iter().sum::<usize>() as
|
|
362
|
+
let R1 = ranks.iter().sum::<usize>() as f32; // R1 <- sum(ranks)
|
|
266
363
|
let U = N1 * N2 + N1 * (N1 + 1.0) / 2.0 - R1; // U <- N1*N2+N1*(N1+1)/2-R1
|
|
267
364
|
let AUC = U / (N1 * N2); // AUC <- U/(N1*N2)
|
|
268
|
-
let p_value = chi_squared_cdf(cerno, 2.0 * N1, false, false); // pchisq(ret$cerno, 2*N1, lower.tail=FALSE)
|
|
269
|
-
(p_value, AUC, cES, N1, gene_set_hits)
|
|
365
|
+
let p_value = chi_squared_cdf(cerno as f64, (2.0 * N1) as f64, false, false); // pchisq(ret$cerno, 2*N1, lower.tail=FALSE)
|
|
366
|
+
(p_value as f32, AUC, cES, N1, gene_set_hits)
|
|
270
367
|
}
|
|
271
368
|
|
|
272
369
|
fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>) -> String {
|
|
@@ -278,13 +375,13 @@ fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>) -> String {
|
|
|
278
375
|
});
|
|
279
376
|
|
|
280
377
|
let mut adjusted_p_values: Vec<pathway_p_value> = Vec::with_capacity(original_p_values.len());
|
|
281
|
-
let mut old_p_value:
|
|
282
|
-
let mut rank:
|
|
378
|
+
let mut old_p_value: f32 = 0.0;
|
|
379
|
+
let mut rank: f32 = original_p_values.len() as f32;
|
|
283
380
|
for j in 0..original_p_values.len() {
|
|
284
381
|
let i = original_p_values.len() - j - 1;
|
|
285
382
|
|
|
286
383
|
//println!("p_val:{}", p_val);
|
|
287
|
-
let mut adjusted_p_val:
|
|
384
|
+
let mut adjusted_p_val: f32 = original_p_values[i].p_value_original * (original_p_values.len() as f32 / rank); // adjusted p-value = original_p_value * (N/rank)
|
|
288
385
|
if adjusted_p_val > 1.0 {
|
|
289
386
|
// p_value should NEVER be greater than 1
|
|
290
387
|
adjusted_p_val = 1.0;
|