@sjcrh/proteinpaint-rust 2.129.2 → 2.129.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -13,7 +13,7 @@ kodama = "0.3"
 rayon = "1.7.0"
 bgzip = "0.3.1"
 petgraph = "0.6.3"
-rusqlite="0.31.0"
+rusqlite="0.36.0"
 ndarray = "0.16.1"
 hdf5 = { package = "hdf5-metno", version = "0.9.0" }
 nalgebra = {version = "0.32.2", features = ["serde-serialize"]}
@@ -35,6 +35,8 @@ flate2 = "1"
 futures = "0.3"
 num_cpus = "1.16.0"
 memchr = "2"
+r2d2_sqlite = "0.29.0"
+r2d2 = "0.8.10"
 
 [profile.release]
 lto = "fat"
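rusqlite is bumped from 0.31.0 to 0.36.0, and r2d2 plus r2d2_sqlite are added so src/cerno.rs can hand a pooled SQLite connection to each worker thread (see the cerno.rs changes below). A minimal sketch of that pattern, for illustration only and not code from the package, assuming a hypothetical db.sqlite file that contains the terms table queried later in the diff:

use r2d2_sqlite::SqliteConnectionManager;
use std::thread;

fn main() {
    // Hypothetical database file; the package pools its msigdb file instead.
    let manager = SqliteConnectionManager::file("db.sqlite");
    let pool = r2d2::Pool::new(manager).unwrap();

    let handles: Vec<_> = (0..3)
        .map(|_| {
            let pool = pool.clone(); // r2d2::Pool clones cheaply and is Send
            thread::spawn(move || {
                let conn = pool.get().unwrap(); // checked-out connection, returned to the pool on drop
                let n: i64 = conn
                    .query_row("select count(*) from terms", [], |row| row.get(0))
                    .unwrap();
                println!("terms rows: {}", n);
            })
        })
        .collect();

    for handle in handles {
        handle.join().unwrap();
    }
}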
package/package.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "version": "2.129.2",
+  "version": "2.129.6",
   "name": "@sjcrh/proteinpaint-rust",
   "type": "module",
   "description": "Rust-based utilities for proteinpaint",
package/src/cerno.rs CHANGED
@@ -1,41 +1,39 @@
 // Syntax: cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/cerno
 #![allow(non_snake_case)]
 use json::JsonValue;
-use r_mathlib::chi_squared_cdf;
+use r2d2;
+use r2d2_sqlite::SqliteConnectionManager;
 use rusqlite::{Connection, Result};
 use serde::{Deserialize, Serialize};
 use serde_json;
 use std::cmp::Ordering;
 use std::collections::HashSet;
 use std::io;
+use std::sync::{Arc, Mutex}; // Multithreading library
+use std::thread;
+
+mod stats_functions;
+#[cfg(test)]
+mod test_cerno; // Contains test examples to test cerno
 
 #[allow(non_camel_case_types)]
 #[allow(non_snake_case)]
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 struct GO_pathway {
     GO_id: String,
 }
 
-#[allow(non_camel_case_types)]
-#[allow(non_snake_case)]
-#[derive(Debug, Clone, PartialEq, PartialOrd)]
-struct gene_order {
-    gene_name: String,
-    fold_change: f64,
-    rank: Option<usize>,
-}
-
 #[allow(non_camel_case_types)]
 #[allow(non_snake_case)]
 #[derive(Debug, Serialize, Deserialize)]
 //#[allow(dead_code)]
 struct pathway_p_value {
     pathway_name: String,
-    p_value_original: f64,
-    p_value_adjusted: Option<f64>,
+    p_value_original: f32,
+    p_value_adjusted: Option<f32>,
     gene_set_hits: String,
-    auc: f64,
-    es: f64,
+    auc: f32,
+    es: f32,
     gene_set_size: usize,
 }
 
@@ -44,13 +42,16 @@ struct pathway_p_value {
 #[derive(Debug, Serialize, Deserialize)]
 //#[allow(dead_code)]
 struct output_struct {
-    pval: f64,
-    fdr: f64,
+    pval: f32,
+    fdr: f32,
     leading_edge: String,
-    auc: f64,
-    es: f64,
+    auc: f32,
+    es: f32,
     geneset_size: usize,
 }
+const PAR_CUTOFF: usize = 1000; // Cutoff for triggering multithreaded processing of the genesets
+#[allow(non_upper_case_globals)]
+const max_threads: usize = 3; // Max number of threads when parallel processing of the genesets is invoked
 
 fn main() -> Result<()> {
     let mut input = String::new();
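The two new constants gate the parallel path added further down in this file: when at least PAR_CUTOFF genesets are returned, each of max_threads threads takes every geneset whose index satisfies iter % max_threads == thread_num. A minimal, self-contained sketch of that cutoff and round-robin split, with placeholder work instead of the package's per-geneset query and CERNO test:

use std::sync::Arc;
use std::thread;

const PAR_CUTOFF: usize = 1000;
const MAX_THREADS: usize = 3;

fn main() {
    let genesets: Vec<String> = (0..2500).map(|i| format!("GO:{:07}", i)).collect();

    if genesets.len() < PAR_CUTOFF {
        // Serial branch: every geneset is handled on the main thread.
        println!("processing {} genesets serially", genesets.len());
    } else {
        let genesets = Arc::new(genesets);
        let mut handles = Vec::new();
        for thread_num in 0..MAX_THREADS {
            let genesets = Arc::clone(&genesets);
            handles.push(thread::spawn(move || {
                // Round-robin: thread t handles indices t, t + MAX_THREADS, t + 2*MAX_THREADS, ...
                let mine = (0..genesets.len()).filter(|&i| i % MAX_THREADS == thread_num).count();
                println!("thread {} processed {} genesets", thread_num, mine);
            }));
        }
        for handle in handles {
            handle.join().unwrap();
        }
    }
}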
@@ -81,25 +82,25 @@ fn main() -> Result<()> {
     //println!("sample_genes:{:?}", sample_genes);
 
     let fold_change_input: &JsonValue = &json_string["fold_change"];
-    let mut fold_change_f64 = Vec::<f64>::new();
+    let mut fold_change_f32 = Vec::<f32>::new();
     for iter in 0..fold_change_input.len() {
-        let item = fold_change_input[iter].as_f64().unwrap();
-        fold_change_f64.push(item);
+        let item = fold_change_input[iter].as_f32().unwrap();
+        fold_change_f32.push(item);
     }
 
     if sample_genes.len() == 0 {
         panic!("No sample genes provided");
     }
 
-    if sample_genes.len() != fold_change_f64.len() {
+    if sample_genes.len() != fold_change_f32.len() {
         panic!("Length of genes array and fold change array are not equal");
     }
 
-    let mut genes_vector: Vec<gene_order> = Vec::with_capacity(sample_genes.len());
+    let mut genes_vector: Vec<stats_functions::gene_order> = Vec::with_capacity(sample_genes.len());
     for i in 0..sample_genes.len() {
-        let item: gene_order = gene_order {
+        let item: stats_functions::gene_order = stats_functions::gene_order {
             gene_name: sample_genes[i].to_string(),
-            fold_change: fold_change_f64[i],
+            fold_change: fold_change_f32[i],
             rank: None, // Will be calculated later
         };
         genes_vector.push(item)
@@ -118,7 +119,7 @@ fn main() -> Result<()> {
 
     let genedbconn = Connection::open(genedb)?;
     let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
-    let mut sample_coding_genes: Vec<gene_order> = Vec::with_capacity(24000);
+    let mut sample_coding_genes: Vec<stats_functions::gene_order> = Vec::with_capacity(24000);
     match genedb_result {
         Ok(mut x) => {
             let mut genes = x.query([])?;
@@ -145,16 +146,28 @@ fn main() -> Result<()> {
     sample_coding_genes
         .as_mut_slice()
         .sort_by(|a, b| (b.fold_change).partial_cmp(&a.fold_change).unwrap_or(Ordering::Equal));
+    let mut genes_descending = sample_coding_genes.clone();
+    //println!("genes_descending:{:?}", genes_descending);
+
+    // Sort sample_coding_genes in ascending order
+    sample_coding_genes
+        .as_mut_slice()
+        .sort_by(|a, b| (a.fold_change).partial_cmp(&b.fold_change).unwrap_or(Ordering::Equal));
+    let mut genes_ascending = sample_coding_genes.clone();
+    //println!("genes_ascending:{:?}", genes_ascending);
+
+    drop(sample_coding_genes); // sample_coding_genes is no longer needed, so the variable is dropped
 
     // Assign ranks to each gene
-    for i in 0..sample_coding_genes.len() {
-        sample_coding_genes[i].rank = Some(i)
+    for i in 0..genes_descending.len() {
+        genes_descending[i].rank = Some(i);
+        genes_ascending[i].rank = Some(i)
     }
 
     //println!("sample_genes:{:?}", sample_genes);
     //println!("background_genes:{:?}", background_genes);
 
-    let msigdbconn = Connection::open(msigdb)?;
+    let msigdbconn = Connection::open(&msigdb)?;
     let stmt_result = msigdbconn
         .prepare(&("select id from terms where parent_id='".to_owned() + &genesetgroup + "'"));
     match stmt_result {
@@ -162,58 +175,151 @@
             #[allow(non_snake_case)]
             let GO_iter = stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
             #[allow(non_snake_case)]
+            let mut genesets = Vec::<String>::new();
             for GO_term in GO_iter {
                 match GO_term {
                     Ok(n) => {
-                        //println!("GO term {:?}", n);
-                        let sql_statement =
-                            "select genes from term2genes where id='".to_owned() + &n.GO_id + &"'";
-                        //println!("sql_statement:{}", sql_statement);
-                        let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
-                        //println!("gene_stmt:{:?}", gene_stmt);
+                        genesets.push(n.GO_id);
+                    }
+                    Err(_) => {
+                        println!("GO term not found!")
+                    }
+                }
+            }
 
-                        let mut rows = gene_stmt.query([])?;
-                        let mut names = HashSet::<String>::new();
-                        while let Some(row) = rows.next()? {
-                            let a: String = row.get(0)?;
-                            let input_gene_json = json::parse(&a);
-                            match input_gene_json {
-                                Ok(json_genes) => {
-                                    for json_iter in 0..json_genes.len() {
-                                        names.insert(json_genes[json_iter]["symbol"].to_string());
-                                    }
-                                }
-                                Err(_) => {
-                                    panic!("Symbol, ensg, enstCanonical structure is missing!")
+            if genesets.len() < PAR_CUTOFF {
+                for gs in genesets {
+                    let sql_statement =
+                        "select genes from term2genes where id='".to_owned() + &gs + &"'";
+                    //println!("sql_statement:{}", sql_statement);
+                    let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
+                    //println!("gene_stmt:{:?}", gene_stmt);
+
+                    let mut rows = gene_stmt.query([])?;
+                    let mut names = HashSet::<String>::new();
+                    while let Some(row) = rows.next()? {
+                        let a: String = row.get(0)?;
+                        let input_gene_json = json::parse(&a);
+                        match input_gene_json {
+                            Ok(json_genes) => {
+                                for json_iter in 0..json_genes.len() {
+                                    names.insert(json_genes[json_iter]["symbol"].to_string());
                                 }
                             }
-                        }
-                        let gene_set_size = names.len();
-                        let (p_value, auc, es, matches, gene_set_hits) =
-                            cerno(&sample_coding_genes, names);
-
-                        if matches >= 1.0
-                            && p_value.is_nan() == false
-                            && es.is_nan() == false
-                            && es != f64::INFINITY
-                            && auc != f64::INFINITY
-                            && auc.is_nan() == false
-                        {
-                            pathway_p_values.push(pathway_p_value {
-                                pathway_name: n.GO_id,
-                                p_value_original: p_value,
-                                p_value_adjusted: None,
-                                auc: auc,
-                                es: es,
-                                gene_set_hits: gene_set_hits,
-                                gene_set_size: gene_set_size,
-                            })
+                            Err(_) => {
+                                panic!("Symbol, ensg, enstCanonical structure is missing!")
+                            }
                         }
                     }
-                    Err(_) => {
-                        println!("GO term not found!")
+                    let gene_set_size = names.len();
+                    let (p_value, auc, es, matches, gene_set_hits, _cerno_output) =
+                        stats_functions::cerno(&genes_descending, &genes_ascending, names);
+
+                    if matches >= 1.0
+                        && p_value.is_nan() == false
+                        && es.is_nan() == false
+                        && es != f32::INFINITY
+                        && auc != f32::INFINITY
+                        && auc.is_nan() == false
+                    {
+                        pathway_p_values.push(pathway_p_value {
+                            pathway_name: gs,
+                            p_value_original: p_value,
+                            p_value_adjusted: None,
+                            auc: auc,
+                            es: es,
+                            gene_set_hits: gene_set_hits,
+                            gene_set_size: gene_set_size,
+                        })
                     }
                 }
+            } else {
+                // Multithreaded implementation
+                let manager = SqliteConnectionManager::file(&msigdb); // This enables sqlite query from multiple threads simultaneously
+                let pool = r2d2::Pool::new(manager).unwrap(); // This enables sqlite query from multiple threads simultaneously
+                let genesets = Arc::new(genesets);
+                let pool_arc = Arc::new(pool);
+                let genes_descending = Arc::new(genes_descending);
+                let genes_ascending = Arc::new(genes_ascending);
+                let pathway_p_values_temp =
+                    Arc::new(Mutex::new(Vec::<pathway_p_value>::with_capacity(genesets.len())));
+                let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
+                for thread_num in 0..max_threads {
+                    let genesets = Arc::clone(&genesets);
+                    let pool_arc = Arc::clone(&pool_arc);
+                    let genes_descending = Arc::clone(&genes_descending);
+                    let genes_ascending = Arc::clone(&genes_ascending);
+                    let pathway_p_values_temp = Arc::clone(&pathway_p_values_temp);
+                    let handle = thread::spawn(move || {
+                        let mut pathway_p_values_thread: Vec<pathway_p_value> =
+                            Vec::with_capacity(10000);
+                        for iter in 0..genesets.len() {
+                            let remainder: usize = iter % max_threads;
+                            if remainder == thread_num {
+                                let sql_statement = "select genes from term2genes where id='"
+                                    .to_owned()
+                                    + &genesets[iter]
+                                    + &"'";
+                                //println!("sql_statement:{}", sql_statement);
+                                let conn = pool_arc.get().unwrap();
+                                let mut gene_stmt = conn.prepare(&sql_statement).unwrap();
+                                //println!("gene_stmt:{:?}", gene_stmt);
+
+                                let mut rows = gene_stmt.query([]).unwrap();
+                                let mut names = HashSet::<String>::new();
+                                while let Some(row) = rows.next().unwrap() {
+                                    let a: String = row.get(0).unwrap();
+                                    let input_gene_json = json::parse(&a);
+                                    match input_gene_json {
+                                        Ok(json_genes) => {
+                                            for json_iter in 0..json_genes.len() {
+                                                names.insert(
+                                                    json_genes[json_iter]["symbol"].to_string(),
+                                                );
+                                            }
+                                        }
+                                        Err(_) => {
+                                            panic!("Symbol, ensg, enstCanonical structure is missing!")
+                                        }
+                                    }
+                                }
+                                let gene_set_size = names.len();
+                                let (p_value, auc, es, matches, gene_set_hits, _cerno_output) =
+                                    stats_functions::cerno(&genes_descending, &genes_ascending, names);
+
+                                if matches >= 1.0
+                                    && p_value.is_nan() == false
+                                    && es.is_nan() == false
+                                    && es != f32::INFINITY
+                                    && auc != f32::INFINITY
+                                    && auc.is_nan() == false
+                                {
+                                    pathway_p_values_thread.push(pathway_p_value {
+                                        pathway_name: genesets[iter].clone(),
+                                        p_value_original: p_value,
+                                        p_value_adjusted: None,
+                                        auc: auc,
+                                        es: es,
+                                        gene_set_hits: gene_set_hits,
+                                        gene_set_size: gene_set_size,
+                                    })
+                                }
+                            }
+                        }
+                        pathway_p_values_temp
+                            .lock()
+                            .unwrap()
+                            .append(&mut pathway_p_values_thread);
+                        drop(pathway_p_values_temp);
+                    });
+                    handles.push(handle); // The handle (which contains the thread) is stored in the handles vector
+                }
+                for handle in handles {
+                    // Wait for all threads to finish before proceeding further
+                    handle.join().unwrap();
+                }
+                // Combining data from all different threads
+                pathway_p_values.append(&mut *pathway_p_values_temp.lock().unwrap());
             }
         }
         Err(_) => panic!("sqlite database file not found"),
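In the parallel branch above, each worker pushes its results into a thread-local Vec and appends that buffer into the shared Mutex-guarded accumulator once, just before exiting, so each thread takes the lock only a single time. A stripped-down sketch of that aggregation pattern, with integer placeholders instead of pathway_p_value records; it illustrates the pattern, not the package's code:

use std::sync::{Arc, Mutex};
use std::thread;

fn main() {
    let all_results = Arc::new(Mutex::new(Vec::<u64>::new()));
    let mut handles = Vec::new();
    for thread_num in 0..3u64 {
        let all_results = Arc::clone(&all_results);
        handles.push(thread::spawn(move || {
            // Per-thread buffer: results accumulate locally, with no locking per item.
            let mut local: Vec<u64> = (0..5).map(|i| thread_num * 100 + i).collect();
            // Single lock at the end to merge this thread's buffer into the shared vector.
            all_results.lock().unwrap().append(&mut local);
        }));
    }
    for handle in handles {
        handle.join().unwrap();
    }
    println!("collected {} results", all_results.lock().unwrap().len());
}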
@@ -229,46 +335,6 @@
     Ok(())
 }
 
-fn cerno(sample_coding_genes: &Vec<gene_order>, genes_in_pathway: HashSet<String>) -> (f64, f64, f64, f64, String) {
-    // Filter the sample_coding_genes vector to only include those whose gene_names are in the HashSet genes_in_pathway
-    let gene_intersections: Vec<&gene_order> = sample_coding_genes
-        .iter()
-        .filter(|sample_coding_genes| genes_in_pathway.contains(&sample_coding_genes.gene_name)) // Check if name is in the HashSet genes_in_pathway
-        .collect(); // Collect the results into a new vector
-
-    let N1 = gene_intersections.len() as f64;
-    let N = sample_coding_genes.len() as f64;
-    let mut gene_set_hits: String = "".to_string();
-    for gene in &gene_intersections {
-        gene_set_hits += &(gene.gene_name.to_string() + &",");
-    }
-    if gene_intersections.len() > 0 {
-        // Remove the last "," in string
-        gene_set_hits.pop();
-    }
-
-    let ranks: Vec<usize> = gene_intersections // x <- l %in% mset$gs2gv[[m]] ; ranks <- c(1:N)[x]
-        .iter()
-        .map(|x| x.rank.unwrap())
-        .collect::<Vec<usize>>();
-
-    let cerno: f64 = ranks // -2 * sum( log(ranks/N) )
-        .iter()
-        .map(|x| ((*x as f64) / N).ln())
-        .collect::<Vec<f64>>()
-        .iter()
-        .sum::<f64>()
-        * (-2.0);
-
-    let cES: f64 = cerno / (2.0 * (N1 as f64)); // cES <- cerno/(2*N1)
-    let N2 = N - N1; // N2 = N - N1
-    let R1 = ranks.iter().sum::<usize>() as f64; // R1 <- sum(ranks)
-    let U = N1 * N2 + N1 * (N1 + 1.0) / 2.0 - R1; // U <- N1*N2+N1*(N1+1)/2-R1
-    let AUC = U / (N1 * N2); // AUC <- U/(N1*N2)
-    let p_value = chi_squared_cdf(cerno, 2.0 * N1, false, false); // pchisq(ret$cerno, 2*N1, lower.tail=FALSE)
-    (p_value, AUC, cES, N1, gene_set_hits)
-}
-
 fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>) -> String {
     // Sorting p-values in ascending order
     original_p_values.as_mut_slice().sort_by(|a, b| {
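The standalone cerno() helper is deleted here; the new code calls stats_functions::cerno() instead, so the computation presumably moves into the new stats_functions module, which is not shown in this diff. For reference, following the R-style comments in the removed function, with N ranked coding genes, N1 of them in the gene set (N2 = N - N1), hit ranks r_1..r_{N1}, and R1 their sum, the removed code computed:

\mathrm{CERNO} = -2 \sum_{i=1}^{N_1} \ln\frac{r_i}{N}, \qquad cES = \frac{\mathrm{CERNO}}{2 N_1}

U = N_1 N_2 + \frac{N_1 (N_1 + 1)}{2} - R_1, \qquad \mathrm{AUC} = \frac{U}{N_1 N_2}

p = P\left(\chi^2_{2 N_1} \ge \mathrm{CERNO}\right)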
@@ -278,13 +344,13 @@ fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>) -> String {
     });
 
     let mut adjusted_p_values: Vec<pathway_p_value> = Vec::with_capacity(original_p_values.len());
-    let mut old_p_value: f64 = 0.0;
-    let mut rank: f64 = original_p_values.len() as f64;
+    let mut old_p_value: f32 = 0.0;
+    let mut rank: f32 = original_p_values.len() as f32;
     for j in 0..original_p_values.len() {
         let i = original_p_values.len() - j - 1;
 
         //println!("p_val:{}", p_val);
-        let mut adjusted_p_val: f64 = original_p_values[i].p_value_original * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
+        let mut adjusted_p_val: f32 = original_p_values[i].p_value_original * (original_p_values.len() as f32 / rank); // adjusted p-value = original_p_value * (N/rank)
         if adjusted_p_val > 1.0 {
             // p_value should NEVER be greater than 1
             adjusted_p_val = 1.0;
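The f32 migration also touches adjust_p_values(), which sorts the original p-values in ascending order, walks them from least to most significant, scales each by N/rank, and clamps the result at 1, as its inline comment states. That N/rank scaling is the Benjamini-Hochberg style adjustment: for the i-th smallest of N p-values,

p^{\mathrm{adj}}_{(i)} = \min\left(1,\; p_{(i)} \cdot \frac{N}{i}\right)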