@sjcrh/proteinpaint-rust 2.126.2 → 2.128.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/Cargo.toml +4 -0
  2. package/package.json +2 -2
  3. package/src/cerno.rs +340 -0
package/Cargo.toml CHANGED
@@ -105,3 +105,7 @@ path="src/validateHDF5.rs"
105
105
  [[bin]]
106
106
  name="gdcGRIN2"
107
107
  path="src/gdcGRIN2.rs"
108
+
109
+ [[bin]]
110
+ name="cerno"
111
+ path="src/cerno.rs"
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.126.2",
2
+ "version": "2.128.1",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "type": "module",
5
5
  "description": "Rust-based utilities for proteinpaint",
@@ -39,5 +39,5 @@
39
39
  "devDependencies": {
40
40
  "tape": "^5.2.2"
41
41
  },
42
- "pp_release_tag": "v2.126.2"
42
+ "pp_release_tag": "v2.128.1"
43
43
  }
package/src/cerno.rs ADDED
@@ -0,0 +1,340 @@
1
+ // Syntax: cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/cerno
2
+ #![allow(non_snake_case)]
3
+ use json::JsonValue;
4
+ use r_mathlib::chi_squared_cdf;
5
+ use rusqlite::{Connection, Result};
6
+ use serde::{Deserialize, Serialize};
7
+ use serde_json;
8
+ use std::cmp::Ordering;
9
+ use std::collections::HashSet;
10
+ use std::io;
11
+
12
/// One gene set (pathway) identifier, as read from the first column of a
/// `terms` row in the msigdb SQLite database.
#[allow(non_camel_case_types, non_snake_case)]
#[derive(Debug)]
struct GO_pathway {
    /// Identifier of the gene set; also used to look up its genes.
    GO_id: String,
}
18
+
19
/// A query gene together with its fold change and, once the gene list has
/// been sorted, its position in that ranking.
#[allow(non_camel_case_types, non_snake_case)]
#[derive(Debug, Clone, PartialEq, PartialOrd)]
struct gene_order {
    /// Gene symbol as supplied in the input.
    gene_name: String,
    /// Fold change supplied for this gene; the ranking is derived from it.
    fold_change: f64,
    /// Position in the ranked gene list; `None` until ranks are assigned.
    rank: Option<usize>,
}
27
+
28
+ #[allow(non_camel_case_types)]
29
+ #[allow(non_snake_case)]
30
+ #[derive(Debug, Serialize, Deserialize)]
31
+ //#[allow(dead_code)]
32
+ struct pathway_p_value {
33
+ pathway_name: String,
34
+ p_value_original: f64,
35
+ p_value_adjusted: Option<f64>,
36
+ gene_set_hits: String,
37
+ auc: f64,
38
+ es: f64,
39
+ gene_set_size: usize,
40
+ }
41
+
42
+ #[allow(non_camel_case_types)]
43
+ #[allow(non_snake_case)]
44
+ #[derive(Debug, Serialize, Deserialize)]
45
+ //#[allow(dead_code)]
46
+ struct output_struct {
47
+ pval: f64,
48
+ fdr: f64,
49
+ leading_edge: String,
50
+ auc: f64,
51
+ es: f64,
52
+ geneset_size: usize,
53
+ }
54
+
55
+ fn main() -> Result<()> {
56
+ let mut input = String::new();
57
+ match io::stdin().read_line(&mut input) {
58
+ // Accepting the piped input from nodejs (or command line from testing)
59
+ Ok(_n) => {
60
+ let input_json = json::parse(&input);
61
+ match input_json {
62
+ Ok(json_string) => {
63
+ let msigdb_input: &JsonValue = &json_string["db"];
64
+ let msigdb;
65
+ match msigdb_input.as_str() {
66
+ Some(db_string) => msigdb = db_string.to_string(),
67
+ None => panic!("msigdb file path is missing"),
68
+ }
69
+ let genesetgroup;
70
+ let genesetgroup_input: &JsonValue = &json_string["geneset_group"];
71
+ match genesetgroup_input.as_str() {
72
+ Some(genesetgroup_string) => genesetgroup = genesetgroup_string.to_string(),
73
+ None => panic!("genesetgroup is missing"),
74
+ }
75
+ let sample_genes_input: &JsonValue = &json_string["genes"];
76
+ let mut sample_genes = Vec::<&str>::new();
77
+ for iter in 0..sample_genes_input.len() {
78
+ let item = sample_genes_input[iter].as_str().unwrap();
79
+ sample_genes.push(item);
80
+ }
81
+ //println!("sample_genes:{:?}", sample_genes);
82
+
83
+ let fold_change_input: &JsonValue = &json_string["fold_change"];
84
+ let mut fold_change_f64 = Vec::<f64>::new();
85
+ for iter in 0..fold_change_input.len() {
86
+ let item = fold_change_input[iter].as_f64().unwrap();
87
+ fold_change_f64.push(item);
88
+ }
89
+
90
+ if sample_genes.len() == 0 {
91
+ panic!("No sample genes provided");
92
+ }
93
+
94
+ if sample_genes.len() != fold_change_f64.len() {
95
+ panic!("Length of genes array and fold change array are not equal");
96
+ }
97
+
98
+ let mut genes_vector: Vec<gene_order> = Vec::with_capacity(sample_genes.len());
99
+ for i in 0..sample_genes.len() {
100
+ let item: gene_order = gene_order {
101
+ gene_name: sample_genes[i].to_string(),
102
+ fold_change: fold_change_f64[i],
103
+ rank: None, // Will be calculated later
104
+ };
105
+ genes_vector.push(item)
106
+ }
107
+ let mut pathway_p_values: Vec<pathway_p_value> = Vec::with_capacity(10000);
108
+
109
+ let genedb_input: &JsonValue = &json_string["genedb"];
110
+ let genedb;
111
+ match genedb_input.as_str() {
112
+ Some(gene_db_string) => genedb = gene_db_string.to_string(),
113
+ None => panic!("genedb file path is missing"),
114
+ }
115
+
116
+ let filter_non_coding_genes_input: &JsonValue = &json_string["filter_non_coding_genes"];
117
+ let filter_non_coding_genes: bool = filter_non_coding_genes_input.as_bool().unwrap();
118
+
119
+ let genedbconn = Connection::open(genedb)?;
120
+ let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
121
+ let mut sample_coding_genes: Vec<gene_order> = Vec::with_capacity(24000);
122
+ match genedb_result {
123
+ Ok(mut x) => {
124
+ let mut genes = x.query([])?;
125
+ while let Some(coding_gene) = genes.next()? {
126
+ //println!("coding_gene:{:?}", coding_gene);
127
+ for sample_gene in &genes_vector {
128
+ let code_gene: String = coding_gene.get(0).unwrap();
129
+ if filter_non_coding_genes == true && code_gene == *sample_gene.gene_name {
130
+ sample_coding_genes.push(sample_gene.clone());
131
+ } else if filter_non_coding_genes == false {
132
+ sample_coding_genes.push(sample_gene.clone());
133
+ }
134
+ }
135
+ }
136
+ }
137
+ Err(_) => {}
138
+ }
139
+
140
+ if sample_coding_genes.len() == 0 {
141
+ panic!("All query genes are non-coding");
142
+ }
143
+
144
+ // Sort sample_coding_gene in descending order
145
+ sample_coding_genes
146
+ .as_mut_slice()
147
+ .sort_by(|a, b| (b.fold_change).partial_cmp(&a.fold_change).unwrap_or(Ordering::Equal));
148
+
149
+ // Assign ranks to each gene
150
+ for i in 0..sample_coding_genes.len() {
151
+ sample_coding_genes[i].rank = Some(i)
152
+ }
153
+
154
+ //println!("sample_genes:{:?}", sample_genes);
155
+ //println!("background_genes:{:?}", background_genes);
156
+
157
+ let msigdbconn = Connection::open(msigdb)?;
158
+ let stmt_result = msigdbconn
159
+ .prepare(&("select id from terms where parent_id='".to_owned() + &genesetgroup + "'"));
160
+ match stmt_result {
161
+ Ok(mut stmt) => {
162
+ #[allow(non_snake_case)]
163
+ let GO_iter = stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
164
+ #[allow(non_snake_case)]
165
+ for GO_term in GO_iter {
166
+ match GO_term {
167
+ Ok(n) => {
168
+ //println!("GO term {:?}", n);
169
+ let sql_statement =
170
+ "select genes from term2genes where id='".to_owned() + &n.GO_id + &"'";
171
+ //println!("sql_statement:{}", sql_statement);
172
+ let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
173
+ //println!("gene_stmt:{:?}", gene_stmt);
174
+
175
+ let mut rows = gene_stmt.query([])?;
176
+ let mut names = HashSet::<String>::new();
177
+ while let Some(row) = rows.next()? {
178
+ let a: String = row.get(0)?;
179
+ let input_gene_json = json::parse(&a);
180
+ match input_gene_json {
181
+ Ok(json_genes) => {
182
+ for json_iter in 0..json_genes.len() {
183
+ names.insert(json_genes[json_iter]["symbol"].to_string());
184
+ }
185
+ }
186
+ Err(_) => {
187
+ panic!("Symbol, ensg, enstCanonical structure is missing!")
188
+ }
189
+ }
190
+ }
191
+ let gene_set_size = names.len();
192
+ let (p_value, auc, es, matches, gene_set_hits) =
193
+ cerno(&sample_coding_genes, names);
194
+
195
+ if matches >= 1.0
196
+ && p_value.is_nan() == false
197
+ && es.is_nan() == false
198
+ && es != f64::INFINITY
199
+ && auc != f64::INFINITY
200
+ && auc.is_nan() == false
201
+ {
202
+ pathway_p_values.push(pathway_p_value {
203
+ pathway_name: n.GO_id,
204
+ p_value_original: p_value,
205
+ p_value_adjusted: None,
206
+ auc: auc,
207
+ es: es,
208
+ gene_set_hits: gene_set_hits,
209
+ gene_set_size: gene_set_size,
210
+ })
211
+ }
212
+ }
213
+ Err(_) => {
214
+ println!("GO term not found!")
215
+ }
216
+ }
217
+ }
218
+ }
219
+ Err(_) => panic!("sqlite database file not found"),
220
+ }
221
+ let output_string = adjust_p_values(pathway_p_values);
222
+ println!("{}", output_string);
223
+ }
224
+ Err(error) => println!("Incorrect json:{}", error),
225
+ }
226
+ }
227
+ Err(error) => println!("Piping error: {}", error),
228
+ }
229
+ Ok(())
230
+ }
231
+
232
+ fn cerno(sample_coding_genes: &Vec<gene_order>, genes_in_pathway: HashSet<String>) -> (f64, f64, f64, f64, String) {
233
+ // Filter the sample_coding_genes vector to only include those whose gene_names are in the HashSet genes_in_pathway
234
+ let gene_intersections: Vec<&gene_order> = sample_coding_genes
235
+ .iter()
236
+ .filter(|sample_coding_genes| genes_in_pathway.contains(&sample_coding_genes.gene_name)) // Check if name is in the HashSet genes_in_pathway
237
+ .collect(); // Collect the results into a new vector
238
+
239
+ let N1 = gene_intersections.len() as f64;
240
+ let N = sample_coding_genes.len() as f64;
241
+ let mut gene_set_hits: String = "".to_string();
242
+ for gene in &gene_intersections {
243
+ gene_set_hits += &(gene.gene_name.to_string() + &",");
244
+ }
245
+ if gene_intersections.len() > 0 {
246
+ // Remove the last "," in string
247
+ gene_set_hits.pop();
248
+ }
249
+
250
+ let ranks: Vec<usize> = gene_intersections // x <- l %in% mset$gs2gv[[m]] ; ranks <- c(1:N)[x]
251
+ .iter()
252
+ .map(|x| x.rank.unwrap())
253
+ .collect::<Vec<usize>>();
254
+
255
+ let cerno: f64 = ranks // -2 * sum( log(ranks/N) )
256
+ .iter()
257
+ .map(|x| ((*x as f64) / N).ln())
258
+ .collect::<Vec<f64>>()
259
+ .iter()
260
+ .sum::<f64>()
261
+ * (-2.0);
262
+
263
+ let cES: f64 = cerno / (2.0 * (N1 as f64)); // cES <- cerno/(2*N1)
264
+ let N2 = N - N1; // N2 = N - N1
265
+ let R1 = ranks.iter().sum::<usize>() as f64; // R1 <- sum(ranks)
266
+ let U = N1 * N2 + N1 * (N1 + 1.0) / 2.0 - R1; // U <- N1*N2+N1*(N1+1)/2-R1
267
+ let AUC = U / (N1 * N2); // AUC <- U/(N1*N2)
268
+ let p_value = chi_squared_cdf(cerno, 2.0 * N1, false, false); // pchisq(ret$cerno, 2*N1, lower.tail=FALSE)
269
+ (p_value, AUC, cES, N1, gene_set_hits)
270
+ }
271
+
272
+ fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>) -> String {
273
+ // Sorting p-values in ascending order
274
+ original_p_values.as_mut_slice().sort_by(|a, b| {
275
+ (a.p_value_original)
276
+ .partial_cmp(&b.p_value_original)
277
+ .unwrap_or(Ordering::Equal)
278
+ });
279
+
280
+ let mut adjusted_p_values: Vec<pathway_p_value> = Vec::with_capacity(original_p_values.len());
281
+ let mut old_p_value: f64 = 0.0;
282
+ let mut rank: f64 = original_p_values.len() as f64;
283
+ for j in 0..original_p_values.len() {
284
+ let i = original_p_values.len() - j - 1;
285
+
286
+ //println!("p_val:{}", p_val);
287
+ let mut adjusted_p_val: f64 = original_p_values[i].p_value_original * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
288
+ if adjusted_p_val > 1.0 {
289
+ // p_value should NEVER be greater than 1
290
+ adjusted_p_val = 1.0;
291
+ }
292
+ //println!("Original p_value:{}", original_p_values[i].p_value);
293
+ //println!("Raw adjusted p_value:{}", adjusted_p_value);
294
+ if i != original_p_values.len() - 1 {
295
+ if adjusted_p_val > old_p_value {
296
+ adjusted_p_val = old_p_value;
297
+ }
298
+ }
299
+ old_p_value = adjusted_p_val;
300
+ //println!("adjusted_p_value:{}", adjusted_p_val);
301
+ rank -= 1.0;
302
+
303
+ adjusted_p_values.push(pathway_p_value {
304
+ pathway_name: original_p_values[i].pathway_name.clone(),
305
+ p_value_original: original_p_values[i].p_value_original,
306
+ p_value_adjusted: Some(adjusted_p_val),
307
+ auc: original_p_values[i].auc,
308
+ es: original_p_values[i].es,
309
+ gene_set_hits: original_p_values[i].gene_set_hits.clone(),
310
+ gene_set_size: original_p_values[i].gene_set_size,
311
+ });
312
+ }
313
+ adjusted_p_values.as_mut_slice().sort_by(|a, b| {
314
+ (a.p_value_adjusted.unwrap())
315
+ .partial_cmp(&b.p_value_adjusted.unwrap())
316
+ .unwrap_or(Ordering::Equal)
317
+ });
318
+
319
+ let mut output_string = "{".to_string();
320
+ for i in 0..adjusted_p_values.len() {
321
+ let item = output_struct {
322
+ pval: adjusted_p_values[i].p_value_original,
323
+ fdr: adjusted_p_values[i].p_value_adjusted.unwrap(),
324
+ leading_edge: adjusted_p_values[i].gene_set_hits.clone(),
325
+ geneset_size: adjusted_p_values[i].gene_set_size,
326
+ es: adjusted_p_values[i].es,
327
+ auc: adjusted_p_values[i].auc,
328
+ };
329
+ output_string += &format!(
330
+ "\"{}\":{}",
331
+ adjusted_p_values[i].pathway_name.clone(),
332
+ serde_json::to_string(&item).unwrap()
333
+ );
334
+ if i < adjusted_p_values.len() - 1 {
335
+ output_string += &",".to_string();
336
+ }
337
+ }
338
+ output_string += &"}".to_string();
339
+ output_string
340
+ }