@sjcrh/proteinpaint-rust 2.49.0 → 2.57.1-1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -13,6 +13,7 @@ kodama = "0.3"
13
13
  rayon = "1.7.0"
14
14
  bgzip = "0.3.1"
15
15
  petgraph = "0.6.3"
16
+ rusqlite="0.31.0"
16
17
  ndarray = "0.15.6"
17
18
  nalgebra = {version = "0.32.2", features = ["serde-serialize"]}
18
19
  plotters = "0.3.4"
@@ -32,8 +33,6 @@ reqwest = "0.11"
32
33
  flate2 = "1"
33
34
  futures = "0.3"
34
35
 
35
-
36
-
37
36
  [profile.release]
38
37
  lto = "fat"
39
38
  codegen-units = 1
@@ -83,3 +82,7 @@ path="src/wilcoxon.rs"
83
82
  [[bin]]
84
83
  name="DEanalysis"
85
84
  path="src/DEanalysis.rs"
85
+
86
+ [[bin]]
87
+ name="genesetORA"
88
+ path="src/genesetORA.rs"
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.49.0",
2
+ "version": "2.57.1-1",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
package/src/gdcmaf.rs CHANGED
@@ -42,8 +42,8 @@ fn select_maf_col(d:String,columns:&Vec<String>) -> Vec<u8> {
42
42
  for x in header_indices.iter() {
43
43
  maf_out_lst.push(maf_cont_lst[*x].to_string());
44
44
  };
45
- maf_out_lst.push("\n".to_string());
46
45
  maf_str.push_str(maf_out_lst.join("\t").as_str());
46
+ maf_str.push_str("\n");
47
47
  }
48
48
  };
49
49
  maf_str.as_bytes().to_vec()
@@ -0,0 +1,256 @@
1
+ // The hypergeometric distribution is computed based on the implementation in https://rdrr.io/github/GuangchuangYu/DOSE/src/R/enricher_internal.R
2
+ // Syntax: cd .. && cargo build --release && cat ~/sjpp/test.txt | target/release/genesetORA
3
+ #![allow(non_snake_case)]
4
+ use json::JsonValue;
5
+ use r_mathlib;
6
+ use rusqlite::{Connection, Result};
7
+ use serde::{Deserialize, Serialize};
8
+ use serde_json;
9
+ use std::cmp::Ordering;
10
+ use std::io;
11
+ use std::time::Instant;
12
+
13
/// One gene set selected from the `terms` table.
///
/// `main` builds one value per row returned by the `select id from terms ...`
/// query; `GO_id` holds that row's `id` column and is later used both to look
/// up the member genes in `term2genes` and as the pathway name in the output.
#[derive(Debug)]
#[allow(non_camel_case_types, non_snake_case)]
struct GO_pathway {
    /// Value of the `id` column of the selected `terms` row.
    GO_id: String,
}
19
+
20
/// One member gene of a gene set, decoded from the JSON stored in the `genes`
/// column of the `term2genes` table.
///
/// Only `symbol` is read by the enrichment calculation; the two
/// underscore-prefixed fields are populated from the JSON keys `ensg` and
/// `enstCanonical` but are not read anywhere in this file.
#[derive(Debug)]
#[allow(non_camel_case_types, non_snake_case)]
struct pathway_genes {
    /// Text of the `symbol` key of the gene entry.
    symbol: String,
    /// Text of the `ensg` key of the gene entry.
    _ensg: String,
    /// Text of the `enstCanonical` key of the gene entry.
    _enstCanonical: String,
}
28
+
29
+ #[allow(non_camel_case_types)]
30
+ #[allow(non_snake_case)]
31
+ #[derive(Debug, Serialize, Deserialize)]
32
+ //#[allow(dead_code)]
33
+ struct pathway_p_value {
34
+ pathway_name: String,
35
+ p_value_original: f64,
36
+ p_value_adjusted: Option<f64>,
37
+ }
38
+
39
+ fn calculate_hypergeometric_p_value(
40
+ sample_genes: &Vec<&str>,
41
+ background_genes: &Vec<&str>,
42
+ genes_in_pathway: Vec<pathway_genes>,
43
+ ) -> f64 {
44
+ let matching_sample_genes_counts: f64 = sample_genes
45
+ .iter()
46
+ .zip(&genes_in_pathway)
47
+ .filter(|&(a, b)| *a.to_string() == b.symbol)
48
+ .count() as f64;
49
+ //println!("k-1:{}", matching_sample_genes_counts - 1.0);
50
+ //println!("M:{}", genes_in_pathway.len() as f64);
51
+ //println!(
52
+ // "N-M:{}",
53
+ // background_genes.len() as f64 - genes_in_pathway.len() as f64
54
+ //);
55
+ //println!("n:{}", sample_genes.len() as f64);
56
+ let p_value = r_mathlib::hypergeometric_cdf(
57
+ matching_sample_genes_counts - 1.0,
58
+ genes_in_pathway.len() as f64,
59
+ background_genes.len() as f64 - genes_in_pathway.len() as f64,
60
+ sample_genes.len() as f64,
61
+ false,
62
+ false,
63
+ );
64
+ //println!("p_value:{}", p_value);
65
+ p_value
66
+ }
67
+
68
+ fn main() -> Result<()> {
69
+ let mut input = String::new();
70
+ match io::stdin().read_line(&mut input) {
71
+ // Accepting the piped input from nodejs (or command line from testing)
72
+ Ok(_n) => {
73
+ let input_json = json::parse(&input);
74
+ match input_json {
75
+ Ok(json_string) => {
76
+ let run_time = Instant::now();
77
+ let db_input: &JsonValue = &json_string["db"];
78
+ let db;
79
+ match db_input.as_str() {
80
+ Some(db_string) => db = db_string.to_string(),
81
+ None => panic!("db file path is missing"),
82
+ }
83
+ let genesetgroup;
84
+ let genesetgroup_input: &JsonValue = &json_string["gene_set_group"];
85
+ match genesetgroup_input.as_str() {
86
+ Some(genesetgroup_string) => genesetgroup = genesetgroup_string.to_string(),
87
+ None => panic!("genesetgroup is missing"),
88
+ }
89
+ let sample_genes_input: &JsonValue = &json_string["sample_genes"];
90
+ let sample_genes: Vec<&str> =
91
+ sample_genes_input.as_str().unwrap().split(",").collect();
92
+ let mut pathway_p_values: Vec<pathway_p_value> = Vec::with_capacity(10000);
93
+ let background_genes_input: &JsonValue = &json_string["background_genes"];
94
+ let background_genes: Vec<&str> = background_genes_input
95
+ .as_str()
96
+ .unwrap()
97
+ .split(",")
98
+ .collect();
99
+ //println!("sample_genes:{:?}", sample_genes);
100
+ //println!("background_genes:{:?}", background_genes);
101
+
102
+ if sample_genes.len() == 0 {
103
+ panic!("No sample genes provided");
104
+ } else if background_genes.len() == 0 {
105
+ panic!("No background genes provided");
106
+ }
107
+ let num_items_output = 100; // Number of top pathways to be specified in the output
108
+
109
+ let conn = Connection::open(db)?;
110
+ let stmt_result = conn.prepare(
111
+ &("select id from terms where parent_id='".to_owned()
112
+ + &genesetgroup
113
+ + "'"),
114
+ );
115
+ match stmt_result {
116
+ Ok(mut stmt) => {
117
+ #[allow(non_snake_case)]
118
+ let GO_iter =
119
+ stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
120
+ let mut iter = 0;
121
+ #[allow(non_snake_case)]
122
+ for GO_term in GO_iter {
123
+ iter += 1;
124
+ match GO_term {
125
+ Ok(n) => {
126
+ //println!("GO term {:?}", n);
127
+ let sql_statement =
128
+ "select genes from term2genes where id='".to_owned()
129
+ + &n.GO_id
130
+ + &"'";
131
+ //println!("sql_statement:{}", sql_statement);
132
+ let mut gene_stmt = conn.prepare(&(sql_statement))?;
133
+ //println!("gene_stmt:{:?}", gene_stmt);
134
+
135
+ let mut rows = gene_stmt.query([])?;
136
+ let mut names = Vec::<pathway_genes>::new();
137
+ while let Some(row) = rows.next()? {
138
+ let a: String = row.get(0)?;
139
+ let input_gene_json = json::parse(&a);
140
+ match input_gene_json {
141
+ Ok(json_genes) => {
142
+ for json_iter in 0..json_genes.len() {
143
+ let item = pathway_genes {
144
+ symbol: json_genes[json_iter]["symbol"]
145
+ .to_string(),
146
+ _ensg: json_genes[json_iter]["ensg"]
147
+ .to_string(),
148
+ _enstCanonical: json_genes[json_iter]
149
+ ["enstCanonical"]
150
+ .to_string(),
151
+ };
152
+ //println!("item:{:?}", item);
153
+ names.push(item);
154
+ }
155
+ }
156
+ Err(_) => {
157
+ panic!(
158
+ "Symbol, ensg, enstCanonical structure is missing!"
159
+ )
160
+ }
161
+ }
162
+ }
163
+ let p_value = calculate_hypergeometric_p_value(
164
+ &sample_genes,
165
+ &background_genes,
166
+ names,
167
+ );
168
+ if p_value.is_nan() == false {
169
+ pathway_p_values.push(pathway_p_value {
170
+ pathway_name: n.GO_id,
171
+ p_value_original: p_value,
172
+ p_value_adjusted: None,
173
+ })
174
+ }
175
+ }
176
+ Err(_) => {
177
+ println!("GO term not found!")
178
+ }
179
+ }
180
+ }
181
+ println!("Number of pathway entries:{}", iter);
182
+ }
183
+ Err(_) => panic!("sqlite database file not found"),
184
+ }
185
+ println!(
186
+ "pathway_p_values:{}",
187
+ adjust_p_values(pathway_p_values, num_items_output)
188
+ );
189
+ println!(
190
+ "Time for calculating gene overrepresentation:{:?}",
191
+ run_time.elapsed()
192
+ );
193
+ }
194
+ Err(error) => println!("Incorrect json:{}", error),
195
+ }
196
+ }
197
+ Err(error) => println!("Piping error: {}", error),
198
+ }
199
+ Ok(())
200
+ }
201
+
202
+ fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>, num_items_output: usize) -> String {
203
+ // Sorting p-values in ascending order
204
+ original_p_values.as_mut_slice().sort_by(|a, b| {
205
+ (a.p_value_original)
206
+ .partial_cmp(&b.p_value_original)
207
+ .unwrap_or(Ordering::Equal)
208
+ });
209
+
210
+ let mut adjusted_p_values: Vec<pathway_p_value> = Vec::with_capacity(original_p_values.len());
211
+ let mut old_p_value: f64 = 0.0;
212
+ let mut rank: f64 = original_p_values.len() as f64;
213
+ for j in 0..original_p_values.len() {
214
+ let i = original_p_values.len() - j - 1;
215
+
216
+ //println!("p_val:{}", p_val);
217
+ let mut adjusted_p_val: f64 =
218
+ original_p_values[i].p_value_original * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
219
+ if adjusted_p_val > 1.0 {
220
+ // p_value should NEVER be greater than 1
221
+ adjusted_p_val = 1.0;
222
+ }
223
+ //println!("Original p_value:{}", original_p_values[i].p_value);
224
+ //println!("Raw adjusted p_value:{}", adjusted_p_value);
225
+ if i != original_p_values.len() - 1 {
226
+ if adjusted_p_val > old_p_value {
227
+ adjusted_p_val = old_p_value;
228
+ }
229
+ }
230
+ old_p_value = adjusted_p_val;
231
+ //println!("adjusted_p_value:{}", adjusted_p_val);
232
+ rank -= 1.0;
233
+
234
+ adjusted_p_values.push(pathway_p_value {
235
+ pathway_name: original_p_values[i].pathway_name.clone(),
236
+ p_value_original: original_p_values[i].p_value_original,
237
+ p_value_adjusted: Some(adjusted_p_val),
238
+ });
239
+ }
240
+ adjusted_p_values.as_mut_slice().sort_by(|a, b| {
241
+ (a.p_value_adjusted.unwrap())
242
+ .partial_cmp(&b.p_value_adjusted.unwrap())
243
+ .unwrap_or(Ordering::Equal)
244
+ });
245
+
246
+ let mut output_string = "[".to_string();
247
+ for i in 0..num_items_output {
248
+ let j = adjusted_p_values.len() - i - 1;
249
+ output_string += &serde_json::to_string(&adjusted_p_values[j]).unwrap();
250
+ if i < num_items_output - 1 {
251
+ output_string += &",".to_string();
252
+ }
253
+ }
254
+ output_string += &"]".to_string();
255
+ output_string
256
+ }