@sjcrh/proteinpaint-rust 2.73.0 → 2.75.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +2 -2
  2. package/src/genesetORA.rs +76 -35
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.73.0",
2
+ "version": "2.75.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
@@ -38,5 +38,5 @@
38
38
  "devDependencies": {
39
39
  "tape": "^5.2.2"
40
40
  },
41
- "pp_release_tag": "v2.73.0"
41
+ "pp_release_tag": "v2.75.0"
42
42
  }
package/src/genesetORA.rs CHANGED
@@ -34,35 +34,49 @@ struct pathway_p_value {
34
34
  pathway_name: String,
35
35
  p_value_original: f64,
36
36
  p_value_adjusted: Option<f64>,
37
+ gene_set_hits: String,
38
+ gene_set_size: usize,
37
39
  }
38
40
 
39
41
  fn calculate_hypergeometric_p_value(
40
42
  sample_genes: &Vec<&str>,
41
- background_genes: &Vec<&str>,
43
+ num_background_genes: usize,
42
44
  genes_in_pathway: Vec<pathway_genes>,
43
- ) -> f64 {
44
- let matching_sample_genes_counts: f64 = sample_genes
45
- .iter()
46
- .zip(&genes_in_pathway)
47
- .filter(|&(a, b)| *a.to_string() == b.symbol)
48
- .count() as f64;
45
+ ) -> (f64, f64, String) {
46
+ let mut matching_sample_genes_counts = 0.0;
47
+ let mut gene_set_hits: String = "".to_string();
48
+ for gene in sample_genes {
49
+ for pathway in &genes_in_pathway {
50
+ if pathway.symbol == gene.to_string() {
51
+ matching_sample_genes_counts += 1.0;
52
+ gene_set_hits += &(gene.to_string() + &",");
53
+ }
54
+ }
55
+ }
56
+
57
+ if matching_sample_genes_counts > 0.0 {
58
+ gene_set_hits.pop();
59
+ }
60
+
61
+ //println!("sample_genes:{:?}", sample_genes);
62
+ //println!("genes_in_pathway:{:?}", genes_in_pathway);
49
63
  //println!("k-1:{}", matching_sample_genes_counts - 1.0);
50
64
  //println!("M:{}", genes_in_pathway.len() as f64);
51
65
  //println!(
52
66
  // "N-M:{}",
53
- // background_genes.len() as f64 - genes_in_pathway.len() as f64
67
+ // num_background_genes as f64 - genes_in_pathway.len() as f64
54
68
  //);
55
69
  //println!("n:{}", sample_genes.len() as f64);
56
70
  let p_value = r_mathlib::hypergeometric_cdf(
57
71
  matching_sample_genes_counts - 1.0,
58
72
  genes_in_pathway.len() as f64,
59
- background_genes.len() as f64 - genes_in_pathway.len() as f64,
73
+ num_background_genes as f64 - genes_in_pathway.len() as f64,
60
74
  sample_genes.len() as f64,
61
75
  false,
62
76
  false,
63
77
  );
64
78
  //println!("p_value:{}", p_value);
65
- p_value
79
+ (p_value, matching_sample_genes_counts, gene_set_hits)
66
80
  }
67
81
 
68
82
  fn main() -> Result<()> {
@@ -74,11 +88,11 @@ fn main() -> Result<()> {
74
88
  match input_json {
75
89
  Ok(json_string) => {
76
90
  let run_time = Instant::now();
77
- let db_input: &JsonValue = &json_string["db"];
78
- let db;
79
- match db_input.as_str() {
80
- Some(db_string) => db = db_string.to_string(),
81
- None => panic!("db file path is missing"),
91
+ let msigdb_input: &JsonValue = &json_string["msigdb"];
92
+ let msigdb;
93
+ match msigdb_input.as_str() {
94
+ Some(db_string) => msigdb = db_string.to_string(),
95
+ None => panic!("msigdb file path is missing"),
82
96
  }
83
97
  let genesetgroup;
84
98
  let genesetgroup_input: &JsonValue = &json_string["gene_set_group"];
@@ -91,28 +105,51 @@ fn main() -> Result<()> {
91
105
  sample_genes_input.as_str().unwrap().split(",").collect();
92
106
  let mut pathway_p_values: Vec<pathway_p_value> = Vec::with_capacity(10000);
93
107
  let background_genes_input: &JsonValue = &json_string["background_genes"];
94
- let background_genes: Vec<&str> = background_genes_input
95
- .as_str()
96
- .unwrap()
97
- .split(",")
98
- .collect();
108
+ let mut num_background_genes: usize = 0;
109
+ match background_genes_input.as_str() {
110
+ Some(x) => {
111
+ let background_genes_str: Vec<&str> = x.split(",").collect(); // Background genes are provided explicitly, e.g. in the case of DE analysis
112
+ num_background_genes = background_genes_str.len();
113
+ }
114
+ None => {
115
+ // Background genes are not provided, e.g. in hierarchical clustering
116
+ // Get background genes from the gene database
117
+ let genedb_input: &JsonValue = &json_string["genedb"];
118
+ let genedb;
119
+ match genedb_input.as_str() {
120
+ Some(gene_db_string) => genedb = gene_db_string.to_string(),
121
+ None => panic!("genedb file path is missing"),
122
+ }
123
+
124
+ let genedbconn = Connection::open(genedb)?;
125
+ let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
126
+ match genedb_result {
127
+ Ok(mut x) => {
128
+ let mut genes = x.query([])?;
129
+ while let Some(_gene) = genes.next()? {
130
+ num_background_genes += 1;
131
+ }
132
+ }
133
+ Err(_) => {}
134
+ }
135
+ }
136
+ }
99
137
  //println!("sample_genes:{:?}", sample_genes);
100
138
  //println!("background_genes:{:?}", background_genes);
101
139
 
102
140
  if sample_genes.len() == 0 {
103
141
  panic!("No sample genes provided");
104
- } else if background_genes.len() == 0 {
142
+ } else if num_background_genes == 0 {
105
143
  panic!("No background genes provided");
106
144
  }
107
145
  let num_items_output = 100; // Number of top pathways to be specified in the output
108
146
 
109
- let conn = Connection::open(db)?;
110
- let stmt_result = conn.prepare(
147
+ let msigdbconn = Connection::open(msigdb)?;
148
+ let stmt_result = msigdbconn.prepare(
111
149
  &("select id from terms where parent_id='".to_owned()
112
150
  + &genesetgroup
113
151
  + "'"),
114
152
  );
115
- let mut iter = 0;
116
153
  match stmt_result {
117
154
  Ok(mut stmt) => {
118
155
  #[allow(non_snake_case)]
@@ -120,7 +157,6 @@ fn main() -> Result<()> {
120
157
  stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
121
158
  #[allow(non_snake_case)]
122
159
  for GO_term in GO_iter {
123
- iter += 1;
124
160
  match GO_term {
125
161
  Ok(n) => {
126
162
  //println!("GO term {:?}", n);
@@ -129,7 +165,7 @@ fn main() -> Result<()> {
129
165
  + &n.GO_id
130
166
  + &"'";
131
167
  //println!("sql_statement:{}", sql_statement);
132
- let mut gene_stmt = conn.prepare(&(sql_statement))?;
168
+ let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
133
169
  //println!("gene_stmt:{:?}", gene_stmt);
134
170
 
135
171
  let mut rows = gene_stmt.query([])?;
@@ -160,16 +196,20 @@ fn main() -> Result<()> {
160
196
  }
161
197
  }
162
198
  }
163
- let p_value = calculate_hypergeometric_p_value(
164
- &sample_genes,
165
- &background_genes,
166
- names,
167
- );
168
- if p_value.is_nan() == false {
199
+ let gene_set_size = names.len();
200
+ let (p_value, matches, gene_set_hits) =
201
+ calculate_hypergeometric_p_value(
202
+ &sample_genes,
203
+ num_background_genes,
204
+ names,
205
+ );
206
+ if matches >= 1.0 && p_value.is_nan() == false {
169
207
  pathway_p_values.push(pathway_p_value {
170
208
  pathway_name: n.GO_id,
171
209
  p_value_original: p_value,
172
210
  p_value_adjusted: None,
211
+ gene_set_hits: gene_set_hits,
212
+ gene_set_size: gene_set_size,
173
213
  })
174
214
  }
175
215
  }
@@ -182,7 +222,7 @@ fn main() -> Result<()> {
182
222
  Err(_) => panic!("sqlite database file not found"),
183
223
  }
184
224
  let output_string = "{\"num_pathways\":".to_string()
185
- + &iter.to_string()
225
+ + &pathway_p_values.len().to_string()
186
226
  + &",\"pathways\":"
187
227
  + &adjust_p_values(pathway_p_values, num_items_output)
188
228
  + &"}";
@@ -239,6 +279,8 @@ fn adjust_p_values(
239
279
  pathway_name: original_p_values[i].pathway_name.clone(),
240
280
  p_value_original: original_p_values[i].p_value_original,
241
281
  p_value_adjusted: Some(adjusted_p_val),
282
+ gene_set_hits: original_p_values[i].gene_set_hits.clone(),
283
+ gene_set_size: original_p_values[i].gene_set_size,
242
284
  });
243
285
  }
244
286
  adjusted_p_values.as_mut_slice().sort_by(|a, b| {
@@ -253,8 +295,7 @@ fn adjust_p_values(
253
295
 
254
296
  let mut output_string = "[".to_string();
255
297
  for i in 0..num_items_output {
256
- let j = adjusted_p_values.len() - i - 1;
257
- output_string += &serde_json::to_string(&adjusted_p_values[j]).unwrap();
298
+ output_string += &serde_json::to_string(&adjusted_p_values[i]).unwrap();
258
299
  if i < num_items_output - 1 {
259
300
  output_string += &",".to_string();
260
301
  }