@sjcrh/proteinpaint-rust 2.73.0 → 2.75.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/genesetORA.rs +76 -35
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.73.0",
|
|
2
|
+
"version": "2.75.0",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"description": "Rust-based utilities for proteinpaint",
|
|
5
5
|
"main": "index.js",
|
|
@@ -38,5 +38,5 @@
|
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"tape": "^5.2.2"
|
|
40
40
|
},
|
|
41
|
-
"pp_release_tag": "v2.
|
|
41
|
+
"pp_release_tag": "v2.75.0"
|
|
42
42
|
}
|
package/src/genesetORA.rs
CHANGED
|
@@ -34,35 +34,49 @@ struct pathway_p_value {
|
|
|
34
34
|
pathway_name: String,
|
|
35
35
|
p_value_original: f64,
|
|
36
36
|
p_value_adjusted: Option<f64>,
|
|
37
|
+
gene_set_hits: String,
|
|
38
|
+
gene_set_size: usize,
|
|
37
39
|
}
|
|
38
40
|
|
|
39
41
|
fn calculate_hypergeometric_p_value(
|
|
40
42
|
sample_genes: &Vec<&str>,
|
|
41
|
-
|
|
43
|
+
num_background_genes: usize,
|
|
42
44
|
genes_in_pathway: Vec<pathway_genes>,
|
|
43
|
-
) -> f64 {
|
|
44
|
-
let matching_sample_genes_counts
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
45
|
+
) -> (f64, f64, String) {
|
|
46
|
+
let mut matching_sample_genes_counts = 0.0;
|
|
47
|
+
let mut gene_set_hits: String = "".to_string();
|
|
48
|
+
for gene in sample_genes {
|
|
49
|
+
for pathway in &genes_in_pathway {
|
|
50
|
+
if pathway.symbol == gene.to_string() {
|
|
51
|
+
matching_sample_genes_counts += 1.0;
|
|
52
|
+
gene_set_hits += &(gene.to_string() + &",");
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
if matching_sample_genes_counts > 0.0 {
|
|
58
|
+
gene_set_hits.pop();
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
//println!("sample_genes:{:?}", sample_genes);
|
|
62
|
+
//println!("genes_in_pathway:{:?}", genes_in_pathway);
|
|
49
63
|
//println!("k-1:{}", matching_sample_genes_counts - 1.0);
|
|
50
64
|
//println!("M:{}", genes_in_pathway.len() as f64);
|
|
51
65
|
//println!(
|
|
52
66
|
// "N-M:{}",
|
|
53
|
-
//
|
|
67
|
+
// num_background_genes as f64 - genes_in_pathway.len() as f64
|
|
54
68
|
//);
|
|
55
69
|
//println!("n:{}", sample_genes.len() as f64);
|
|
56
70
|
let p_value = r_mathlib::hypergeometric_cdf(
|
|
57
71
|
matching_sample_genes_counts - 1.0,
|
|
58
72
|
genes_in_pathway.len() as f64,
|
|
59
|
-
|
|
73
|
+
num_background_genes as f64 - genes_in_pathway.len() as f64,
|
|
60
74
|
sample_genes.len() as f64,
|
|
61
75
|
false,
|
|
62
76
|
false,
|
|
63
77
|
);
|
|
64
78
|
//println!("p_value:{}", p_value);
|
|
65
|
-
p_value
|
|
79
|
+
(p_value, matching_sample_genes_counts, gene_set_hits)
|
|
66
80
|
}
|
|
67
81
|
|
|
68
82
|
fn main() -> Result<()> {
|
|
@@ -74,11 +88,11 @@ fn main() -> Result<()> {
|
|
|
74
88
|
match input_json {
|
|
75
89
|
Ok(json_string) => {
|
|
76
90
|
let run_time = Instant::now();
|
|
77
|
-
let
|
|
78
|
-
let
|
|
79
|
-
match
|
|
80
|
-
Some(db_string) =>
|
|
81
|
-
None => panic!("
|
|
91
|
+
let msigdb_input: &JsonValue = &json_string["msigdb"];
|
|
92
|
+
let msigdb;
|
|
93
|
+
match msigdb_input.as_str() {
|
|
94
|
+
Some(db_string) => msigdb = db_string.to_string(),
|
|
95
|
+
None => panic!("msigdb file path is missing"),
|
|
82
96
|
}
|
|
83
97
|
let genesetgroup;
|
|
84
98
|
let genesetgroup_input: &JsonValue = &json_string["gene_set_group"];
|
|
@@ -91,28 +105,51 @@ fn main() -> Result<()> {
|
|
|
91
105
|
sample_genes_input.as_str().unwrap().split(",").collect();
|
|
92
106
|
let mut pathway_p_values: Vec<pathway_p_value> = Vec::with_capacity(10000);
|
|
93
107
|
let background_genes_input: &JsonValue = &json_string["background_genes"];
|
|
94
|
-
let
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
108
|
+
let mut num_background_genes: usize = 0;
|
|
109
|
+
match background_genes_input.as_str() {
|
|
110
|
+
Some(x) => {
|
|
111
|
+
let background_genes_str: Vec<&str> = x.split(",").collect(); // Background genes is defined for e.g in case of DE analysis
|
|
112
|
+
num_background_genes = background_genes_str.len();
|
|
113
|
+
}
|
|
114
|
+
None => {
|
|
115
|
+
// Background genes not present for e.g. in hierarchial clustering
|
|
116
|
+
// Get background genes from the gene database
|
|
117
|
+
let genedb_input: &JsonValue = &json_string["genedb"];
|
|
118
|
+
let genedb;
|
|
119
|
+
match genedb_input.as_str() {
|
|
120
|
+
Some(gene_db_string) => genedb = gene_db_string.to_string(),
|
|
121
|
+
None => panic!("genedb file path is missing"),
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
let genedbconn = Connection::open(genedb)?;
|
|
125
|
+
let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
|
|
126
|
+
match genedb_result {
|
|
127
|
+
Ok(mut x) => {
|
|
128
|
+
let mut genes = x.query([])?;
|
|
129
|
+
while let Some(_gene) = genes.next()? {
|
|
130
|
+
num_background_genes += 1;
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
Err(_) => {}
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
}
|
|
99
137
|
//println!("sample_genes:{:?}", sample_genes);
|
|
100
138
|
//println!("background_genes:{:?}", background_genes);
|
|
101
139
|
|
|
102
140
|
if sample_genes.len() == 0 {
|
|
103
141
|
panic!("No sample genes provided");
|
|
104
|
-
} else if
|
|
142
|
+
} else if num_background_genes == 0 {
|
|
105
143
|
panic!("No background genes provided");
|
|
106
144
|
}
|
|
107
145
|
let num_items_output = 100; // Number of top pathways to be specified in the output
|
|
108
146
|
|
|
109
|
-
let
|
|
110
|
-
let stmt_result =
|
|
147
|
+
let msigdbconn = Connection::open(msigdb)?;
|
|
148
|
+
let stmt_result = msigdbconn.prepare(
|
|
111
149
|
&("select id from terms where parent_id='".to_owned()
|
|
112
150
|
+ &genesetgroup
|
|
113
151
|
+ "'"),
|
|
114
152
|
);
|
|
115
|
-
let mut iter = 0;
|
|
116
153
|
match stmt_result {
|
|
117
154
|
Ok(mut stmt) => {
|
|
118
155
|
#[allow(non_snake_case)]
|
|
@@ -120,7 +157,6 @@ fn main() -> Result<()> {
|
|
|
120
157
|
stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
|
|
121
158
|
#[allow(non_snake_case)]
|
|
122
159
|
for GO_term in GO_iter {
|
|
123
|
-
iter += 1;
|
|
124
160
|
match GO_term {
|
|
125
161
|
Ok(n) => {
|
|
126
162
|
//println!("GO term {:?}", n);
|
|
@@ -129,7 +165,7 @@ fn main() -> Result<()> {
|
|
|
129
165
|
+ &n.GO_id
|
|
130
166
|
+ &"'";
|
|
131
167
|
//println!("sql_statement:{}", sql_statement);
|
|
132
|
-
let mut gene_stmt =
|
|
168
|
+
let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
|
|
133
169
|
//println!("gene_stmt:{:?}", gene_stmt);
|
|
134
170
|
|
|
135
171
|
let mut rows = gene_stmt.query([])?;
|
|
@@ -160,16 +196,20 @@ fn main() -> Result<()> {
|
|
|
160
196
|
}
|
|
161
197
|
}
|
|
162
198
|
}
|
|
163
|
-
let
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
199
|
+
let gene_set_size = names.len();
|
|
200
|
+
let (p_value, matches, gene_set_hits) =
|
|
201
|
+
calculate_hypergeometric_p_value(
|
|
202
|
+
&sample_genes,
|
|
203
|
+
num_background_genes,
|
|
204
|
+
names,
|
|
205
|
+
);
|
|
206
|
+
if matches >= 1.0 && p_value.is_nan() == false {
|
|
169
207
|
pathway_p_values.push(pathway_p_value {
|
|
170
208
|
pathway_name: n.GO_id,
|
|
171
209
|
p_value_original: p_value,
|
|
172
210
|
p_value_adjusted: None,
|
|
211
|
+
gene_set_hits: gene_set_hits,
|
|
212
|
+
gene_set_size: gene_set_size,
|
|
173
213
|
})
|
|
174
214
|
}
|
|
175
215
|
}
|
|
@@ -182,7 +222,7 @@ fn main() -> Result<()> {
|
|
|
182
222
|
Err(_) => panic!("sqlite database file not found"),
|
|
183
223
|
}
|
|
184
224
|
let output_string = "{\"num_pathways\":".to_string()
|
|
185
|
-
+ &
|
|
225
|
+
+ &pathway_p_values.len().to_string()
|
|
186
226
|
+ &",\"pathways\":"
|
|
187
227
|
+ &adjust_p_values(pathway_p_values, num_items_output)
|
|
188
228
|
+ &"}";
|
|
@@ -239,6 +279,8 @@ fn adjust_p_values(
|
|
|
239
279
|
pathway_name: original_p_values[i].pathway_name.clone(),
|
|
240
280
|
p_value_original: original_p_values[i].p_value_original,
|
|
241
281
|
p_value_adjusted: Some(adjusted_p_val),
|
|
282
|
+
gene_set_hits: original_p_values[i].gene_set_hits.clone(),
|
|
283
|
+
gene_set_size: original_p_values[i].gene_set_size,
|
|
242
284
|
});
|
|
243
285
|
}
|
|
244
286
|
adjusted_p_values.as_mut_slice().sort_by(|a, b| {
|
|
@@ -253,8 +295,7 @@ fn adjust_p_values(
|
|
|
253
295
|
|
|
254
296
|
let mut output_string = "[".to_string();
|
|
255
297
|
for i in 0..num_items_output {
|
|
256
|
-
|
|
257
|
-
output_string += &serde_json::to_string(&adjusted_p_values[j]).unwrap();
|
|
298
|
+
output_string += &serde_json::to_string(&adjusted_p_values[i]).unwrap();
|
|
258
299
|
if i < num_items_output - 1 {
|
|
259
300
|
output_string += &",".to_string();
|
|
260
301
|
}
|