@sjcrh/proteinpaint-rust 2.74.0 → 2.78.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +6 -1
- package/package.json +2 -2
- package/src/genesetORA.rs +36 -19
- package/src/readHDF5.rs +227 -0
package/Cargo.toml
CHANGED
|
@@ -14,7 +14,8 @@ rayon = "1.7.0"
|
|
|
14
14
|
bgzip = "0.3.1"
|
|
15
15
|
petgraph = "0.6.3"
|
|
16
16
|
rusqlite="0.31.0"
|
|
17
|
-
ndarray = "0.
|
|
17
|
+
ndarray = "0.16.1"
|
|
18
|
+
hdf5 = { package = "hdf5-metno", version = "0.9.0" }
|
|
18
19
|
nalgebra = {version = "0.32.2", features = ["serde-serialize"]}
|
|
19
20
|
plotters = "0.3.4"
|
|
20
21
|
colorgrad = "0.6.2"
|
|
@@ -90,3 +91,7 @@ path="src/genesetORA.rs"
|
|
|
90
91
|
[[bin]]
|
|
91
92
|
name="computeTopTerms"
|
|
92
93
|
path="src/computeTopTerms.rs"
|
|
94
|
+
|
|
95
|
+
[[bin]]
|
|
96
|
+
name="readHDF5"
|
|
97
|
+
path="src/readHDF5.rs"
|
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.
|
|
2
|
+
"version": "2.78.0",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"description": "Rust-based utilities for proteinpaint",
|
|
5
5
|
"main": "index.js",
|
|
@@ -38,5 +38,5 @@
|
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"tape": "^5.2.2"
|
|
40
40
|
},
|
|
41
|
-
"pp_release_tag": "v2.
|
|
41
|
+
"pp_release_tag": "v2.78.0"
|
|
42
42
|
}
|
package/src/genesetORA.rs
CHANGED
|
@@ -34,23 +34,37 @@ struct pathway_p_value {
|
|
|
34
34
|
pathway_name: String,
|
|
35
35
|
p_value_original: f64,
|
|
36
36
|
p_value_adjusted: Option<f64>,
|
|
37
|
+
gene_set_hits: String,
|
|
38
|
+
gene_set_size: usize,
|
|
37
39
|
}
|
|
38
40
|
|
|
39
41
|
fn calculate_hypergeometric_p_value(
|
|
40
42
|
sample_genes: &Vec<&str>,
|
|
41
43
|
num_background_genes: usize,
|
|
42
44
|
genes_in_pathway: Vec<pathway_genes>,
|
|
43
|
-
) -> f64 {
|
|
44
|
-
let matching_sample_genes_counts
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
45
|
+
) -> (f64, f64, String) {
|
|
46
|
+
let mut matching_sample_genes_counts = 0.0;
|
|
47
|
+
let mut gene_set_hits: String = "".to_string();
|
|
48
|
+
for gene in sample_genes {
|
|
49
|
+
for pathway in &genes_in_pathway {
|
|
50
|
+
if pathway.symbol == gene.to_string() {
|
|
51
|
+
matching_sample_genes_counts += 1.0;
|
|
52
|
+
gene_set_hits += &(gene.to_string() + &",");
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
if matching_sample_genes_counts > 0.0 {
|
|
58
|
+
gene_set_hits.pop();
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
//println!("sample_genes:{:?}", sample_genes);
|
|
62
|
+
//println!("genes_in_pathway:{:?}", genes_in_pathway);
|
|
49
63
|
//println!("k-1:{}", matching_sample_genes_counts - 1.0);
|
|
50
64
|
//println!("M:{}", genes_in_pathway.len() as f64);
|
|
51
65
|
//println!(
|
|
52
66
|
// "N-M:{}",
|
|
53
|
-
//
|
|
67
|
+
// num_background_genes as f64 - genes_in_pathway.len() as f64
|
|
54
68
|
//);
|
|
55
69
|
//println!("n:{}", sample_genes.len() as f64);
|
|
56
70
|
let p_value = r_mathlib::hypergeometric_cdf(
|
|
@@ -62,7 +76,7 @@ fn calculate_hypergeometric_p_value(
|
|
|
62
76
|
false,
|
|
63
77
|
);
|
|
64
78
|
//println!("p_value:{}", p_value);
|
|
65
|
-
p_value
|
|
79
|
+
(p_value, matching_sample_genes_counts, gene_set_hits)
|
|
66
80
|
}
|
|
67
81
|
|
|
68
82
|
fn main() -> Result<()> {
|
|
@@ -136,7 +150,6 @@ fn main() -> Result<()> {
|
|
|
136
150
|
+ &genesetgroup
|
|
137
151
|
+ "'"),
|
|
138
152
|
);
|
|
139
|
-
let mut iter = 0;
|
|
140
153
|
match stmt_result {
|
|
141
154
|
Ok(mut stmt) => {
|
|
142
155
|
#[allow(non_snake_case)]
|
|
@@ -144,7 +157,6 @@ fn main() -> Result<()> {
|
|
|
144
157
|
stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
|
|
145
158
|
#[allow(non_snake_case)]
|
|
146
159
|
for GO_term in GO_iter {
|
|
147
|
-
iter += 1;
|
|
148
160
|
match GO_term {
|
|
149
161
|
Ok(n) => {
|
|
150
162
|
//println!("GO term {:?}", n);
|
|
@@ -184,16 +196,20 @@ fn main() -> Result<()> {
|
|
|
184
196
|
}
|
|
185
197
|
}
|
|
186
198
|
}
|
|
187
|
-
let
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
199
|
+
let gene_set_size = names.len();
|
|
200
|
+
let (p_value, matches, gene_set_hits) =
|
|
201
|
+
calculate_hypergeometric_p_value(
|
|
202
|
+
&sample_genes,
|
|
203
|
+
num_background_genes,
|
|
204
|
+
names,
|
|
205
|
+
);
|
|
206
|
+
if matches >= 1.0 && p_value.is_nan() == false {
|
|
193
207
|
pathway_p_values.push(pathway_p_value {
|
|
194
208
|
pathway_name: n.GO_id,
|
|
195
209
|
p_value_original: p_value,
|
|
196
210
|
p_value_adjusted: None,
|
|
211
|
+
gene_set_hits: gene_set_hits,
|
|
212
|
+
gene_set_size: gene_set_size,
|
|
197
213
|
})
|
|
198
214
|
}
|
|
199
215
|
}
|
|
@@ -206,7 +222,7 @@ fn main() -> Result<()> {
|
|
|
206
222
|
Err(_) => panic!("sqlite database file not found"),
|
|
207
223
|
}
|
|
208
224
|
let output_string = "{\"num_pathways\":".to_string()
|
|
209
|
-
+ &
|
|
225
|
+
+ &pathway_p_values.len().to_string()
|
|
210
226
|
+ &",\"pathways\":"
|
|
211
227
|
+ &adjust_p_values(pathway_p_values, num_items_output)
|
|
212
228
|
+ &"}";
|
|
@@ -263,6 +279,8 @@ fn adjust_p_values(
|
|
|
263
279
|
pathway_name: original_p_values[i].pathway_name.clone(),
|
|
264
280
|
p_value_original: original_p_values[i].p_value_original,
|
|
265
281
|
p_value_adjusted: Some(adjusted_p_val),
|
|
282
|
+
gene_set_hits: original_p_values[i].gene_set_hits.clone(),
|
|
283
|
+
gene_set_size: original_p_values[i].gene_set_size,
|
|
266
284
|
});
|
|
267
285
|
}
|
|
268
286
|
adjusted_p_values.as_mut_slice().sort_by(|a, b| {
|
|
@@ -277,8 +295,7 @@ fn adjust_p_values(
|
|
|
277
295
|
|
|
278
296
|
let mut output_string = "[".to_string();
|
|
279
297
|
for i in 0..num_items_output {
|
|
280
|
-
|
|
281
|
-
output_string += &serde_json::to_string(&adjusted_p_values[j]).unwrap();
|
|
298
|
+
output_string += &serde_json::to_string(&adjusted_p_values[i]).unwrap();
|
|
282
299
|
if i < num_items_output - 1 {
|
|
283
300
|
output_string += &",".to_string();
|
|
284
301
|
}
|
package/src/readHDF5.rs
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
// Need to set HDF5_DIR and LD_LIBRARY_PATH in ~/.bash_profile
|
|
2
|
+
// Syntax: HDF5_DIR=/usr/local/Homebrew/Cellar/hdf5/1.14.3_1 && echo $HDF5_DIR && cd .. && cargo build --release && json='{"gene":"TP53","hdf5_file":"matrix_with_na_comp_9.h5"}' && time echo $json | target/release/readHDF5
|
|
3
|
+
|
|
4
|
+
use hdf5::types::FixedAscii;
|
|
5
|
+
use hdf5::{File, Result};
|
|
6
|
+
use json;
|
|
7
|
+
use ndarray::Array1;
|
|
8
|
+
use ndarray::Dim;
|
|
9
|
+
use std::io;
|
|
10
|
+
use std::time::Instant;
|
|
11
|
+
|
|
12
|
+
fn read_hdf5(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
13
|
+
let file = File::open(&hdf5_filename)?; // open for reading
|
|
14
|
+
let ds_dim = file.dataset("data/dim")?; // open the dataset
|
|
15
|
+
|
|
16
|
+
// Check the data type and read the dataset accordingly
|
|
17
|
+
let data_dim: Array1<_> = ds_dim.read::<usize, Dim<[usize; 1]>>()?;
|
|
18
|
+
let num_samples = data_dim[0]; // Number of total columns in the dataset
|
|
19
|
+
let num_genes = data_dim[1];
|
|
20
|
+
println!("num_samples:{}", num_samples);
|
|
21
|
+
println!("num_genes:{}", num_genes);
|
|
22
|
+
|
|
23
|
+
//let now_partial_i = Instant::now();
|
|
24
|
+
//let data_partial_i: Array1<usize> = ds_i.read_slice_1d(0..20)?;
|
|
25
|
+
//println!("Data_partial_i: {:?}", data_partial_i);
|
|
26
|
+
//println!("Time for partial_i dataset:{:?}", now_partial_i.elapsed());
|
|
27
|
+
//
|
|
28
|
+
//let now_x = Instant::now();
|
|
29
|
+
//let ds_x = file.dataset("data/x")?; // open the dataset
|
|
30
|
+
//let data_x: Array1<_> = ds_x.read::<f64, Dim<[usize; 1]>>()?;
|
|
31
|
+
//println!("Data_x: {:?}", data_x);
|
|
32
|
+
//println!("Time for x dataset:{:?}", now_x.elapsed());
|
|
33
|
+
|
|
34
|
+
let now_genes = Instant::now();
|
|
35
|
+
let ds_genes = file.dataset("gene_names")?;
|
|
36
|
+
let genes = ds_genes.read_1d::<FixedAscii<104>>()?;
|
|
37
|
+
//println!("\tgenes = {:?}", genes);
|
|
38
|
+
//println!("\tgenes.shape() = {:?}", genes.shape());
|
|
39
|
+
//println!("\tgenes.strides() = {:?}", genes.strides());
|
|
40
|
+
//println!("\tgenes.ndim() = {:?}", genes.ndim());
|
|
41
|
+
println!("Time for parsing genes:{:?}", now_genes.elapsed());
|
|
42
|
+
|
|
43
|
+
let now_samples = Instant::now();
|
|
44
|
+
let ds_samples = file.dataset("sample_names")?;
|
|
45
|
+
let samples = ds_samples.read_1d::<FixedAscii<104>>()?;
|
|
46
|
+
//println!("\tsamples = {:?}", samples);
|
|
47
|
+
//println!("\tsamples.shape() = {:?}", samples.shape());
|
|
48
|
+
//println!("\tsamples.strides() = {:?}", samples.strides());
|
|
49
|
+
//println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
50
|
+
println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
51
|
+
|
|
52
|
+
let gene_index;
|
|
53
|
+
match genes.iter().position(|&x| x == gene_name) {
|
|
54
|
+
Some(index) => {
|
|
55
|
+
println!(
|
|
56
|
+
"The index of '{}' is {} in 0-based format (add 1 to compare with R output)",
|
|
57
|
+
gene_name, index
|
|
58
|
+
);
|
|
59
|
+
gene_index = index;
|
|
60
|
+
}
|
|
61
|
+
None => panic!(
|
|
62
|
+
"Gene '{}' not found in the HDF5 file '{}'",
|
|
63
|
+
gene_name, &hdf5_filename
|
|
64
|
+
),
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
// Find the number of columns that are populated for that gene
|
|
68
|
+
let now_p = Instant::now();
|
|
69
|
+
let ds_p = file.dataset("data/p")?; // open the dataset
|
|
70
|
+
|
|
71
|
+
//let data_p: Array1<_> = ds_p.read::<usize, Dim<[usize; 1]>>()?;
|
|
72
|
+
let data_partial_p: Array1<usize> = ds_p.read_slice_1d(gene_index..gene_index + 2)?;
|
|
73
|
+
//println!("Data_p: {:?}", data_p);
|
|
74
|
+
println!("Data_partial_p: {:?}", data_partial_p);
|
|
75
|
+
println!("Time for p dataset:{:?}", now_p.elapsed());
|
|
76
|
+
|
|
77
|
+
let array_start_point = data_partial_p[0];
|
|
78
|
+
let array_stop_point = data_partial_p[1];
|
|
79
|
+
let num_populated_cells = data_partial_p[1] - array_start_point;
|
|
80
|
+
println!("Number of populated cells:{}", num_populated_cells);
|
|
81
|
+
|
|
82
|
+
//Find all columns indices that are populated for the given gene
|
|
83
|
+
let now_i = Instant::now();
|
|
84
|
+
let ds_i = file.dataset("data/i")?; // open the dataset
|
|
85
|
+
|
|
86
|
+
//let data_i: Array1<_> = ds_i.read::<f64, Dim<[usize; 1]>>()?;
|
|
87
|
+
//println!("Data_i: {:?}", data_i);
|
|
88
|
+
let populated_column_ids: Array1<usize> =
|
|
89
|
+
ds_i.read_slice_1d(array_start_point..array_stop_point - 1)?;
|
|
90
|
+
println!(
|
|
91
|
+
"Length of populated_column_ids:{}",
|
|
92
|
+
populated_column_ids.len()
|
|
93
|
+
);
|
|
94
|
+
|
|
95
|
+
// Do a sanity check (for testing)
|
|
96
|
+
//let mut min = 0;
|
|
97
|
+
//for i in 0..populated_column_ids.len() {
|
|
98
|
+
// if populated_column_ids[i] < min {
|
|
99
|
+
// println!("Value is decreasing {},{}", populated_column_ids[i], min);
|
|
100
|
+
// } else {
|
|
101
|
+
// min = populated_column_ids[i];
|
|
102
|
+
// }
|
|
103
|
+
//}
|
|
104
|
+
println!("Populated cells:{:?}", populated_column_ids);
|
|
105
|
+
println!("Time for i dataset:{:?}", now_i.elapsed());
|
|
106
|
+
|
|
107
|
+
//Find all columns values that are populated for the given gene
|
|
108
|
+
let now_x = Instant::now();
|
|
109
|
+
let ds_x = file.dataset("data/x")?; // open the dataset
|
|
110
|
+
|
|
111
|
+
//let data_x: Array1<_> = ds_x.read::<f64, Dim<[usize; 1]>>()?;
|
|
112
|
+
//println!("Data_x: {:?}", data_x);
|
|
113
|
+
let populated_column_values: Array1<f64> =
|
|
114
|
+
ds_x.read_slice_1d(array_start_point..array_stop_point - 1)?;
|
|
115
|
+
println!(
|
|
116
|
+
"Length of populated_column_ids:{}",
|
|
117
|
+
populated_column_values.len()
|
|
118
|
+
);
|
|
119
|
+
println!("Time for x dataset:{:?}", now_x.elapsed());
|
|
120
|
+
|
|
121
|
+
// Generate the complete array from the sparse array
|
|
122
|
+
|
|
123
|
+
let mut gene_array: Array1<f64> = Array1::zeros(num_samples);
|
|
124
|
+
let time_generating_full_array = Instant::now();
|
|
125
|
+
//let mut gene_array: Vec<f64> = Vec::with_capacity(num_samples);
|
|
126
|
+
for index in 0..num_samples {
|
|
127
|
+
match populated_column_ids.iter().any(|&x| x == index) {
|
|
128
|
+
true => match populated_column_ids.iter().position(|&x| x == index) {
|
|
129
|
+
Some(y) => {
|
|
130
|
+
gene_array[index] = populated_column_values[y] //gene_array.push(populated_column_values[y]),
|
|
131
|
+
}
|
|
132
|
+
None => {} // should not happen because if the index is found, its position in the array should also be found
|
|
133
|
+
},
|
|
134
|
+
false => gene_array[index] = 0.0, //gene_array.push(0.0), // If index not found, it means the value is 0 for that sample
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
let mut output_string = "{".to_string();
|
|
139
|
+
for i in 0..gene_array.len() {
|
|
140
|
+
//let item_json = "{\"".to_string()
|
|
141
|
+
// + &samples[i].to_string()
|
|
142
|
+
// + &"\","
|
|
143
|
+
// + &gene_array[i].to_string()
|
|
144
|
+
// + &"}";
|
|
145
|
+
|
|
146
|
+
//let item_json = format!("{{\"{}\"}}", samples[i].to_string());
|
|
147
|
+
|
|
148
|
+
output_string += &format!(
|
|
149
|
+
"\"{}\":{}",
|
|
150
|
+
samples[i].to_string(),
|
|
151
|
+
gene_array[i].to_string()
|
|
152
|
+
);
|
|
153
|
+
//println!("item_json:{}", item_json);
|
|
154
|
+
|
|
155
|
+
//let item_json = format!(
|
|
156
|
+
// r##"{{"{}",{}}}"##,
|
|
157
|
+
// samples[i].to_string().replace("\\", ""),
|
|
158
|
+
// gene_array[i].to_string()
|
|
159
|
+
//);
|
|
160
|
+
if i != gene_array.len() - 1 {
|
|
161
|
+
output_string += &",";
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
output_string += &"}".to_string();
|
|
165
|
+
output_string = output_string.replace("\\", "");
|
|
166
|
+
println!(
|
|
167
|
+
"Time generating full array:{:?}",
|
|
168
|
+
time_generating_full_array.elapsed()
|
|
169
|
+
);
|
|
170
|
+
println!("output_string:{}", output_string);
|
|
171
|
+
|
|
172
|
+
// Print individual element in array
|
|
173
|
+
|
|
174
|
+
//let arr = v.iter().collect::<Vec<_>>();
|
|
175
|
+
//for (idx, val) in arr.iter().enumerate() {
|
|
176
|
+
// println!("\tarr[{:?}] = {:?} ({:?})", idx, val.to_string(), val.len());
|
|
177
|
+
//}
|
|
178
|
+
|
|
179
|
+
//for item in data_i {
|
|
180
|
+
// println!("i:{}", item);
|
|
181
|
+
//}
|
|
182
|
+
Ok(())
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
fn main() -> Result<()> {
|
|
186
|
+
let mut input = String::new();
|
|
187
|
+
match io::stdin().read_line(&mut input) {
|
|
188
|
+
// Accepting the piped input from nodejs (or command line from testing)
|
|
189
|
+
Ok(_bytes_read) => {
|
|
190
|
+
//println!("{} bytes read", bytes_read);
|
|
191
|
+
//println!("{}", input);
|
|
192
|
+
let input_json = json::parse(&input);
|
|
193
|
+
match input_json {
|
|
194
|
+
Ok(json_string) => {
|
|
195
|
+
let now = Instant::now();
|
|
196
|
+
let hdf5_filename_result = &json_string["hdf5_file"].to_owned();
|
|
197
|
+
let hdf5_filename;
|
|
198
|
+
match hdf5_filename_result.as_str() {
|
|
199
|
+
Some(x) => {
|
|
200
|
+
hdf5_filename = x.to_string();
|
|
201
|
+
}
|
|
202
|
+
None => {
|
|
203
|
+
panic!("HDF5 filename not provided");
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
let gene_result = &json_string["gene"].to_owned();
|
|
208
|
+
let gene_name;
|
|
209
|
+
match gene_result.as_str() {
|
|
210
|
+
Some(x) => {
|
|
211
|
+
gene_name = x.to_string();
|
|
212
|
+
}
|
|
213
|
+
None => {
|
|
214
|
+
panic!("Gene name not provided");
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
read_hdf5(hdf5_filename, gene_name)?;
|
|
219
|
+
println!("Time for parsing genes from HDF5:{:?}", now.elapsed());
|
|
220
|
+
}
|
|
221
|
+
Err(error) => println!("Incorrect json: {}", error),
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
Err(error) => println!("Piping error: {}", error),
|
|
225
|
+
}
|
|
226
|
+
Ok(())
|
|
227
|
+
}
|