@sjcrh/proteinpaint-rust 2.123.0 → 2.125.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/gdcmaf.rs +107 -107
- package/src/genesetORA.rs +22 -48
- package/src/readHDF5.rs +34 -73
- package/src/validateHDF5.rs +0 -1
package/package.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "version": "2.123.0",
+  "version": "2.125.0",
   "name": "@sjcrh/proteinpaint-rust",
   "type": "module",
   "description": "Rust-based utilities for proteinpaint",
@@ -39,5 +39,5 @@
   "devDependencies": {
     "tape": "^5.2.2"
   },
-  "pp_release_tag": "v2.123.0"
+  "pp_release_tag": "v2.125.0"
 }
package/src/gdcmaf.rs
CHANGED

@@ -10,16 +10,16 @@
 echo '{"host": "https://api.gdc.cancer.gov/data/","columns": ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome", "Start_Position"], "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
 */
 
+use flate2::Compression;
 use flate2::read::GzDecoder;
 use flate2::write::GzEncoder;
-use flate2::Compression;
-use serde_json::{Value};
 use futures::StreamExt;
-use std::io::{self,Read,Write};
+use serde_json::Value;
+use std::io::{self, Read, Write};
+use std::sync::{Arc, Mutex};
 use std::time::Duration;
 use tokio::io::{AsyncReadExt, BufReader};
 use tokio::time::timeout;
-use std::sync::{Arc, Mutex};
 
 // Struct to hold error information
 #[derive(serde::Serialize)]
@@ -28,14 +28,14 @@ struct ErrorEntry {
     error: String,
 }
 
-fn select_maf_col(d:String,columns:&Vec<String>,url:&str) -> Result<(Vec<u8>,i32),(String,String)> {
+fn select_maf_col(d: String, columns: &Vec<String>, url: &str) -> Result<(Vec<u8>, i32), (String, String)> {
     let mut maf_str: String = String::new();
     let mut header_indices: Vec<usize> = Vec::new();
     let lines = d.trim_end().split("\n");
     let mut mafrows = 0;
     for line in lines {
         if line.starts_with("#") {
-            continue
+            continue;
         } else if line.contains("Hugo_Symbol") {
             let header: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
             for col in columns {
@@ -48,7 +48,7 @@ fn select_maf_col(d:String,columns:&Vec<String>,url:&str) -> Result<(Vec<u8>,i32
                     return Err((url.to_string(), error_msg));
                 }
             }
-            }
+        }
         if header_indices.is_empty() {
             return Err((url.to_string(), "No matching columns found".to_string()));
         }
@@ -57,19 +57,17 @@ fn select_maf_col(d:String,columns:&Vec<String>,url:&str) -> Result<(Vec<u8>,i32
             let mut maf_out_lst: Vec<String> = Vec::new();
             for x in header_indices.iter() {
                 maf_out_lst.push(maf_cont_lst[*x].to_string());
-                }
+            }
             maf_str.push_str(maf_out_lst.join("\t").as_str());
             maf_str.push_str("\n");
             mafrows += 1;
         }
-        }
-    Ok((maf_str.as_bytes().to_vec(),mafrows))
+    }
+    Ok((maf_str.as_bytes().to_vec(), mafrows))
 }
 
-
-
 #[tokio::main]
-async fn main() -> Result<(),Box<dyn std::error::Error>> {
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
     // Accepting the piped input json from jodejs and assign to the variable
     // host: GDC host
     // url: urls to download single maf files
@@ -84,23 +82,21 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
     })
     .await;
     // Handle the result of the timeout operation
-    let …
-    Ok(Ok(buffer)) => {
-        …
-        )) as Box<dyn std::error::Error>);
-    }
+    let file_id_lst_js: Value = match result {
+        Ok(Ok(buffer)) => match serde_json::from_str(&buffer) {
+            Ok(js) => js,
+            Err(e) => {
+                let stdin_error = ErrorEntry {
+                    url: String::new(),
+                    error: format!("JSON parsing error: {}", e),
+                };
+                writeln!(io::stderr(), "{}", serde_json::to_string(&stdin_error).unwrap()).unwrap();
+                return Err(Box::new(std::io::Error::new(
+                    std::io::ErrorKind::InvalidInput,
+                    "JSON parsing error!",
+                )) as Box<dyn std::error::Error>);
             }
-        }
+        },
         Ok(Err(_e)) => {
             let stdin_error = ErrorEntry {
                 url: String::new(),
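The Ok(Ok(..)) / Ok(Err(..)) nesting above comes from wrapping the whole stdin read in a tokio timeout: the outer Result reports expiry, the inner one reports I/O errors. A minimal, self-contained sketch of that pattern, assuming the tokio and serde_json crates; the 10-second limit and the messages are illustrative, not taken from the package:

```rust
use std::time::Duration;
use tokio::io::{AsyncReadExt, BufReader};
use tokio::time::timeout;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Wrap the read in a timeout; the 10-second limit is a placeholder.
    let result = timeout(Duration::from_secs(10), async {
        let mut buffer = String::new();
        let mut reader = BufReader::new(tokio::io::stdin());
        reader.read_to_string(&mut buffer).await?;
        Ok::<String, std::io::Error>(buffer)
    })
    .await;

    // Outer Ok/Err = timed out or not; inner Ok/Err = read succeeded or not.
    match result {
        Ok(Ok(buffer)) => {
            let input: serde_json::Value = serde_json::from_str(&buffer)?;
            println!("parsed {} top-level keys", input.as_object().map_or(0, |o| o.len()));
        }
        Ok(Err(e)) => eprintln!("failed to read stdin: {}", e),
        Err(_elapsed) => eprintln!("timed out waiting for piped input"),
    }
    Ok(())
}
```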
@@ -128,22 +124,30 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
     };
 
     // reading the input from PP
-    let host = file_id_lst_js.get("host").expect("Host was not provided").as_str().expect("Host is not a string");
+    let host = file_id_lst_js
+        .get("host")
+        .expect("Host was not provided")
+        .as_str()
+        .expect("Host is not a string");
     let mut url: Vec<String> = Vec::new();
-    let file_id_lst = file_id_lst_js.get("fileIdLst").expect("File ID list is missed!").as_array().expect("File ID list is not an array");
+    let file_id_lst = file_id_lst_js
+        .get("fileIdLst")
+        .expect("File ID list is missed!")
+        .as_array()
+        .expect("File ID list is not an array");
     for v in file_id_lst {
         //url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
-        url.push(format!("{}/{}",host.trim_end_matches('/'), v.as_str().unwrap()));
-        }
+        url.push(format!("{}/{}", host.trim_end_matches('/'), v.as_str().unwrap()));
+    }
 
     // read columns as array from input json and convert data type from Vec<Value> to Vec<String>
-    let maf_col:Vec<String>;
+    let maf_col: Vec<String>;
     if let Some(maf_col_value) = file_id_lst_js.get("columns") {
         //convert Vec<Value> to Vec<String>
         if let Some(maf_col_array) = maf_col_value.as_array() {
             maf_col = maf_col_array
                 .iter()
-                .map(|v| v.to_string().replace("\"",""))
+                .map(|v| v.to_string().replace("\"", ""))
                 .collect::<Vec<String>>();
         } else {
             let column_error = ErrorEntry {
@@ -165,62 +169,58 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
         let column_error_js = serde_json::to_string(&column_error).unwrap();
         writeln!(io::stderr(), "{}", column_error_js).expect("Failed to output stderr!");
         return Err(Box::new(std::io::Error::new(
-            …
+            std::io::ErrorKind::InvalidInput,
+            "Columns was not selected",
         )) as Box<dyn std::error::Error>);
     };
-
+
     //downloading maf files parallelly and merge them into single maf file
-    let download_futures = futures::stream::iter(
-        …
-            Ok(
-            …
-                                let text = String::from_utf8_lossy(&decompressed_content).to_string();
-                                return Ok((url.clone(),text))
-                            }
-                            Err(e) => {
-                                let error_msg = format!("Failed to decompress downloaded maf file: {}", e);
-                                Err((url.clone(), error_msg))
-                            }
-                        }
+    let download_futures = futures::stream::iter(url.into_iter().map(|url| {
+        async move {
+            let client = reqwest::Client::builder()
+                .timeout(Duration::from_secs(60)) // 60-second timeout per request
+                .connect_timeout(Duration::from_secs(15))
+                .build()
+                .map_err(|_e| {
+                    let client_error = ErrorEntry {
+                        url: url.clone(),
+                        error: "Client build error".to_string(),
+                    };
+                    let client_error_js = serde_json::to_string(&client_error).unwrap();
+                    writeln!(io::stderr(), "{}", client_error_js).expect("Failed to build reqwest client!");
+                });
+            match client.unwrap().get(&url).send().await {
+                Ok(resp) if resp.status().is_success() => match resp.bytes().await {
+                    Ok(content) => {
+                        let mut decoder = GzDecoder::new(&content[..]);
+                        let mut decompressed_content = Vec::new();
+                        match decoder.read_to_end(&mut decompressed_content) {
+                            Ok(_) => {
+                                let text = String::from_utf8_lossy(&decompressed_content).to_string();
+                                return Ok((url.clone(), text));
                             }
                             Err(e) => {
-                                let error_msg = format!("Failed to decompress downloaded …
+                                let error_msg = format!("Failed to decompress downloaded MAF file: {}", e);
                                 Err((url.clone(), error_msg))
                             }
                         }
                     }
-                    Ok(resp) => {
-                        let error_msg = format!("HTTP error: {}", resp.status());
-                        Err((url.clone(), error_msg))
-                    }
                     Err(e) => {
-                        let error_msg = format!("…
+                        let error_msg = format!("Failed to decompress downloaded MAF file: {}", e);
                         Err((url.clone(), error_msg))
                     }
+                },
+                Ok(resp) => {
+                    let error_msg = format!("HTTP error: {}", resp.status());
+                    Err((url.clone(), error_msg))
+                }
+                Err(e) => {
+                    let error_msg = format!("Server request failed: {}", e);
+                    Err((url.clone(), error_msg))
+                }
             }
         }
-        }
-    );
+    }
+    }));
 
     // binary output
     let encoder = Arc::new(Mutex::new(GzEncoder::new(io::stdout(), Compression::default())));
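The rewritten block above builds one async download per URL, and the next hunk drains them with buffer_unordered(20), so at most 20 GDC requests are in flight at once while results arrive in completion order. A trimmed, self-contained sketch of that fan-out, assuming the futures, reqwest, and tokio crates; the example.com URLs and the error shape are placeholders:

```rust
use futures::StreamExt;

#[tokio::main]
async fn main() {
    // Placeholder URLs; the real code derives these from the GDC host + file IDs.
    let urls: Vec<String> = vec![
        "https://example.com/a".to_string(),
        "https://example.com/b".to_string(),
    ];
    let client = reqwest::Client::new();

    // One future per URL; buffer_unordered(20) keeps at most 20 in flight and
    // yields results as they complete, not in submission order.
    futures::stream::iter(urls.into_iter().map(|url| {
        let client = client.clone();
        async move {
            let resp = client
                .get(&url)
                .send()
                .await
                .map_err(|e| (url.clone(), e.to_string()))?;
            if !resp.status().is_success() {
                return Err((url, format!("HTTP error: {}", resp.status())));
            }
            let bytes = resp.bytes().await.map_err(|e| (url.clone(), e.to_string()))?;
            Ok((url, bytes))
        }
    }))
    .buffer_unordered(20)
    .for_each(|result| async move {
        match result {
            Ok((url, bytes)) => println!("{}: {} bytes", url, bytes.len()),
            Err((url, err)) => eprintln!("{}: {}", url, err),
        }
    })
    .await;
}
```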
@@ -228,57 +228,57 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
     // Write the header
     {
         let mut encoder_guard = encoder.lock().unwrap(); // Lock the Mutex to get access to the inner GzEncoder
-        encoder_guard.write_all(&maf_col.join("\t").as_bytes().to_vec()).expect("Failed to write header");
+        encoder_guard
+            .write_all(&maf_col.join("\t").as_bytes().to_vec())
+            .expect("Failed to write header");
         encoder_guard.write_all(b"\n").expect("Failed to write newline");
     }
-
-    download_futures
-        …
+
+    download_futures
+        .buffer_unordered(20)
+        .for_each(|result| {
+            let encoder = Arc::clone(&encoder); // Clone the Arc for each task
+            let maf_col_cp = maf_col.clone();
+            async move {
+                match result {
+                    Ok((url, content)) => match select_maf_col(content, &maf_col_cp, &url) {
+                        Ok((maf_bit, mafrows)) => {
                             if mafrows > 0 {
                                 let mut encoder_guard = encoder.lock().unwrap();
                                 encoder_guard.write_all(&maf_bit).expect("Failed to write file");
                             } else {
                                 let error = ErrorEntry {
                                     url: url.clone(),
-                                    error: "Empty maf file".to_string(),
+                                    error: "Empty MAF file".to_string(),
                                 };
                                 let error_js = serde_json::to_string(&error).unwrap();
                                 writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
                             }
                         }
-                        Err((url,error)) => {
-                            let error = ErrorEntry {
-                                url,
-                                error,
-                            };
+                        Err((url, error)) => {
+                            let error = ErrorEntry { url, error };
                             let error_js = serde_json::to_string(&error).unwrap();
                             writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
                         }
+                    },
+                    Err((url, error)) => {
+                        let error = ErrorEntry { url, error };
+                        let error_js = serde_json::to_string(&error).unwrap();
+                        writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
                     }
-                }
-                …
-                };
-                let error_js = serde_json::to_string(&error).unwrap();
-                writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
-                }
-            };
-        }
-    }).await;
-
+                };
+            }
+        })
+        .await;
+
     // Finalize output
 
     // Replace the value inside the Mutex with a dummy value (e.g., None)
     let mut encoder_guard = encoder.lock().unwrap();
-    let encoder = std::mem::replace(&mut *encoder_guard, GzEncoder::new(io::stdout(), Compression::default()));
+    let encoder = std::mem::replace(
+        &mut *encoder_guard,
+        GzEncoder::new(io::stdout(), Compression::default()),
+    );
     // Finalize the encoder
     encoder.finish().expect("Maf file output error!");
 
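The finalization hunk above explains the std::mem::replace dance: GzEncoder::finish() consumes the encoder by value, which an Arc<Mutex<..>> never yields, so a throwaway encoder is swapped in to move the real one out. A minimal sketch of the same finalize-through-swap trick; the header bytes are placeholders drawn from the MAF column names above:

```rust
use flate2::write::GzEncoder;
use flate2::Compression;
use std::io::{self, Write};
use std::sync::{Arc, Mutex};

fn main() {
    // One gzip stream over stdout, shared between tasks via Arc<Mutex<..>>.
    let encoder = Arc::new(Mutex::new(GzEncoder::new(io::stdout(), Compression::default())));

    {
        // Each writer locks, appends its chunk, and releases.
        let mut guard = encoder.lock().unwrap();
        guard.write_all(b"Hugo_Symbol\tChromosome\n").expect("write header");
    }

    // finish() takes the encoder by value, but the Mutex only lends &mut access,
    // so swap in a throwaway encoder to move the real one out, then finish it.
    let mut guard = encoder.lock().unwrap();
    let owned = std::mem::replace(
        &mut *guard,
        GzEncoder::new(io::stdout(), Compression::default()),
    );
    owned.finish().expect("flush gzip trailer");
}
```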
package/src/genesetORA.rs
CHANGED

@@ -9,7 +9,7 @@ use serde_json;
 use std::cmp::Ordering;
 use std::collections::HashSet;
 use std::io;
-use std::time::Instant;
+//use std::time::Instant;
 
 #[allow(non_camel_case_types)]
 #[allow(non_snake_case)]
@@ -37,10 +37,7 @@ fn calculate_hypergeometric_p_value(
 ) -> (f64, f64, String) {
     let mut gene_set_hits: String = "".to_string();
 
-    let gene_intersections: HashSet<String> = genes_in_pathway
-        .intersection(sample_genes)
-        .cloned()
-        .collect();
+    let gene_intersections: HashSet<String> = genes_in_pathway.intersection(sample_genes).cloned().collect();
     for gene in &gene_intersections {
         gene_set_hits += &(gene.to_string() + &",");
     }
@@ -78,7 +75,7 @@ fn main() -> Result<()> {
     let input_json = json::parse(&input);
     match input_json {
         Ok(json_string) => {
-            let run_time = Instant::now();
+            //let run_time = Instant::now();
             let msigdb_input: &JsonValue = &json_string["msigdb"];
             let msigdb;
             match msigdb_input.as_str() {
@@ -92,8 +89,7 @@ fn main() -> Result<()> {
                 None => panic!("genesetgroup is missing"),
             }
             let sample_genes_input: &JsonValue = &json_string["sample_genes"];
-            let sample_genes: Vec<&str> =
-                sample_genes_input.as_str().unwrap().split(",").collect();
+            let sample_genes: Vec<&str> = sample_genes_input.as_str().unwrap().split(",").collect();
             let mut pathway_p_values: Vec<pathway_p_value> = Vec::with_capacity(10000);
 
             let genedb_input: &JsonValue = &json_string["genedb"];
@@ -103,10 +99,8 @@ fn main() -> Result<()> {
                 None => panic!("genedb file path is missing"),
             }
 
-            let filter_non_coding_genes_input: &JsonValue =
-                &json_string["filter_non_coding_genes"];
-            let filter_non_coding_genes: bool =
-                filter_non_coding_genes_input.as_bool().unwrap();
+            let filter_non_coding_genes_input: &JsonValue = &json_string["filter_non_coding_genes"];
+            let filter_non_coding_genes: bool = filter_non_coding_genes_input.as_bool().unwrap();
 
             let genedbconn = Connection::open(genedb)?;
             let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
@@ -120,8 +114,7 @@ fn main() -> Result<()> {
                         //println!("coding_gene:{:?}", coding_gene);
                         for sample_gene in &sample_genes {
                             let code_gene: String = coding_gene.get(0).unwrap();
-                            if filter_non_coding_genes == true && code_gene == *sample_gene
-                            {
+                            if filter_non_coding_genes == true && code_gene == *sample_gene {
                                 sample_coding_genes.insert(code_gene);
                             } else if filter_non_coding_genes == false {
                                 sample_coding_genes.insert(code_gene);
@@ -160,25 +153,19 @@ fn main() -> Result<()> {
     let num_items_output = 100; // Number of top pathways to be specified in the output
 
     let msigdbconn = Connection::open(msigdb)?;
-    let stmt_result = msigdbconn.prepare(
-        &("select id from terms where parent_id='".to_owned()
-            + &genesetgroup
-            + "'"),
-    );
+    let stmt_result = msigdbconn
+        .prepare(&("select id from terms where parent_id='".to_owned() + &genesetgroup + "'"));
     match stmt_result {
         Ok(mut stmt) => {
             #[allow(non_snake_case)]
-            let GO_iter =
-                stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
+            let GO_iter = stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
             #[allow(non_snake_case)]
             for GO_term in GO_iter {
                 match GO_term {
                     Ok(n) => {
                         //println!("GO term {:?}", n);
                         let sql_statement =
-                            "select genes from term2genes where id='".to_owned()
-                                + &n.GO_id
-                                + &"'";
+                            "select genes from term2genes where id='".to_owned() + &n.GO_id + &"'";
                         //println!("sql_statement:{}", sql_statement);
                         let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
                         //println!("gene_stmt:{:?}", gene_stmt);
@@ -191,26 +178,20 @@ fn main() -> Result<()> {
                             match input_gene_json {
                                 Ok(json_genes) => {
                                     for json_iter in 0..json_genes.len() {
-                                        names.insert(
-                                            json_genes[json_iter]["symbol"]
-                                                .to_string(),
-                                        );
+                                        names.insert(json_genes[json_iter]["symbol"].to_string());
                                     }
                                 }
                                 Err(_) => {
-                                    panic!(
-                                        "Symbol, ensg, enstCanonical structure is missing!"
-                                    )
+                                    panic!("Symbol, ensg, enstCanonical structure is missing!")
                                 }
                             }
                         }
                         let gene_set_size = names.len();
-                        let (p_value, matches, gene_set_hits) =
-                            calculate_hypergeometric_p_value(
-                                &sample_coding_genes,
-                                num_background_genes,
-                                names,
-                            );
+                        let (p_value, matches, gene_set_hits) = calculate_hypergeometric_p_value(
+                            &sample_coding_genes,
+                            num_background_genes,
+                            names,
+                        );
                         if matches >= 1.0 && p_value.is_nan() == false {
                             pathway_p_values.push(pathway_p_value {
                                 pathway_name: n.GO_id,
@@ -234,11 +215,8 @@ fn main() -> Result<()> {
                 + &",\"pathways\":"
                 + &adjust_p_values(pathway_p_values, num_items_output)
                 + &"}";
-            println!("…
-            println!(
-                "Time for calculating gene overrepresentation:{:?}",
-                run_time.elapsed()
-            );
+            println!("{}", output_string);
+            //println!("Time for calculating gene overrepresentation:{:?}", run_time.elapsed());
         }
         Err(error) => println!("Incorrect json:{}", error),
     }
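The body of calculate_hypergeometric_p_value is not visible in this diff, only its call site above. For reference, here is a standalone sketch of the test it names: with N background genes, K pathway genes, n sample genes, and k overlapping genes, the over-representation p-value is P(X >= k) for a hypergeometric X, computed via log-factorials to stay in floating-point range. All names and the example numbers are illustrative, not the package's implementation:

```rust
// ln(n!) as a sum of logs; exact enough for gene-scale counts.
fn ln_factorial(n: u64) -> f64 {
    (2..=n).map(|i| (i as f64).ln()).sum()
}

// ln of the binomial coefficient C(n, k).
fn ln_choose(n: u64, k: u64) -> f64 {
    ln_factorial(n) - ln_factorial(k) - ln_factorial(n - k)
}

// P(X >= k) where X counts pathway genes among n draws from N genes,
// K of which belong to the pathway: sum of C(K,i)*C(N-K,n-i)/C(N,n).
fn hypergeom_p_at_least(big_n: u64, big_k: u64, n: u64, k: u64) -> f64 {
    (k..=n.min(big_k))
        .map(|i| (ln_choose(big_k, i) + ln_choose(big_n - big_k, n - i) - ln_choose(big_n, n)).exp())
        .sum()
}

fn main() {
    // e.g. 20000 background genes, a 150-gene pathway, 300 sample genes, 12 hits.
    println!("p = {:.3e}", hypergeom_p_at_least(20000, 150, 300, 12));
}
```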
@@ -248,10 +226,7 @@ fn adjust_p_values(
     Ok(())
 }
 
-fn adjust_p_values(
-    mut original_p_values: Vec<pathway_p_value>,
-    mut num_items_output: usize,
-) -> String {
+fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>, mut num_items_output: usize) -> String {
     // Sorting p-values in ascending order
     original_p_values.as_mut_slice().sort_by(|a, b| {
         (a.p_value_original)
@@ -266,8 +241,7 @@ fn adjust_p_values(
         let i = original_p_values.len() - j - 1;
 
         //println!("p_val:{}", p_val);
-        let mut adjusted_p_val: f64 =
-            original_p_values[i].p_value_original * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
+        let mut adjusted_p_val: f64 = original_p_values[i].p_value_original * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
         if adjusted_p_val > 1.0 {
             // p_value should NEVER be greater than 1
             adjusted_p_val = 1.0;
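The arithmetic visible in adjust_p_values is the Benjamini-Hochberg step: sort ascending, scale each p-value by N/rank, clamp at 1.0, walking from the largest rank down. The sketch below reproduces that arithmetic on bare f64 values; the explicit monotonicity pass (a running minimum) is standard BH and an assumption here, since the corresponding lines are not shown in the diff:

```rust
// BH-style adjustment: adjusted[i] = min over ranks >= i of (p[i] * N / rank), capped at 1.
fn adjust_p_values(mut p: Vec<f64>) -> Vec<f64> {
    let n = p.len() as f64;
    p.sort_by(|a, b| a.partial_cmp(b).unwrap()); // ascending, as in the source
    let mut adjusted = vec![0.0; p.len()];
    let mut running_min = 1.0_f64;
    for j in 0..p.len() {
        let i = p.len() - j - 1;                     // walk from the largest p-value down
        let rank = (i + 1) as f64;
        let candidate = (p[i] * n / rank).min(1.0);  // adjusted = original * (N/rank), never above 1
        running_min = running_min.min(candidate);    // keep the sequence monotone (assumed step)
        adjusted[i] = running_min;
    }
    adjusted
}

fn main() {
    let adj = adjust_p_values(vec![0.01, 0.04, 0.03, 0.20]);
    println!("{:?}", adj); // [0.04, 0.0533.., 0.0533.., 0.2] for the sorted inputs
}
```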
package/src/readHDF5.rs
CHANGED

@@ -1,9 +1,9 @@
 //------------------------------------------------------------------------------
 // readHDF5.rs - HDF5 Gene Expression Data Reader
 //------------------------------------------------------------------------------
-// 
+//
 // Extracts gene expression values from HDF5 files in dense or sparse formats.
-// Supports single genes with memory optimization and multiple genes with 
+// Supports single genes with memory optimization and multiple genes with
 // parallel processing.
 //
 // Features:
@@ -12,16 +12,16 @@
 // - Parallel processing for multiple genes
 // - JSON output with timing metrics
 //
-// Usage: 
-// HDF5_DIR=/usr/local/Homebrew/Cellar/hdf5/1.14.3_1 && 
+// Usage:
+// HDF5_DIR=/usr/local/Homebrew/Cellar/hdf5/1.14.3_1 &&
 // echo $json='{"gene":"TP53","hdf5_file":"matrix.h5"}' | target/release/readHDF5
 //------------------------------------------------------------------------------
-use rayon::prelude::*;
 use hdf5::types::{FixedAscii, VarLenAscii};
 use hdf5::{File, Result};
 use ndarray::Dim;
-use ndarray::{…
-use …
+use ndarray::{s, Array1};
+use rayon::prelude::*;
+use serde_json::{json, Map, Value};
 use std::io;
 use std::sync::Arc;
 use std::time::Instant;
@@ -259,11 +259,7 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
     // Create direct key-value pairs where sample names are the keys
     for i in 0..gene_expression.len() {
         // Add each sample name as a key pointing directly to its expression value
-        output_string += &format!(
-            "\"{}\":{}",
-            samples[i].to_string(),
-            gene_expression[i].to_string()
-        );
+        output_string += &format!("\"{}\":{}", samples[i].to_string(), gene_expression[i].to_string());
 
         // Add comma if not the last item
         if i < gene_expression.len() - 1 {
@@ -296,10 +292,7 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
                 Value::Null
             };
 
-            samples_map.insert(
-                sample.replace("\\", ""),
-                value,
-            );
+            samples_map.insert(sample.replace("\\", ""), value);
         }
     }
 
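For orientation, the dense path shown above reduces to: find the gene's row index, slice that single row out of the 2D matrix, and pair it with sample names. A minimal sketch against the hdf5 crate; "gene_ids" appears in this diff, while "counts" is an assumed dataset name, and "matrix.h5"/"TP53" come from the usage comment in the header:

```rust
use hdf5::types::VarLenAscii;
use hdf5::{File, Result};
use ndarray::{s, Array1};

fn read_gene_row(path: &str, gene: &str) -> Result<Option<Array1<f64>>> {
    let file = File::open(path)?;

    // "gene_ids" is read in the multi-gene path of the diff; "counts" is assumed.
    let genes: Vec<String> = file
        .dataset("gene_ids")?
        .read_1d::<VarLenAscii>()?
        .iter()
        .map(|g| g.to_string())
        .collect();

    // Slicing one row keeps memory flat regardless of sample count, which is
    // the "memory optimization" the file header refers to.
    match genes.iter().position(|g| g == gene) {
        Some(idx) => Ok(Some(
            file.dataset("counts")?.read_slice_1d::<f64, _>(s![idx, ..])?,
        )),
        None => Ok(None),
    }
}

fn main() -> Result<()> {
    if let Some(expr) = read_gene_row("matrix.h5", "TP53")? {
        println!("TP53: {} sample values", expr.len());
    }
    Ok(())
}
```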
@@ -317,7 +310,7 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
 
 /// Reads expression data for a specific gene from a sparse format HDF5 file
 ///
-/// Extracts expression values from sparse matrix HDF5 files using Compressed 
+/// Extracts expression values from sparse matrix HDF5 files using Compressed
 /// Sparse Column (CSC) structure.
 ///
 /// # Arguments
@@ -391,15 +384,13 @@ fn query_gene_sparse(hdf5_filename: String, gene_name: String) -> Result<()> {
     // Find all columns indices that are populated for the given gene
     let now_i = Instant::now();
     let ds_i = file.dataset("data/i")?;
-    let populated_column_ids: Array1<usize> =
-        ds_i.read_slice_1d(array_start_point..array_stop_point)?;
+    let populated_column_ids: Array1<usize> = ds_i.read_slice_1d(array_start_point..array_stop_point)?;
     println!("Time for i dataset:{:?}", now_i.elapsed());
 
     // Find all columns values that are populated for the given gene
     let now_x = Instant::now();
     let ds_x = file.dataset("data/x")?;
-    let populated_column_values: Array1<f64> =
-        ds_x.read_slice_1d(array_start_point..array_stop_point)?;
+    let populated_column_values: Array1<f64> = ds_x.read_slice_1d(array_start_point..array_stop_point)?;
     println!("Time for x dataset:{:?}", now_x.elapsed());
 
     // Generate the complete array from the sparse array
@@ -425,10 +416,7 @@ fn query_gene_sparse(hdf5_filename: String, gene_name: String) -> Result<()> {
     }
     output_string += &"}".to_string();
 
-    println!(
-        "Time generating full array:{:?}",
-        time_generating_full_array.elapsed()
-    );
+    println!("Time generating full array:{:?}", time_generating_full_array.elapsed());
     println!("output_string:{}", output_string);
 
     Ok(())
@@ -465,7 +453,6 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
         }
     };
 
-
     let genes_dataset = match file.dataset("gene_ids") {
         Ok(ds) => ds,
         Err(err) => {
@@ -495,7 +482,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
     };
 
     let genes: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
-    
+
     // Only create HashMap for multiple gene queries
     let gene_to_index: Option<std::collections::HashMap<String, usize>> = if gene_names.len() > 1 {
         let hashmap_start_time = Instant::now();
@@ -504,8 +491,8 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
             map.insert(gene.clone(), idx);
         }
         timings.insert(
-                "build_hashmap_ms".to_string(),
-                Value::from(hashmap_start_time.elapsed().as_millis() as u64)
+            "build_hashmap_ms".to_string(),
+            Value::from(hashmap_start_time.elapsed().as_millis() as u64),
         );
         Some(map)
     } else {
@@ -586,14 +573,11 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
     };
 
     // Configurable thread count for testing
-    let thread_count = 2; 
+    let thread_count = 2;
     timings.insert("thread_count".to_string(), Value::from(thread_count));
 
     // Create a scoped thread pool with specified number of threads
-    match rayon::ThreadPoolBuilder::new()
-        .num_threads(thread_count)
-        .build()
-    {
+    match rayon::ThreadPoolBuilder::new().num_threads(thread_count).build() {
         Ok(pool) => {
             // Use the pool for this specific work
             pool.install(|| {
@@ -650,26 +634,20 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
                             genes_map.insert(gene_name.clone(), gene_data);
                         } else {
                             // Fallback to per-gene reading if bulk load failed
-                            match counts_dataset
-                                .read_slice_1d::<f64, _>(s![gene_index, ..])
-                            {
+                            match counts_dataset.read_slice_1d::<f64, _>(s![gene_index, ..]) {
                                 Ok(gene_expression) => {
                                     // Create samples map for this gene
                                     let mut samples_map = Map::new();
                                     for (i, sample) in samples.iter().enumerate() {
                                         if i < gene_expression.len() {
                                             // Handle potential NaN or infinity values
-                                            let value =
-                                                if gene_expression[i].is_finite() {
-                                                    Value::from(gene_expression[i])
-                                                } else {
-                                                    Value::Null
-                                                };
-
-                                            samples_map.insert(
-                                                sample.replace("\\", ""),
-                                                value,
-                                            );
+                                            let value = if gene_expression[i].is_finite() {
+                                                Value::from(gene_expression[i])
+                                            } else {
+                                                Value::Null
+                                            };
+
+                                            samples_map.insert(sample.replace("\\", ""), value);
                                         }
                                     }
 
@@ -693,10 +671,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
                                     );
 
                                     let mut genes_map = genes_map.lock().unwrap();
-                                    genes_map.insert(
-                                        gene_name.clone(),
-                                        Value::Object(error_map),
-                                    );
+                                    genes_map.insert(gene_name.clone(), Value::Object(error_map));
                                 }
                             }
                         }
@@ -736,7 +711,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
             &counts_dataset,
             &all_gene_data,
             &samples,
-            &genes_map
+            &genes_map,
         );
     }
 }
@@ -758,7 +733,6 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
     // Read just this single gene's data directly
     match counts_dataset.read_slice_1d::<f64, _>(s![gene_index, ..]) {
         Ok(gene_expression) => {
-
             // Create samples map for this gene
             let mut samples_map = Map::new();
             for (i, sample) in samples.iter().enumerate() {
@@ -786,10 +760,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
             let mut error_map = Map::new();
             error_map.insert(
                 "error".to_string(),
-                Value::String(format!(
-                    "Failed to read expression values: {:?}",
-                    err
-                )),
+                Value::String(format!("Failed to read expression values: {:?}", err)),
             );
 
             let mut genes_map = genes_map.lock().unwrap();
@@ -833,7 +804,7 @@ fn process_genes_sequentially(
     counts_dataset: &hdf5::Dataset,
     all_gene_data: &Option<ndarray::ArrayBase<ndarray::OwnedRepr<f64>, ndarray::Dim<[usize; 2]>>>,
     samples: &Vec<String>,
-    genes_map: &Arc<std::sync::Mutex<Map<String, Value>>>
+    genes_map: &Arc<std::sync::Mutex<Map<String, Value>>>,
 ) {
     for gene_name in gene_names {
         // Find the index of the requested gene, using HashMap if available
@@ -911,10 +882,7 @@ fn process_genes_sequentially(
                 let mut error_map = Map::new();
                 error_map.insert(
                     "error".to_string(),
-                    Value::String(format!(
-                        "Failed to read expression values: {:?}",
-                        err1
-                    )),
+                    Value::String(format!("Failed to read expression values: {:?}", err1)),
                 );
 
                 let mut genes_map = genes_map.lock().unwrap();
@@ -935,7 +903,6 @@ fn process_genes_sequentially(
             genes_map.insert(gene_name.clone(), Value::Object(error_map));
             }
         }
-
     }
 }
 /// Queries expression data for multiple genes from a sparse format HDF5 file
@@ -1006,7 +973,6 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
     let num_threads = num_cpus::get();
     timings.insert("num_threads".to_string(), Value::from(num_threads as u64));
 
-
     // Thread-safe maps for results
     let genes_map = Arc::new(std::sync::Mutex::new(Map::new()));
     let gene_timings = Arc::new(std::sync::Mutex::new(Map::new()));
@@ -1041,8 +1007,7 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
                 // Read data for this gene
                 match ds_i.read_slice_1d::<usize, _>(array_start_point..array_stop_point) {
                     Ok(populated_column_ids) => {
-                        match ds_x.read_slice_1d::<f64, _>(array_start_point..array_stop_point)
-                        {
+                        match ds_x.read_slice_1d::<f64, _>(array_start_point..array_stop_point) {
                             Ok(populated_column_values) => {
                                 // Generate the complete array from sparse representation
                                 let mut gene_array: Array1<f64> = Array1::zeros(num_samples);
@@ -1061,8 +1026,7 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
                                         Value::Null
                                     };
 
-                                    samples_map
-                                        .insert(sample.to_string().replace("\\", ""), value);
+                                    samples_map.insert(sample.to_string().replace("\\", ""), value);
                                 }
 
                                 let gene_data = json!({
@@ -1077,10 +1041,7 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
                         let mut error_map = Map::new();
                         error_map.insert(
                             "error".to_string(),
-                            Value::String(format!(
-                                "Failed to read x dataset: {:?}",
-                                err
-                            )),
+                            Value::String(format!("Failed to read x dataset: {:?}", err)),
                         );
 
                         let mut genes_map = genes_map.lock().unwrap();
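The sparse path reads per-gene offsets, then the populated sample indices from data/i and the matching values from data/x (both visible in the hunks above), and scatters them into a zero-filled vector. The sketch below performs the same expansion on in-memory slices so it runs without an HDF5 file; the name data/p for the offset array is an assumption, since only the derived start/stop points appear in the diff:

```rust
// CSC-style expansion: indptr gives [start, stop) into the index and value
// arrays for one gene; scattering the values rebuilds the dense row.
fn expand_gene(
    indptr: &[usize],  // assumed "data/p": per-gene offsets into i and x
    indices: &[usize], // "data/i": positions of non-zero samples
    values: &[f64],    // "data/x": non-zero expression values
    gene_idx: usize,
    num_samples: usize,
) -> Vec<f64> {
    let (start, stop) = (indptr[gene_idx], indptr[gene_idx + 1]);
    let mut dense = vec![0.0; num_samples];
    for (col, val) in indices[start..stop].iter().zip(&values[start..stop]) {
        dense[*col] = *val; // everything not listed stays 0.0
    }
    dense
}

fn main() {
    // Two genes, four samples: gene 0 has entries at samples 1 and 3.
    let indptr = [0usize, 2, 3];
    let indices = [1usize, 3, 0];
    let values = [5.0, 2.5, 7.0];
    println!("{:?}", expand_gene(&indptr, &indices, &values, 0, 4)); // [0.0, 5.0, 0.0, 2.5]
}
```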