@sjcrh/proteinpaint-rust 2.123.0 → 2.125.0

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
package/package.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "version": "2.123.0",
+  "version": "2.125.0",
   "name": "@sjcrh/proteinpaint-rust",
   "type": "module",
   "description": "Rust-based utilities for proteinpaint",
@@ -39,5 +39,5 @@
   "devDependencies": {
     "tape": "^5.2.2"
   },
-  "pp_release_tag": "v2.123.0"
+  "pp_release_tag": "v2.125.0"
 }
package/src/gdcmaf.rs CHANGED
@@ -10,16 +10,16 @@
 echo '{"host": "https://api.gdc.cancer.gov/data/","columns": ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome", "Start_Position"], "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
 */
 
+use flate2::Compression;
 use flate2::read::GzDecoder;
 use flate2::write::GzEncoder;
-use flate2::Compression;
-use serde_json::{Value};
 use futures::StreamExt;
-use std::io::{self,Read,Write};
+use serde_json::Value;
+use std::io::{self, Read, Write};
+use std::sync::{Arc, Mutex};
 use std::time::Duration;
 use tokio::io::{AsyncReadExt, BufReader};
 use tokio::time::timeout;
-use std::sync::{Arc, Mutex};
 
 // Struct to hold error information
 #[derive(serde::Serialize)]
@@ -28,14 +28,14 @@ struct ErrorEntry {
     error: String,
 }
 
-fn select_maf_col(d:String,columns:&Vec<String>,url:&str) -> Result<(Vec<u8>,i32), (String, String)> {
+fn select_maf_col(d: String, columns: &Vec<String>, url: &str) -> Result<(Vec<u8>, i32), (String, String)> {
     let mut maf_str: String = String::new();
     let mut header_indices: Vec<usize> = Vec::new();
     let lines = d.trim_end().split("\n");
     let mut mafrows = 0;
     for line in lines {
         if line.starts_with("#") {
-            continue
+            continue;
         } else if line.contains("Hugo_Symbol") {
             let header: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
             for col in columns {
@@ -48,7 +48,7 @@ fn select_maf_col(d:String,columns:&Vec<String>,url:&str) -> Result<(Vec<u8>,i32
                     return Err((url.to_string(), error_msg));
                 }
             }
-        };
+        }
         if header_indices.is_empty() {
             return Err((url.to_string(), "No matching columns found".to_string()));
         }
@@ -57,19 +57,17 @@ fn select_maf_col(d:String,columns:&Vec<String>,url:&str) -> Result<(Vec<u8>,i32
            let mut maf_out_lst: Vec<String> = Vec::new();
            for x in header_indices.iter() {
                maf_out_lst.push(maf_cont_lst[*x].to_string());
-           };
+           }
            maf_str.push_str(maf_out_lst.join("\t").as_str());
            maf_str.push_str("\n");
            mafrows += 1;
        }
-    };
-    Ok((maf_str.as_bytes().to_vec(),mafrows))
+    }
+    Ok((maf_str.as_bytes().to_vec(), mafrows))
 }
 
-
-
 #[tokio::main]
-async fn main() -> Result<(),Box<dyn std::error::Error>> {
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
     // Accepting the piped input json from nodejs and assigning it to the variable
     // host: GDC host
     // url: urls to download single maf files
@@ -84,23 +82,21 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
     })
     .await;
     // Handle the result of the timeout operation
-    let file_id_lst_js: Value = match result {
-        Ok(Ok(buffer)) => {
-            match serde_json::from_str(&buffer) {
-                Ok(js) => js,
-                Err(e) => {
-                    let stdin_error = ErrorEntry {
-                        url: String::new(),
-                        error: format!("JSON parsing error: {}", e),
-                    };
-                    writeln!(io::stderr(), "{}", serde_json::to_string(&stdin_error).unwrap()).unwrap();
-                    return Err(Box::new(std::io::Error::new(
-                        std::io::ErrorKind::InvalidInput,
-                        "JSON parsing error!",
-                    )) as Box<dyn std::error::Error>);
-                }
+    let file_id_lst_js: Value = match result {
+        Ok(Ok(buffer)) => match serde_json::from_str(&buffer) {
+            Ok(js) => js,
+            Err(e) => {
+                let stdin_error = ErrorEntry {
+                    url: String::new(),
+                    error: format!("JSON parsing error: {}", e),
+                };
+                writeln!(io::stderr(), "{}", serde_json::to_string(&stdin_error).unwrap()).unwrap();
+                return Err(Box::new(std::io::Error::new(
+                    std::io::ErrorKind::InvalidInput,
+                    "JSON parsing error!",
+                )) as Box<dyn std::error::Error>);
             }
-        }
+        },
         Ok(Err(_e)) => {
             let stdin_error = ErrorEntry {
                 url: String::new(),
@@ -128,22 +124,30 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
     };
 
     // reading the input from PP
-    let host = file_id_lst_js.get("host").expect("Host was not provided").as_str().expect("Host is not a string");
+    let host = file_id_lst_js
+        .get("host")
+        .expect("Host was not provided")
+        .as_str()
+        .expect("Host is not a string");
     let mut url: Vec<String> = Vec::new();
-    let file_id_lst = file_id_lst_js.get("fileIdLst").expect("File ID list is missed!").as_array().expect("File ID list is not an array");
+    let file_id_lst = file_id_lst_js
+        .get("fileIdLst")
+        .expect("File ID list is missed!")
+        .as_array()
+        .expect("File ID list is not an array");
     for v in file_id_lst {
         //url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
-        url.push(format!("{}/{}",host.trim_end_matches('/'), v.as_str().unwrap()));
-    };
+        url.push(format!("{}/{}", host.trim_end_matches('/'), v.as_str().unwrap()));
+    }
 
     // read columns as array from input json and convert data type from Vec<Value> to Vec<String>
-    let maf_col:Vec<String>;
+    let maf_col: Vec<String>;
     if let Some(maf_col_value) = file_id_lst_js.get("columns") {
         //convert Vec<Value> to Vec<String>
         if let Some(maf_col_array) = maf_col_value.as_array() {
             maf_col = maf_col_array
                 .iter()
-                .map(|v| v.to_string().replace("\"",""))
+                .map(|v| v.to_string().replace("\"", ""))
                 .collect::<Vec<String>>();
         } else {
             let column_error = ErrorEntry {
@@ -165,62 +169,58 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
         let column_error_js = serde_json::to_string(&column_error).unwrap();
         writeln!(io::stderr(), "{}", column_error_js).expect("Failed to output stderr!");
         return Err(Box::new(std::io::Error::new(
-                std::io::ErrorKind::InvalidInput,
-                "Columns was not selected",
+            std::io::ErrorKind::InvalidInput,
+            "Columns was not selected",
         )) as Box<dyn std::error::Error>);
     };
-    
+
     //downloading maf files in parallel and merging them into a single maf file
-    let download_futures = futures::stream::iter(
-        url.into_iter().map(|url|{
-            async move {
-                let client = reqwest::Client::builder()
-                    .timeout(Duration::from_secs(60)) // 60-second timeout per request
-                    .connect_timeout(Duration::from_secs(15))
-                    .build()
-                    .map_err(|_e| {
-                        let client_error = ErrorEntry{
-                            url: url.clone(),
-                            error: "Client build error".to_string(),
-                        };
-                        let client_error_js = serde_json::to_string(&client_error).unwrap();
-                        writeln!(io::stderr(), "{}", client_error_js).expect("Failed to build reqwest client!");
-                    });
-                match client.unwrap().get(&url).send().await {
-                    Ok(resp) if resp.status().is_success() => {
-                        match resp.bytes().await {
-                            Ok(content) => {
-                                let mut decoder = GzDecoder::new(&content[..]);
-                                let mut decompressed_content = Vec::new();
-                                match decoder.read_to_end(&mut decompressed_content) {
-                                    Ok(_) => {
-                                        let text = String::from_utf8_lossy(&decompressed_content).to_string();
-                                        return Ok((url.clone(),text))
-                                    }
-                                    Err(e) => {
-                                        let error_msg = format!("Failed to decompress downloaded maf file: {}", e);
-                                        Err((url.clone(), error_msg))
-                                    }
-                                }
+    let download_futures = futures::stream::iter(url.into_iter().map(|url| {
+        async move {
+            let client = reqwest::Client::builder()
+                .timeout(Duration::from_secs(60)) // 60-second timeout per request
+                .connect_timeout(Duration::from_secs(15))
+                .build()
+                .map_err(|_e| {
+                    let client_error = ErrorEntry {
+                        url: url.clone(),
+                        error: "Client build error".to_string(),
+                    };
+                    let client_error_js = serde_json::to_string(&client_error).unwrap();
+                    writeln!(io::stderr(), "{}", client_error_js).expect("Failed to build reqwest client!");
+                });
+            match client.unwrap().get(&url).send().await {
+                Ok(resp) if resp.status().is_success() => match resp.bytes().await {
+                    Ok(content) => {
+                        let mut decoder = GzDecoder::new(&content[..]);
+                        let mut decompressed_content = Vec::new();
+                        match decoder.read_to_end(&mut decompressed_content) {
+                            Ok(_) => {
+                                let text = String::from_utf8_lossy(&decompressed_content).to_string();
+                                return Ok((url.clone(), text));
                             }
                             Err(e) => {
-                                let error_msg = format!("Failed to decompress downloaded maf file: {}", e);
+                                let error_msg = format!("Failed to decompress downloaded MAF file: {}", e);
                                 Err((url.clone(), error_msg))
                             }
                         }
                     }
-                    Ok(resp) => {
-                        let error_msg = format!("HTTP error: {}", resp.status());
-                        Err((url.clone(), error_msg))
-                    }
                     Err(e) => {
-                        let error_msg = format!("Server request failed: {}", e);
+                        let error_msg = format!("Failed to decompress downloaded MAF file: {}", e);
                         Err((url.clone(), error_msg))
                     }
+                },
+                Ok(resp) => {
+                    let error_msg = format!("HTTP error: {}", resp.status());
+                    Err((url.clone(), error_msg))
+                }
+                Err(e) => {
+                    let error_msg = format!("Server request failed: {}", e);
+                    Err((url.clone(), error_msg))
                 }
             }
-        })
-    );
+        }
+    }));
 
     // binary output
     let encoder = Arc::new(Mutex::new(GzEncoder::new(io::stdout(), Compression::default())));
@@ -228,57 +228,57 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
     // Write the header
     {
         let mut encoder_guard = encoder.lock().unwrap(); // Lock the Mutex to get access to the inner GzEncoder
-        encoder_guard.write_all(&maf_col.join("\t").as_bytes().to_vec()).expect("Failed to write header");
+        encoder_guard
+            .write_all(&maf_col.join("\t").as_bytes().to_vec())
+            .expect("Failed to write header");
         encoder_guard.write_all(b"\n").expect("Failed to write newline");
     }
-
-    download_futures.buffer_unordered(20).for_each( |result| {
-        let encoder = Arc::clone(&encoder); // Clone the Arc for each task
-        let maf_col_cp = maf_col.clone();
-        async move {
-            match result {
-                Ok((url, content)) => {
-                    match select_maf_col(content, &maf_col_cp, &url) {
-                        Ok((maf_bit,mafrows)) => {
+
+    download_futures
+        .buffer_unordered(20)
+        .for_each(|result| {
+            let encoder = Arc::clone(&encoder); // Clone the Arc for each task
+            let maf_col_cp = maf_col.clone();
+            async move {
+                match result {
+                    Ok((url, content)) => match select_maf_col(content, &maf_col_cp, &url) {
+                        Ok((maf_bit, mafrows)) => {
                             if mafrows > 0 {
                                 let mut encoder_guard = encoder.lock().unwrap();
                                 encoder_guard.write_all(&maf_bit).expect("Failed to write file");
                             } else {
                                 let error = ErrorEntry {
                                     url: url.clone(),
-                                    error: "Empty maf file".to_string(),
+                                    error: "Empty MAF file".to_string(),
                                 };
                                 let error_js = serde_json::to_string(&error).unwrap();
                                 writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
                             }
                         }
-                        Err((url,error)) => {
-                            let error = ErrorEntry {
-                                url,
-                                error,
-                            };
+                        Err((url, error)) => {
+                            let error = ErrorEntry { url, error };
                             let error_js = serde_json::to_string(&error).unwrap();
                             writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
                         }
+                    },
+                    Err((url, error)) => {
+                        let error = ErrorEntry { url, error };
+                        let error_js = serde_json::to_string(&error).unwrap();
+                        writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
                     }
-                }
-                Err((url, error)) => {
-                    let error = ErrorEntry {
-                        url,
-                        error,
-                    };
-                    let error_js = serde_json::to_string(&error).unwrap();
-                    writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
-                }
-            };
-        }
-    }).await;
-
+                };
+            }
+        })
+        .await;
+
     // Finalize output
 
     // Replace the value inside the Mutex with a dummy value (e.g., None)
     let mut encoder_guard = encoder.lock().unwrap();
-    let encoder = std::mem::replace(&mut *encoder_guard, GzEncoder::new(io::stdout(), Compression::default()));
+    let encoder = std::mem::replace(
+        &mut *encoder_guard,
+        GzEncoder::new(io::stdout(), Compression::default()),
+    );
     // Finalize the encoder
     encoder.finish().expect("Maf file output error!");
 
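Note on the download/merge pattern above: gdcmaf builds a stream of async download jobs, caps them at 20 in flight with buffer_unordered, and has every completed job append to a single gzip stream behind Arc<Mutex<GzEncoder>>; at the end the encoder is pulled out of the mutex with std::mem::replace so finish() can write the gzip trailer. Below is a minimal, self-contained sketch of that pattern, assuming the same flate2/futures/tokio crates; the fetch function and example.org URLs are stand-ins for the real reqwest + GzDecoder step, so the sketch runs without network access.

    use flate2::{write::GzEncoder, Compression};
    use futures::StreamExt;
    use std::io::Write;
    use std::sync::{Arc, Mutex};

    // Stand-in for the real reqwest GET + gzip decompression in gdcmaf.rs.
    async fn fetch(url: String) -> Result<(String, String), (String, String)> {
        let body = format!("{}\tok\n", url);
        Ok((url, body))
    }

    #[tokio::main]
    async fn main() {
        // Illustrative URLs only.
        let urls: Vec<String> = (0..3).map(|i| format!("https://example.org/maf/{}", i)).collect();
        // One gzip stream over stdout, shared by all download tasks.
        let encoder = Arc::new(Mutex::new(GzEncoder::new(std::io::stdout(), Compression::default())));

        futures::stream::iter(urls.into_iter().map(fetch))
            .buffer_unordered(20) // at most 20 downloads in flight; completion order is not guaranteed
            .for_each(|result| {
                let encoder = Arc::clone(&encoder);
                async move {
                    match result {
                        // The lock is held only for the synchronous append, never across an
                        // await, so rows from different files cannot interleave mid-record.
                        Ok((_url, text)) => encoder.lock().unwrap().write_all(text.as_bytes()).unwrap(),
                        Err((url, e)) => eprintln!("download failed for {}: {}", url, e),
                    }
                }
            })
            .await;

        // Swap a dummy encoder into the mutex to take ownership of the real one,
        // then finish() it so the gzip trailer is flushed.
        let done = std::mem::replace(
            &mut *encoder.lock().unwrap(),
            GzEncoder::new(std::io::stdout(), Compression::default()),
        );
        done.finish().expect("gzip output error");
    }

The mutex-plus-swap dance at the end mirrors the diff: GzEncoder only writes its trailer on finish(), which consumes the encoder by value, so it must first be moved out of the shared Arc<Mutex<…>>.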
package/src/genesetORA.rs CHANGED
@@ -9,7 +9,7 @@ use serde_json;
 use std::cmp::Ordering;
 use std::collections::HashSet;
 use std::io;
-use std::time::Instant;
+//use std::time::Instant;
 
 #[allow(non_camel_case_types)]
 #[allow(non_snake_case)]
@@ -37,10 +37,7 @@ fn calculate_hypergeometric_p_value(
 ) -> (f64, f64, String) {
     let mut gene_set_hits: String = "".to_string();
 
-    let gene_intersections: HashSet<String> = genes_in_pathway
-        .intersection(sample_genes)
-        .cloned()
-        .collect();
+    let gene_intersections: HashSet<String> = genes_in_pathway.intersection(sample_genes).cloned().collect();
     for gene in &gene_intersections {
         gene_set_hits += &(gene.to_string() + &",");
     }
@@ -78,7 +75,7 @@ fn main() -> Result<()> {
     let input_json = json::parse(&input);
     match input_json {
         Ok(json_string) => {
-            let run_time = Instant::now();
+            //let run_time = Instant::now();
             let msigdb_input: &JsonValue = &json_string["msigdb"];
             let msigdb;
             match msigdb_input.as_str() {
@@ -92,8 +89,7 @@ fn main() -> Result<()> {
                 None => panic!("genesetgroup is missing"),
             }
             let sample_genes_input: &JsonValue = &json_string["sample_genes"];
-            let sample_genes: Vec<&str> =
-                sample_genes_input.as_str().unwrap().split(",").collect();
+            let sample_genes: Vec<&str> = sample_genes_input.as_str().unwrap().split(",").collect();
             let mut pathway_p_values: Vec<pathway_p_value> = Vec::with_capacity(10000);
 
             let genedb_input: &JsonValue = &json_string["genedb"];
@@ -103,10 +99,8 @@ fn main() -> Result<()> {
                 None => panic!("genedb file path is missing"),
             }
 
-            let filter_non_coding_genes_input: &JsonValue =
-                &json_string["filter_non_coding_genes"];
-            let filter_non_coding_genes: bool =
-                filter_non_coding_genes_input.as_bool().unwrap();
+            let filter_non_coding_genes_input: &JsonValue = &json_string["filter_non_coding_genes"];
+            let filter_non_coding_genes: bool = filter_non_coding_genes_input.as_bool().unwrap();
 
             let genedbconn = Connection::open(genedb)?;
             let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
@@ -120,8 +114,7 @@ fn main() -> Result<()> {
                     //println!("coding_gene:{:?}", coding_gene);
                     for sample_gene in &sample_genes {
                         let code_gene: String = coding_gene.get(0).unwrap();
-                        if filter_non_coding_genes == true && code_gene == *sample_gene
-                        {
+                        if filter_non_coding_genes == true && code_gene == *sample_gene {
                             sample_coding_genes.insert(code_gene);
                         } else if filter_non_coding_genes == false {
                             sample_coding_genes.insert(code_gene);
@@ -160,25 +153,19 @@ fn main() -> Result<()> {
             let num_items_output = 100; // Number of top pathways to be specified in the output
 
             let msigdbconn = Connection::open(msigdb)?;
-            let stmt_result = msigdbconn.prepare(
-                &("select id from terms where parent_id='".to_owned()
-                    + &genesetgroup
-                    + "'"),
-            );
+            let stmt_result = msigdbconn
+                .prepare(&("select id from terms where parent_id='".to_owned() + &genesetgroup + "'"));
             match stmt_result {
                 Ok(mut stmt) => {
                     #[allow(non_snake_case)]
-                    let GO_iter =
-                        stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
+                    let GO_iter = stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
                     #[allow(non_snake_case)]
                     for GO_term in GO_iter {
                         match GO_term {
                             Ok(n) => {
                                 //println!("GO term {:?}", n);
                                 let sql_statement =
-                                    "select genes from term2genes where id='".to_owned()
-                                        + &n.GO_id
-                                        + &"'";
+                                    "select genes from term2genes where id='".to_owned() + &n.GO_id + &"'";
                                 //println!("sql_statement:{}", sql_statement);
                                 let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
                                 //println!("gene_stmt:{:?}", gene_stmt);
@@ -191,26 +178,20 @@ fn main() -> Result<()> {
                                     match input_gene_json {
                                         Ok(json_genes) => {
                                             for json_iter in 0..json_genes.len() {
-                                                names.insert(
-                                                    json_genes[json_iter]["symbol"]
-                                                        .to_string(),
-                                                );
+                                                names.insert(json_genes[json_iter]["symbol"].to_string());
                                             }
                                         }
                                         Err(_) => {
-                                            panic!(
-                                                "Symbol, ensg, enstCanonical structure is missing!"
-                                            )
+                                            panic!("Symbol, ensg, enstCanonical structure is missing!")
                                         }
                                     }
                                 }
                                 let gene_set_size = names.len();
-                                let (p_value, matches, gene_set_hits) =
-                                    calculate_hypergeometric_p_value(
-                                        &sample_coding_genes,
-                                        num_background_genes,
-                                        names,
-                                    );
+                                let (p_value, matches, gene_set_hits) = calculate_hypergeometric_p_value(
+                                    &sample_coding_genes,
+                                    num_background_genes,
+                                    names,
+                                );
                                 if matches >= 1.0 && p_value.is_nan() == false {
                                     pathway_p_values.push(pathway_p_value {
                                         pathway_name: n.GO_id,
@@ -234,11 +215,8 @@ fn main() -> Result<()> {
                 + &",\"pathways\":"
                 + &adjust_p_values(pathway_p_values, num_items_output)
                 + &"}";
-            println!("pathway_p_values:{}", output_string);
-            println!(
-                "Time for calculating gene overrepresentation:{:?}",
-                run_time.elapsed()
-            );
+            println!("{}", output_string);
+            //println!("Time for calculating gene overrepresentation:{:?}", run_time.elapsed());
         }
         Err(error) => println!("Incorrect json:{}", error),
     }
@@ -248,10 +226,7 @@ fn main() -> Result<()> {
     Ok(())
 }
 
-fn adjust_p_values(
-    mut original_p_values: Vec<pathway_p_value>,
-    mut num_items_output: usize,
-) -> String {
+fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>, mut num_items_output: usize) -> String {
     // Sorting p-values in ascending order
     original_p_values.as_mut_slice().sort_by(|a, b| {
         (a.p_value_original)
@@ -266,8 +241,7 @@ fn adjust_p_values(
         let i = original_p_values.len() - j - 1;
 
         //println!("p_val:{}", p_val);
-        let mut adjusted_p_val: f64 =
-            original_p_values[i].p_value_original * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
+        let mut adjusted_p_val: f64 = original_p_values[i].p_value_original * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
         if adjusted_p_val > 1.0 {
             // p_value should NEVER be greater than 1
             adjusted_p_val = 1.0;
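Note on adjust_p_values above: the inline comment (adjusted p-value = original_p_value * (N/rank)) is the Benjamini-Hochberg scaling, with the result capped at 1.0. Below is a standalone sketch of that computation; the running-minimum pass that keeps adjusted values monotone in rank is the standard BH step and is an assumption here, since the hunk only shows the scaling and the cap.

    // Benjamini-Hochberg adjustment: sort ascending, scale by N/rank, cap at 1.0,
    // and keep adjusted values non-decreasing with rank via a running minimum.
    fn benjamini_hochberg(mut p: Vec<f64>) -> Vec<f64> {
        p.sort_by(|a, b| a.partial_cmp(b).unwrap());
        let n = p.len() as f64;
        let mut adjusted = vec![0.0; p.len()];
        let mut running_min = 1.0_f64;
        for i in (0..p.len()).rev() {
            let rank = (i + 1) as f64;
            let val = (p[i] * n / rank).min(1.0); // adjusted = original * (N/rank), capped at 1
            running_min = running_min.min(val);
            adjusted[i] = running_min;
        }
        adjusted
    }

    fn main() {
        let adjusted = benjamini_hochberg(vec![0.01, 0.04, 0.03, 0.20]);
        println!("{:?}", adjusted); // [0.04, 0.0533…, 0.0533…, 0.2] for the sorted inputs
    }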
package/src/readHDF5.rs CHANGED
@@ -1,9 +1,9 @@
 //------------------------------------------------------------------------------
 // readHDF5.rs - HDF5 Gene Expression Data Reader
 //------------------------------------------------------------------------------
-// 
+//
 // Extracts gene expression values from HDF5 files in dense or sparse formats.
-// Supports single genes with memory optimization and multiple genes with 
+// Supports single genes with memory optimization and multiple genes with
 // parallel processing.
 //
 // Features:
@@ -12,16 +12,16 @@
 // - Parallel processing for multiple genes
 // - JSON output with timing metrics
 //
-// Usage: 
-//   HDF5_DIR=/usr/local/Homebrew/Cellar/hdf5/1.14.3_1 && 
+// Usage:
+//   HDF5_DIR=/usr/local/Homebrew/Cellar/hdf5/1.14.3_1 &&
 //   echo $json='{"gene":"TP53","hdf5_file":"matrix.h5"}' | target/release/readHDF5
 //------------------------------------------------------------------------------
-use rayon::prelude::*;
 use hdf5::types::{FixedAscii, VarLenAscii};
 use hdf5::{File, Result};
 use ndarray::Dim;
-use ndarray::{Array1, s};
-use serde_json::{Map, Value, json};
+use ndarray::{s, Array1};
+use rayon::prelude::*;
+use serde_json::{json, Map, Value};
 use std::io;
 use std::sync::Arc;
 use std::time::Instant;
@@ -259,11 +259,7 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
     // Create direct key-value pairs where sample names are the keys
     for i in 0..gene_expression.len() {
         // Add each sample name as a key pointing directly to its expression value
-        output_string += &format!(
-            "\"{}\":{}",
-            samples[i].to_string(),
-            gene_expression[i].to_string()
-        );
+        output_string += &format!("\"{}\":{}", samples[i].to_string(), gene_expression[i].to_string());
 
         // Add comma if not the last item
         if i < gene_expression.len() - 1 {
@@ -296,10 +292,7 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
                 Value::Null
             };
 
-            samples_map.insert(
-                sample.replace("\\", ""),
-                value,
-            );
+            samples_map.insert(sample.replace("\\", ""), value);
         }
     }
 
@@ -317,7 +310,7 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
 
 /// Reads expression data for a specific gene from a sparse format HDF5 file
 ///
-/// Extracts expression values from sparse matrix HDF5 files using Compressed 
+/// Extracts expression values from sparse matrix HDF5 files using Compressed
 /// Sparse Column (CSC) structure.
 ///
 /// # Arguments
@@ -391,15 +384,13 @@ fn query_gene_sparse(hdf5_filename: String, gene_name: String) -> Result<()> {
     // Find all column indices that are populated for the given gene
     let now_i = Instant::now();
     let ds_i = file.dataset("data/i")?;
-    let populated_column_ids: Array1<usize> =
-        ds_i.read_slice_1d(array_start_point..array_stop_point)?;
+    let populated_column_ids: Array1<usize> = ds_i.read_slice_1d(array_start_point..array_stop_point)?;
     println!("Time for i dataset:{:?}", now_i.elapsed());
 
     // Find all column values that are populated for the given gene
     let now_x = Instant::now();
     let ds_x = file.dataset("data/x")?;
-    let populated_column_values: Array1<f64> =
-        ds_x.read_slice_1d(array_start_point..array_stop_point)?;
+    let populated_column_values: Array1<f64> = ds_x.read_slice_1d(array_start_point..array_stop_point)?;
     println!("Time for x dataset:{:?}", now_x.elapsed());
 
     // Generate the complete array from the sparse array
@@ -425,10 +416,7 @@ fn query_gene_sparse(hdf5_filename: String, gene_name: String) -> Result<()> {
     }
     output_string += &"}".to_string();
 
-    println!(
-        "Time generating full array:{:?}",
-        time_generating_full_array.elapsed()
-    );
+    println!("Time generating full array:{:?}", time_generating_full_array.elapsed());
     println!("output_string:{}", output_string);
 
     Ok(())
@@ -465,7 +453,6 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
         }
     };
 
-
     let genes_dataset = match file.dataset("gene_ids") {
         Ok(ds) => ds,
         Err(err) => {
@@ -495,7 +482,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
     };
 
     let genes: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
-    
+
     // Only create HashMap for multiple gene queries
     let gene_to_index: Option<std::collections::HashMap<String, usize>> = if gene_names.len() > 1 {
         let hashmap_start_time = Instant::now();
@@ -504,8 +491,8 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
             map.insert(gene.clone(), idx);
         }
         timings.insert(
-            "build_hashmap_ms".to_string(), 
-            Value::from(hashmap_start_time.elapsed().as_millis() as u64)
+            "build_hashmap_ms".to_string(),
+            Value::from(hashmap_start_time.elapsed().as_millis() as u64),
         );
         Some(map)
     } else {
@@ -586,14 +573,11 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
     };
 
     // Configurable thread count for testing
-    let thread_count = 2; 
+    let thread_count = 2;
     timings.insert("thread_count".to_string(), Value::from(thread_count));
 
     // Create a scoped thread pool with specified number of threads
-    match rayon::ThreadPoolBuilder::new()
-        .num_threads(thread_count)
-        .build()
-    {
+    match rayon::ThreadPoolBuilder::new().num_threads(thread_count).build() {
         Ok(pool) => {
             // Use the pool for this specific work
             pool.install(|| {
@@ -650,26 +634,20 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
                             genes_map.insert(gene_name.clone(), gene_data);
                         } else {
                             // Fallback to per-gene reading if bulk load failed
-                            match counts_dataset
-                                .read_slice_1d::<f64, _>(s![gene_index, ..])
-                            {
+                            match counts_dataset.read_slice_1d::<f64, _>(s![gene_index, ..]) {
                                 Ok(gene_expression) => {
                                     // Create samples map for this gene
                                     let mut samples_map = Map::new();
                                     for (i, sample) in samples.iter().enumerate() {
                                         if i < gene_expression.len() {
                                             // Handle potential NaN or infinity values
-                                            let value =
-                                                if gene_expression[i].is_finite() {
-                                                    Value::from(gene_expression[i])
-                                                } else {
-                                                    Value::Null
-                                                };
-
-                                            samples_map.insert(
-                                                sample.replace("\\", ""),
-                                                value,
-                                            );
+                                            let value = if gene_expression[i].is_finite() {
+                                                Value::from(gene_expression[i])
+                                            } else {
+                                                Value::Null
+                                            };
+
+                                            samples_map.insert(sample.replace("\\", ""), value);
                                         }
                                     }
 
@@ -693,10 +671,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
                                     );
 
                                     let mut genes_map = genes_map.lock().unwrap();
-                                    genes_map.insert(
-                                        gene_name.clone(),
-                                        Value::Object(error_map),
-                                    );
+                                    genes_map.insert(gene_name.clone(), Value::Object(error_map));
                                 }
                             }
                         }
@@ -736,7 +711,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
                     &counts_dataset,
                     &all_gene_data,
                     &samples,
-                    &genes_map
+                    &genes_map,
                 );
             }
         }
@@ -758,7 +733,6 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
         // Read just this single gene's data directly
         match counts_dataset.read_slice_1d::<f64, _>(s![gene_index, ..]) {
             Ok(gene_expression) => {
-
                 // Create samples map for this gene
                 let mut samples_map = Map::new();
                 for (i, sample) in samples.iter().enumerate() {
@@ -786,10 +760,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
                 let mut error_map = Map::new();
                 error_map.insert(
                     "error".to_string(),
-                    Value::String(format!(
-                        "Failed to read expression values: {:?}",
-                        err
-                    )),
+                    Value::String(format!("Failed to read expression values: {:?}", err)),
                 );
 
                 let mut genes_map = genes_map.lock().unwrap();
@@ -833,7 +804,7 @@ fn process_genes_sequentially(
     counts_dataset: &hdf5::Dataset,
     all_gene_data: &Option<ndarray::ArrayBase<ndarray::OwnedRepr<f64>, ndarray::Dim<[usize; 2]>>>,
     samples: &Vec<String>,
-    genes_map: &Arc<std::sync::Mutex<Map<String, Value>>>
+    genes_map: &Arc<std::sync::Mutex<Map<String, Value>>>,
 ) {
     for gene_name in gene_names {
         // Find the index of the requested gene, using HashMap if available
@@ -911,10 +882,7 @@ fn process_genes_sequentially(
             let mut error_map = Map::new();
             error_map.insert(
                 "error".to_string(),
-                Value::String(format!(
-                    "Failed to read expression values: {:?}",
-                    err1
-                )),
+                Value::String(format!("Failed to read expression values: {:?}", err1)),
             );
 
             let mut genes_map = genes_map.lock().unwrap();
@@ -935,7 +903,6 @@ fn process_genes_sequentially(
                genes_map.insert(gene_name.clone(), Value::Object(error_map));
            }
        }
-
    }
 }
 /// Queries expression data for multiple genes from a sparse format HDF5 file
@@ -1006,7 +973,6 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -
     let num_threads = num_cpus::get();
     timings.insert("num_threads".to_string(), Value::from(num_threads as u64));
 
-
     // Thread-safe maps for results
     let genes_map = Arc::new(std::sync::Mutex::new(Map::new()));
     let gene_timings = Arc::new(std::sync::Mutex::new(Map::new()));
@@ -1041,8 +1007,7 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -
             // Read data for this gene
             match ds_i.read_slice_1d::<usize, _>(array_start_point..array_stop_point) {
                 Ok(populated_column_ids) => {
-                    match ds_x.read_slice_1d::<f64, _>(array_start_point..array_stop_point)
-                    {
+                    match ds_x.read_slice_1d::<f64, _>(array_start_point..array_stop_point) {
                         Ok(populated_column_values) => {
                             // Generate the complete array from sparse representation
                             let mut gene_array: Array1<f64> = Array1::zeros(num_samples);
@@ -1061,8 +1026,7 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -
                                     Value::Null
                                 };
 
-                                samples_map
-                                    .insert(sample.to_string().replace("\\", ""), value);
+                                samples_map.insert(sample.to_string().replace("\\", ""), value);
                             }
 
                             let gene_data = json!({
@@ -1077,10 +1041,7 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -
                             let mut error_map = Map::new();
                             error_map.insert(
                                 "error".to_string(),
-                                Value::String(format!(
-                                    "Failed to read x dataset: {:?}",
-                                    err
-                                )),
+                                Value::String(format!("Failed to read x dataset: {:?}", err)),
                             );
 
                             let mut genes_map = genes_map.lock().unwrap();
@@ -6,7 +6,6 @@ use hdf5::{File, Result};
 use ndarray::Array1;
 use ndarray::Dim;
 use std::io;
-use serde_json;
 
 /// Detects the format of the HDF5 file
 pub fn detect_hdf5_format(hdf5_filename: &str) -> Result<&'static str> {
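Note on the sparse readers above: query_gene_sparse and query_multiple_genes_sparse slice the CSC arrays (data/i holds the populated sample indices, data/x the matching values) and scatter one gene's slice into a zero-filled dense vector. Below is a self-contained sketch of that scatter step, with plain vectors standing in for the HDF5 datasets; the offsets array p is an assumption here, since the hunks shown do not include how array_start_point and array_stop_point are derived.

    // CSC-style lookup: p[gene]..p[gene + 1] bounds the gene's slice of (i, x);
    // every sample index not named in that slice keeps the implicit value 0.0.
    fn gene_row_dense(p: &[usize], i: &[usize], x: &[f64], gene: usize, num_samples: usize) -> Vec<f64> {
        let (start, stop) = (p[gene], p[gene + 1]);
        let mut dense = vec![0.0; num_samples];
        for k in start..stop {
            dense[i[k]] = x[k]; // scatter each stored (sample index, value) pair
        }
        dense
    }

    fn main() {
        // Two genes, four samples; gene 0 is non-zero in samples 1 and 2,
        // gene 1 in samples 0 and 3.
        let p = vec![0, 2, 4];
        let i = vec![1, 2, 0, 3];
        let x = vec![5.0, 7.0, 2.5, 9.0];
        assert_eq!(gene_row_dense(&p, &i, &x, 0, 4), vec![0.0, 5.0, 7.0, 0.0]);
        assert_eq!(gene_row_dense(&p, &i, &x, 1, 4), vec![2.5, 0.0, 0.0, 9.0]);
    }

This is why only two 1-D reads per gene are needed in the diff: the slice boundaries locate the gene's run inside data/i and data/x, and everything outside that run is implicitly zero.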