@sjcrh/proteinpaint-rust 2.114.0 → 2.117.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +2 -1
- package/package.json +2 -2
- package/src/readHDF5.rs +809 -240
- package/src/topGeneByExpressionVariance.rs +515 -33
package/src/readHDF5.rs
CHANGED
|
@@ -1,76 +1,43 @@
|
|
|
1
|
-
|
|
2
|
-
//
|
|
3
|
-
|
|
4
|
-
//
|
|
1
|
+
//------------------------------------------------------------------------------
|
|
2
|
+
// readHDF5.rs - HDF5 Gene Expression Data Reader
|
|
3
|
+
//------------------------------------------------------------------------------
|
|
4
|
+
//
|
|
5
|
+
// Extracts gene expression values from HDF5 files in dense or sparse formats.
|
|
6
|
+
// Supports single genes with memory optimization and multiple genes with
|
|
7
|
+
// parallel processing.
|
|
8
|
+
//
|
|
9
|
+
// Features:
|
|
10
|
+
// - Auto format detection (dense/sparse)
|
|
11
|
+
// - Optimized single and multi-gene queries
|
|
12
|
+
// - Parallel processing for multiple genes
|
|
13
|
+
// - JSON output with timing metrics
|
|
14
|
+
//
|
|
15
|
+
// Usage:
|
|
16
|
+
// HDF5_DIR=/usr/local/Homebrew/Cellar/hdf5/1.14.3_1 &&
|
|
17
|
+
// echo $json='{"gene":"TP53","hdf5_file":"matrix.h5"}' | target/release/readHDF5
|
|
18
|
+
//------------------------------------------------------------------------------
|
|
5
19
|
use hdf5::types::{FixedAscii, VarLenAscii};
|
|
6
20
|
use hdf5::{File, Result};
|
|
7
21
|
use ndarray::Dim;
|
|
8
22
|
use ndarray::{Array1, s};
|
|
23
|
+
use rayon::prelude::*;
|
|
9
24
|
use serde_json::{Map, Value, json};
|
|
10
25
|
use std::io;
|
|
26
|
+
use std::sync::Arc;
|
|
11
27
|
use std::time::Instant;
|
|
12
28
|
|
|
13
29
|
/// Determines the format of an HDF5 gene expression file
|
|
14
30
|
///
|
|
15
|
-
///
|
|
16
|
-
///
|
|
17
|
-
///
|
|
18
|
-
///
|
|
19
|
-
///
|
|
20
|
-
/// # HDF5 Format Specifications
|
|
21
|
-
///
|
|
22
|
-
/// The function identifies the following formats:
|
|
23
|
-
///
|
|
24
|
-
/// - **Dense format**:
|
|
25
|
-
/// - Contains a "counts" dataset (2D matrix of gene expression values)
|
|
26
|
-
/// - Contains a "gene_names" dataset (gene identifiers)
|
|
27
|
-
/// - Contains a "samples" dataset (sample identifiers)
|
|
28
|
-
///
|
|
29
|
-
/// - **Sparse format**:
|
|
30
|
-
/// - Contains a "data" group with sparse matrix components
|
|
31
|
-
/// - Contains a "sample_names" dataset
|
|
32
|
-
///
|
|
33
|
-
/// - **Unknown format**:
|
|
34
|
-
/// - Does not match either the dense or sparse format criteria
|
|
31
|
+
/// Examines the structure of an HDF5 file to detect its format:
|
|
32
|
+
/// - "dense": Contains "counts", "gene_names", and "samples" datasets
|
|
33
|
+
/// - "sparse": Contains "data" group and "sample_names" dataset
|
|
34
|
+
/// - "unknown": Does not match either format
|
|
35
35
|
///
|
|
36
36
|
/// # Arguments
|
|
37
|
-
///
|
|
38
37
|
/// * `hdf5_filename` - Path to the HDF5 file to analyze
|
|
39
38
|
///
|
|
40
39
|
/// # Returns
|
|
41
|
-
///
|
|
42
|
-
/// A result containing one of the following static string values:
|
|
43
|
-
/// - `"dense"` - If the file is in dense matrix format
|
|
44
|
-
/// - `"sparse"` - If the file is in sparse matrix format
|
|
45
|
-
/// - `"unknown"` - If the file format cannot be determined
|
|
46
|
-
///
|
|
47
|
-
/// # Errors
|
|
48
|
-
///
|
|
49
|
-
/// This function will return an error if:
|
|
50
|
-
/// - The file cannot be opened
|
|
51
|
-
/// - The file is not a valid HDF5 file
|
|
52
|
-
///
|
|
53
|
-
/// # Algorithm
|
|
54
|
-
///
|
|
55
|
-
/// The detection algorithm works by checking for the presence of specific datasets
|
|
56
|
-
/// and groups that are characteristic of each format:
|
|
57
|
-
///
|
|
58
|
-
/// 1. Opens the HDF5 file
|
|
59
|
-
/// 2. Checks for datasets/groups that indicate dense format
|
|
60
|
-
/// 3. Checks for datasets/groups that indicate sparse format
|
|
61
|
-
/// 4. Returns the detected format or "unknown"
|
|
62
|
-
///
|
|
63
|
-
/// # Examples
|
|
64
|
-
///
|
|
65
|
-
/// ```rust
|
|
66
|
-
/// // Example usage (not runnable)
|
|
67
|
-
/// match detect_hdf5_format("expression_data.h5") {
|
|
68
|
-
/// Ok("dense") => println!("Dense format detected"),
|
|
69
|
-
/// Ok("sparse") => println!("Sparse format detected"),
|
|
70
|
-
/// Ok("unknown") => println!("Unknown format detected"),
|
|
71
|
-
/// Err(e) => println!("Error: {}", e),
|
|
72
|
-
/// }
|
|
73
|
-
/// ```
|
|
40
|
+
/// The detected format as a static string: "dense", "sparse", or "unknown"
|
|
74
41
|
fn detect_hdf5_format(hdf5_filename: &str) -> Result<&'static str> {
|
|
75
42
|
let file = File::open(hdf5_filename)?;
|
|
76
43
|
|
|
@@ -95,63 +62,16 @@ fn detect_hdf5_format(hdf5_filename: &str) -> Result<&'static str> {
|
|
|
95
62
|
}
|
|
96
63
|
}
|
|
97
64
|
|
|
98
|
-
/// Unified function for querying gene expression data from
|
|
65
|
+
/// Unified function for querying gene expression data from an HDF5 file
|
|
99
66
|
///
|
|
100
|
-
///
|
|
101
|
-
/// from an HDF5 file. It automatically detects the format of the provided file (dense or sparse)
|
|
102
|
-
/// and routes the query to the appropriate specialized handler function.
|
|
103
|
-
///
|
|
104
|
-
/// # Supported HDF5 Formats
|
|
105
|
-
///
|
|
106
|
-
/// - **Dense format**: Contains explicit "gene_ids", "samples", and "counts" datasets where
|
|
107
|
-
/// the expression matrix is stored as a direct 2D array
|
|
108
|
-
/// - **Sparse format**: Contains a "data" group with "p", "i", "x" datasets using the
|
|
109
|
-
/// Compressed Sparse Column (CSC) representation for the expression matrix
|
|
67
|
+
/// Automatically detects file format (dense or sparse) and routes to the appropriate handler.
|
|
110
68
|
///
|
|
111
69
|
/// # Arguments
|
|
112
|
-
///
|
|
113
70
|
/// * `hdf5_filename` - Path to the HDF5 file containing gene expression data
|
|
114
71
|
/// * `gene_name` - Name of the gene whose expression values to extract
|
|
115
72
|
///
|
|
116
73
|
/// # Returns
|
|
117
|
-
///
|
|
118
|
-
/// A result indicating success or error. On success, the function prints the gene
|
|
119
|
-
/// expression data in JSON format to stdout for dense matrix HDF5 files. For spare matrix files it
|
|
120
|
-
/// sends the expression data in JSON format with "output_string:" prefix to stdout.
|
|
121
|
-
///
|
|
122
|
-
/// # Example Output Format
|
|
123
|
-
///
|
|
124
|
-
/// ```json
|
|
125
|
-
/// {
|
|
126
|
-
/// "gene": "TP53",
|
|
127
|
-
/// "dataId": "TP53",
|
|
128
|
-
/// "samples": {
|
|
129
|
-
/// "sample1": 10.5,
|
|
130
|
-
/// "sample2": 8.2,
|
|
131
|
-
/// "sample3": 15.7
|
|
132
|
-
/// }
|
|
133
|
-
/// }
|
|
134
|
-
/// ```
|
|
135
|
-
///
|
|
136
|
-
/// # Error Handling
|
|
137
|
-
///
|
|
138
|
-
/// The function handles several types of errors:
|
|
139
|
-
/// - File format detection failures
|
|
140
|
-
/// - Unsupported or unknown file formats
|
|
141
|
-
/// - Errors from the format-specific query functions
|
|
142
|
-
///
|
|
143
|
-
/// When an error occurs, the function returns a structured JSON error message.
|
|
144
|
-
///
|
|
145
|
-
/// # Processing Flow
|
|
146
|
-
///
|
|
147
|
-
/// 1. Detects the format of the HDF5 file using `detect_hdf5_format`
|
|
148
|
-
/// 2. Routes to the appropriate specialized function:
|
|
149
|
-
/// - `query_gene_dense` for dense matrix files
|
|
150
|
-
/// - `query_gene_sparse` for sparse matrix files
|
|
151
|
-
/// 3. Returns an error for unsupported formats
|
|
152
|
-
///
|
|
153
|
-
/// This unified approach allows client code to work with either format without needing
|
|
154
|
-
/// to know the specific structure of the underlying HDF5 file.
|
|
74
|
+
/// Outputs gene expression data in JSON format to stdout
|
|
155
75
|
fn query_gene(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
156
76
|
// First, detect the file format
|
|
157
77
|
let file_format = detect_hdf5_format(&hdf5_filename)?;
|
|
@@ -161,7 +81,6 @@ fn query_gene(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
161
81
|
"dense" => query_gene_dense(hdf5_filename, gene_name),
|
|
162
82
|
"sparse" => query_gene_sparse(hdf5_filename, gene_name),
|
|
163
83
|
_ => {
|
|
164
|
-
// For unknown format, return an error
|
|
165
84
|
println!(
|
|
166
85
|
"{}",
|
|
167
86
|
serde_json::json!({
|
|
@@ -179,62 +98,18 @@ fn query_gene(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
179
98
|
|
|
180
99
|
/// Reads expression data for a specific gene from a dense format HDF5 file
|
|
181
100
|
///
|
|
182
|
-
///
|
|
183
|
-
/// that follows the dense matrix format. The dense format is characterized by:
|
|
184
|
-
/// - A "gene_ids" dataset containing gene identifiers
|
|
185
|
-
/// - A "samples" dataset containing sample identifiers
|
|
186
|
-
/// - A "counts" dataset containing a gene × sample expression matrix
|
|
187
|
-
///
|
|
188
|
-
/// The function returns the expression values in a JSON format where sample names
|
|
189
|
-
/// are keys and their corresponding expression values are the values.
|
|
101
|
+
/// Dense format contains "gene_ids", "samples", and "counts" datasets.
|
|
190
102
|
///
|
|
191
103
|
/// # Arguments
|
|
192
|
-
///
|
|
193
104
|
/// * `hdf5_filename` - Path to the HDF5 file
|
|
194
105
|
/// * `gene_name` - Name of the gene to query
|
|
195
106
|
///
|
|
196
107
|
/// # Returns
|
|
197
|
-
///
|
|
198
|
-
/// A result indicating success or error. On success, the function prints the gene
|
|
199
|
-
/// expression data in JSON format to stdout.
|
|
200
|
-
///
|
|
201
|
-
/// # Output Format
|
|
202
|
-
///
|
|
203
|
-
/// ```json
|
|
204
|
-
/// {
|
|
205
|
-
/// "gene": "GENE_NAME",
|
|
206
|
-
/// "dataId": "GENE_NAME",
|
|
207
|
-
/// "samples": {
|
|
208
|
-
/// "SAMPLE1": VALUE1,
|
|
209
|
-
/// "SAMPLE2": VALUE2,
|
|
210
|
-
/// ...
|
|
211
|
-
/// }
|
|
212
|
-
/// }
|
|
213
|
-
/// ```
|
|
108
|
+
/// Prints gene expression data in JSON format to stdout
|
|
214
109
|
///
|
|
215
110
|
/// # Error Handling
|
|
216
|
-
///
|
|
217
|
-
/// The function handles several potential errors:
|
|
218
|
-
/// - File opening errors
|
|
219
|
-
/// - Missing or inaccessible datasets ("gene_ids", "samples", "counts")
|
|
220
|
-
/// - Gene not found in the dataset
|
|
221
|
-
/// - Out of bounds gene index
|
|
222
|
-
/// - Expression data reading failures
|
|
223
|
-
///
|
|
224
|
-
/// If an error occurs, the function returns an explanatory error message in JSON format.
|
|
225
|
-
///
|
|
226
|
-
/// # Reading Strategy
|
|
227
|
-
///
|
|
228
|
-
/// The function tries two methods to read expression data:
|
|
229
|
-
/// 1. First attempts to read a 1D slice directly from the counts dataset
|
|
230
|
-
/// 2. If that fails, tries reading the entire dataset and extracting the row of interest
|
|
231
|
-
///
|
|
232
|
-
/// This dual approach ensures compatibility with different HDF5 library implementations
|
|
233
|
-
/// and dataset configurations.
|
|
111
|
+
/// Handles file access issues, missing datasets, and gene not found scenarios
|
|
234
112
|
fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
235
|
-
// let start_time = Instant::now();
|
|
236
|
-
|
|
237
|
-
// Open the HDF5 file
|
|
238
113
|
let file = match File::open(hdf5_filename) {
|
|
239
114
|
Ok(f) => f,
|
|
240
115
|
Err(err) => {
|
|
@@ -249,7 +124,6 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
249
124
|
}
|
|
250
125
|
};
|
|
251
126
|
|
|
252
|
-
// Read gene ids using VarLenAscii
|
|
253
127
|
let genes_dataset = match file.dataset("gene_ids") {
|
|
254
128
|
Ok(ds) => ds,
|
|
255
129
|
Err(err) => {
|
|
@@ -264,7 +138,6 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
264
138
|
}
|
|
265
139
|
};
|
|
266
140
|
|
|
267
|
-
// Read genes as VarLenAscii
|
|
268
141
|
let genes_varlen = match genes_dataset.read_1d::<VarLenAscii>() {
|
|
269
142
|
Ok(g) => g,
|
|
270
143
|
Err(err) => {
|
|
@@ -282,7 +155,6 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
282
155
|
// Convert to Vec<String> for easier handling
|
|
283
156
|
let genes: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
|
|
284
157
|
|
|
285
|
-
// Read sample names using VarLenAscii
|
|
286
158
|
let samples_dataset = match file.dataset("samples") {
|
|
287
159
|
Ok(ds) => ds,
|
|
288
160
|
Err(err) => {
|
|
@@ -297,7 +169,6 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
297
169
|
}
|
|
298
170
|
};
|
|
299
171
|
|
|
300
|
-
// Read samples as VarLenAscii
|
|
301
172
|
let samples_varlen = match samples_dataset.read_1d::<VarLenAscii>() {
|
|
302
173
|
Ok(s) => s,
|
|
303
174
|
Err(err) => {
|
|
@@ -330,7 +201,6 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
330
201
|
}
|
|
331
202
|
};
|
|
332
203
|
|
|
333
|
-
// Read the expression data for the gene
|
|
334
204
|
let counts_dataset = match file.dataset("counts") {
|
|
335
205
|
Ok(ds) => ds,
|
|
336
206
|
Err(err) => {
|
|
@@ -345,7 +215,6 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
345
215
|
}
|
|
346
216
|
};
|
|
347
217
|
|
|
348
|
-
// Make sure the gene index is valid for this dataset
|
|
349
218
|
if gene_index >= counts_dataset.shape()[0] {
|
|
350
219
|
println!(
|
|
351
220
|
"{}",
|
|
@@ -357,7 +226,6 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
357
226
|
return Ok(());
|
|
358
227
|
}
|
|
359
228
|
|
|
360
|
-
// Try to read the expression data
|
|
361
229
|
let gene_expression: Array1<f64>;
|
|
362
230
|
|
|
363
231
|
// Method 1: Try to read a 1D slice directly (for 2D datasets)
|
|
@@ -367,8 +235,6 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
367
235
|
}
|
|
368
236
|
Err(err1) => {
|
|
369
237
|
// Method 2: Try a different approach
|
|
370
|
-
|
|
371
|
-
// First get the dimensions
|
|
372
238
|
let dataset_shape = counts_dataset.shape();
|
|
373
239
|
if dataset_shape.len() != 2 {
|
|
374
240
|
println!(
|
|
@@ -388,7 +254,6 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
388
254
|
let row = all_data.slice(s![gene_index, ..]).to_owned();
|
|
389
255
|
gene_expression = row;
|
|
390
256
|
|
|
391
|
-
// Start building a flatter JSON structure
|
|
392
257
|
let mut output_string = String::from("{\"samples\":{");
|
|
393
258
|
|
|
394
259
|
// Create direct key-value pairs where sample names are the keys
|
|
@@ -408,8 +273,6 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
408
273
|
|
|
409
274
|
// Close the JSON object
|
|
410
275
|
output_string += "}}";
|
|
411
|
-
|
|
412
|
-
// println!("{}", output_string);
|
|
413
276
|
}
|
|
414
277
|
Err(err2) => {
|
|
415
278
|
println!(
|
|
@@ -424,26 +287,22 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
424
287
|
}
|
|
425
288
|
}
|
|
426
289
|
}
|
|
427
|
-
// Create samples map
|
|
428
290
|
let mut samples_map = Map::new();
|
|
429
291
|
for (i, sample) in samples.iter().enumerate() {
|
|
430
292
|
if i < gene_expression.len() {
|
|
431
|
-
// Add each sample to the map, clean the sample name and convert value to JSON Number
|
|
432
|
-
// Note: We need to handle potential NaN or infinity values that aren't valid in JSON
|
|
433
293
|
let value = if gene_expression[i].is_finite() {
|
|
434
294
|
Value::from(gene_expression[i])
|
|
435
295
|
} else {
|
|
436
|
-
Value::Null
|
|
296
|
+
Value::Null
|
|
437
297
|
};
|
|
438
298
|
|
|
439
299
|
samples_map.insert(
|
|
440
|
-
sample.replace("\\", ""),
|
|
300
|
+
sample.replace("\\", ""),
|
|
441
301
|
value,
|
|
442
302
|
);
|
|
443
303
|
}
|
|
444
304
|
}
|
|
445
305
|
|
|
446
|
-
// Build the complete JSON structure
|
|
447
306
|
let output_json = json!({
|
|
448
307
|
"gene": gene_name,
|
|
449
308
|
"dataId": gene_name,
|
|
@@ -456,74 +315,24 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
456
315
|
Ok(())
|
|
457
316
|
}
|
|
458
317
|
|
|
459
|
-
/// Reads expression data for a specific gene from a sparse format HDF5 file
|
|
460
|
-
///
|
|
461
|
-
/// This function extracts expression values for a specified gene from an HDF5 file
|
|
462
|
-
/// that uses a sparse matrix representation. Sparse matrices are efficient for storing
|
|
463
|
-
/// genomic data where many genes have zero expression in many samples. The sparse
|
|
464
|
-
/// format follows the Compressed Sparse Column (CSC) structure with:
|
|
318
|
+
/// Reads expression data for a specific gene from a sparse format HDF5 file
|
|
465
319
|
///
|
|
466
|
-
///
|
|
467
|
-
///
|
|
468
|
-
/// - A "sample_names" dataset containing sample identifiers
|
|
469
|
-
/// - A "data/p" dataset containing pointers to where each gene's data starts and ends
|
|
470
|
-
/// - A "data/i" dataset containing column indices for non-zero values
|
|
471
|
-
/// - A "data/x" dataset containing the actual non-zero expression values
|
|
320
|
+
/// Extracts expression values from sparse matrix HDF5 files using Compressed
|
|
321
|
+
/// Sparse Column (CSC) structure.
|
|
472
322
|
///
|
|
473
323
|
/// # Arguments
|
|
474
|
-
///
|
|
475
324
|
/// * `hdf5_filename` - Path to the HDF5 file
|
|
476
325
|
/// * `gene_name` - Name of the gene to query
|
|
477
326
|
///
|
|
478
327
|
/// # Returns
|
|
328
|
+
/// Prints gene expression data as JSON to stdout with "output_string:" prefix.
|
|
329
|
+
/// Sample names are keys, expression values are values.
|
|
479
330
|
///
|
|
480
|
-
///
|
|
481
|
-
///
|
|
482
|
-
///
|
|
483
|
-
///
|
|
484
|
-
///
|
|
485
|
-
/// The function outputs a JSON object where sample names are keys and their
|
|
486
|
-
/// corresponding expression values are the values:
|
|
487
|
-
///
|
|
488
|
-
/// ```json
|
|
489
|
-
/// {
|
|
490
|
-
/// "sample1": 0.0,
|
|
491
|
-
/// "sample2": 4.5,
|
|
492
|
-
/// "sample3": 0.0,
|
|
493
|
-
/// "sample4": 7.2,
|
|
494
|
-
/// ...
|
|
495
|
-
/// }
|
|
496
|
-
/// ```
|
|
497
|
-
///
|
|
498
|
-
/// # Algorithm
|
|
499
|
-
///
|
|
500
|
-
/// 1. Opens the HDF5 file and reads matrix dimensions
|
|
501
|
-
/// 2. Reads gene and sample names
|
|
502
|
-
/// 3. Finds the index of the requested gene
|
|
503
|
-
/// 4. Reads the sparse representation:
|
|
504
|
-
/// - Gets pointers from "data/p" to determine which values belong to the gene
|
|
505
|
-
/// - Reads column indices from "data/i" to know which samples have non-zero values
|
|
506
|
-
/// - Reads actual values from "data/x"
|
|
507
|
-
/// 5. Reconstructs a dense vector from the sparse representation
|
|
508
|
-
/// 6. Formats and outputs the result as JSON
|
|
509
|
-
///
|
|
510
|
-
/// # Performance Tracking
|
|
511
|
-
///
|
|
512
|
-
/// The function tracks performance at various stages using timestamps:
|
|
513
|
-
/// - Time spent parsing genes
|
|
514
|
-
/// - Time spent parsing samples
|
|
515
|
-
/// - Time spent reading the p, i, and x datasets
|
|
516
|
-
/// - Time spent generating the full array from sparse representation
|
|
517
|
-
///
|
|
518
|
-
/// # Error Handling
|
|
519
|
-
///
|
|
520
|
-
/// The function handles several potential errors:
|
|
521
|
-
/// - File opening failures
|
|
522
|
-
/// - Dataset access failures
|
|
523
|
-
/// - Gene not found in the dataset
|
|
524
|
-
/// - Sparse matrix reading failures
|
|
525
|
-
///
|
|
526
|
-
/// If an error occurs, the function returns a structured JSON error message.
|
|
331
|
+
/// The sparse format includes:
|
|
332
|
+
/// - "data/dim" - Matrix dimensions
|
|
333
|
+
/// - "gene_names" - Gene identifiers
|
|
334
|
+
/// - "sample_names" - Sample identifiers
|
|
335
|
+
/// - "data/p", "data/i", "data/x" - CSC matrix components
|
|
527
336
|
fn query_gene_sparse(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
528
337
|
let file = File::open(&hdf5_filename)?;
|
|
529
338
|
let ds_dim = file.dataset("data/dim")?;
|
|
@@ -602,7 +411,6 @@ fn query_gene_sparse(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
602
411
|
gene_array[col_id] = populated_column_values[idx];
|
|
603
412
|
}
|
|
604
413
|
|
|
605
|
-
// Format output as JSON
|
|
606
414
|
let mut output_string = "{".to_string();
|
|
607
415
|
for i in 0..gene_array.len() {
|
|
608
416
|
output_string += &format!(
|
|
@@ -626,7 +434,705 @@ fn query_gene_sparse(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
626
434
|
Ok(())
|
|
627
435
|
}
|
|
628
436
|
|
|
629
|
-
|
|
437
|
+
/// Queries expression data for multiple genes from a dense format HDF5 file
|
|
438
|
+
///
|
|
439
|
+
/// Extracts expression values for multiple genes from a dense matrix HDF5 file,
|
|
440
|
+
/// optimizing for both single gene (linear search) and multi-gene (hashmap) queries.
|
|
441
|
+
///
|
|
442
|
+
/// # Arguments
|
|
443
|
+
/// * `hdf5_filename` - Path to the HDF5 file
|
|
444
|
+
/// * `gene_names` - Vector of gene names to query
|
|
445
|
+
///
|
|
446
|
+
/// # Returns
|
|
447
|
+
/// Prints a JSON object with expression data for all requested genes to stdout.
|
|
448
|
+
fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
|
|
449
|
+
let overall_start_time = Instant::now();
|
|
450
|
+
|
|
451
|
+
// Create timing map to store all timing data
|
|
452
|
+
let mut timings = Map::new();
|
|
453
|
+
|
|
454
|
+
let file = match File::open(&hdf5_filename) {
|
|
455
|
+
Ok(f) => f,
|
|
456
|
+
Err(err) => {
|
|
457
|
+
println!(
|
|
458
|
+
"{}",
|
|
459
|
+
serde_json::json!({
|
|
460
|
+
"status": "error",
|
|
461
|
+
"message": format!("Failed to open HDF5 file: {}", err)
|
|
462
|
+
})
|
|
463
|
+
);
|
|
464
|
+
return Ok(());
|
|
465
|
+
}
|
|
466
|
+
};
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
let genes_dataset = match file.dataset("gene_ids") {
|
|
470
|
+
Ok(ds) => ds,
|
|
471
|
+
Err(err) => {
|
|
472
|
+
println!(
|
|
473
|
+
"{}",
|
|
474
|
+
serde_json::json!({
|
|
475
|
+
"status": "error",
|
|
476
|
+
"message": format!("Failed to open gene_ids dataset: {}", err)
|
|
477
|
+
})
|
|
478
|
+
);
|
|
479
|
+
return Ok(());
|
|
480
|
+
}
|
|
481
|
+
};
|
|
482
|
+
|
|
483
|
+
let genes_varlen = match genes_dataset.read_1d::<VarLenAscii>() {
|
|
484
|
+
Ok(g) => g,
|
|
485
|
+
Err(err) => {
|
|
486
|
+
println!(
|
|
487
|
+
"{}",
|
|
488
|
+
serde_json::json!({
|
|
489
|
+
"status": "error",
|
|
490
|
+
"message": format!("Failed to read gene names as VarLenAscii: {}", err)
|
|
491
|
+
})
|
|
492
|
+
);
|
|
493
|
+
return Ok(());
|
|
494
|
+
}
|
|
495
|
+
};
|
|
496
|
+
|
|
497
|
+
let genes: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
|
|
498
|
+
|
|
499
|
+
// Only create HashMap for multiple gene queries
|
|
500
|
+
let gene_to_index: Option<std::collections::HashMap<String, usize>> = if gene_names.len() > 1 {
|
|
501
|
+
let hashmap_start_time = Instant::now();
|
|
502
|
+
let mut map = std::collections::HashMap::with_capacity(genes.len());
|
|
503
|
+
for (idx, gene) in genes.iter().enumerate() {
|
|
504
|
+
map.insert(gene.clone(), idx);
|
|
505
|
+
}
|
|
506
|
+
timings.insert(
|
|
507
|
+
"build_hashmap_ms".to_string(),
|
|
508
|
+
Value::from(hashmap_start_time.elapsed().as_millis() as u64)
|
|
509
|
+
);
|
|
510
|
+
Some(map)
|
|
511
|
+
} else {
|
|
512
|
+
// Skip HashMap creation for single gene queries
|
|
513
|
+
None
|
|
514
|
+
};
|
|
515
|
+
|
|
516
|
+
let samples_dataset = match file.dataset("samples") {
|
|
517
|
+
Ok(ds) => ds,
|
|
518
|
+
Err(err) => {
|
|
519
|
+
println!(
|
|
520
|
+
"{}",
|
|
521
|
+
serde_json::json!({
|
|
522
|
+
"status": "error",
|
|
523
|
+
"message": format!("Failed to open samples dataset: {}", err)
|
|
524
|
+
})
|
|
525
|
+
);
|
|
526
|
+
return Ok(());
|
|
527
|
+
}
|
|
528
|
+
};
|
|
529
|
+
|
|
530
|
+
let samples_varlen = match samples_dataset.read_1d::<VarLenAscii>() {
|
|
531
|
+
Ok(s) => s,
|
|
532
|
+
Err(err) => {
|
|
533
|
+
println!(
|
|
534
|
+
"{}",
|
|
535
|
+
serde_json::json!({
|
|
536
|
+
"status": "error",
|
|
537
|
+
"message": format!("Failed to read samples as VarLenAscii: {}", err)
|
|
538
|
+
})
|
|
539
|
+
);
|
|
540
|
+
return Ok(());
|
|
541
|
+
}
|
|
542
|
+
};
|
|
543
|
+
|
|
544
|
+
let samples: Vec<String> = samples_varlen.iter().map(|s| s.to_string()).collect();
|
|
545
|
+
|
|
546
|
+
let counts_dataset = match file.dataset("counts") {
|
|
547
|
+
Ok(ds) => ds,
|
|
548
|
+
Err(err) => {
|
|
549
|
+
println!(
|
|
550
|
+
"{}",
|
|
551
|
+
serde_json::json!({
|
|
552
|
+
"status": "error",
|
|
553
|
+
"message": format!("Failed to open counts dataset: {}", err)
|
|
554
|
+
})
|
|
555
|
+
);
|
|
556
|
+
return Ok(());
|
|
557
|
+
}
|
|
558
|
+
};
|
|
559
|
+
|
|
560
|
+
// Create thread-local storage for results
|
|
561
|
+
let genes_map = Arc::new(std::sync::Mutex::new(Map::new()));
|
|
562
|
+
let gene_timings = Arc::new(std::sync::Mutex::new(Map::new()));
|
|
563
|
+
|
|
564
|
+
if gene_names.len() > 1 {
|
|
565
|
+
// For multiple genes: preload all data and use parallel processing
|
|
566
|
+
timings.insert("parallel_processing".to_string(), Value::from(true));
|
|
567
|
+
|
|
568
|
+
// Load all gene data upfront only when processing multiple genes
|
|
569
|
+
let all_data_start_time = Instant::now();
|
|
570
|
+
let all_gene_data = match counts_dataset.read::<f64, Dim<[usize; 2]>>() {
|
|
571
|
+
Ok(data) => {
|
|
572
|
+
timings.insert(
|
|
573
|
+
"read_all_gene_data_ms".to_string(),
|
|
574
|
+
Value::from(all_data_start_time.elapsed().as_millis() as u64),
|
|
575
|
+
);
|
|
576
|
+
Some(data)
|
|
577
|
+
}
|
|
578
|
+
Err(err) => {
|
|
579
|
+
// Failed to read all data at once, will fallback to per-gene reading
|
|
580
|
+
timings.insert(
|
|
581
|
+
"read_all_gene_data_error".to_string(),
|
|
582
|
+
Value::String(format!("{:?}", err)),
|
|
583
|
+
);
|
|
584
|
+
None
|
|
585
|
+
}
|
|
586
|
+
};
|
|
587
|
+
|
|
588
|
+
// Configurable thread count for testing
|
|
589
|
+
let thread_count = 2;
|
|
590
|
+
timings.insert("thread_count".to_string(), Value::from(thread_count));
|
|
591
|
+
|
|
592
|
+
// Create a scoped thread pool with specified number of threads
|
|
593
|
+
match rayon::ThreadPoolBuilder::new()
|
|
594
|
+
.num_threads(thread_count)
|
|
595
|
+
.build()
|
|
596
|
+
{
|
|
597
|
+
Ok(pool) => {
|
|
598
|
+
// Use the pool for this specific work
|
|
599
|
+
pool.install(|| {
|
|
600
|
+
gene_names.par_iter().for_each(|gene_name| {
|
|
601
|
+
let gene_start_time = Instant::now();
|
|
602
|
+
|
|
603
|
+
// Use HashMap for O(1) lookup for multiple genes
|
|
604
|
+
let gene_index = match &gene_to_index {
|
|
605
|
+
Some(map) => map.get(gene_name).cloned(),
|
|
606
|
+
None => genes.iter().position(|x| *x == *gene_name),
|
|
607
|
+
};
|
|
608
|
+
|
|
609
|
+
match gene_index {
|
|
610
|
+
Some(gene_index) => {
|
|
611
|
+
// Make sure the gene index is valid for this dataset
|
|
612
|
+
if gene_index >= counts_dataset.shape()[0] {
|
|
613
|
+
let mut error_map = Map::new();
|
|
614
|
+
error_map.insert(
|
|
615
|
+
"error".to_string(),
|
|
616
|
+
Value::String("Gene index out of bounds".to_string()),
|
|
617
|
+
);
|
|
618
|
+
|
|
619
|
+
// Store the error result
|
|
620
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
621
|
+
genes_map.insert(gene_name.clone(), Value::Object(error_map));
|
|
622
|
+
} else {
|
|
623
|
+
// Use pre-loaded data if available
|
|
624
|
+
if let Some(ref all_data) = all_gene_data {
|
|
625
|
+
// Extract the row directly from pre-loaded data
|
|
626
|
+
let gene_expression = all_data.slice(s![gene_index, ..]);
|
|
627
|
+
|
|
628
|
+
// Create samples map for this gene
|
|
629
|
+
let mut samples_map = Map::new();
|
|
630
|
+
for (i, sample) in samples.iter().enumerate() {
|
|
631
|
+
if i < gene_expression.len() {
|
|
632
|
+
// Handle potential NaN or infinity values
|
|
633
|
+
let value = if gene_expression[i].is_finite() {
|
|
634
|
+
Value::from(gene_expression[i])
|
|
635
|
+
} else {
|
|
636
|
+
Value::Null
|
|
637
|
+
};
|
|
638
|
+
|
|
639
|
+
samples_map.insert(sample.replace("\\", ""), value);
|
|
640
|
+
}
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
// Create gene data and store it
|
|
644
|
+
let gene_data = json!({
|
|
645
|
+
"dataId": gene_name,
|
|
646
|
+
"samples": samples_map
|
|
647
|
+
});
|
|
648
|
+
|
|
649
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
650
|
+
genes_map.insert(gene_name.clone(), gene_data);
|
|
651
|
+
} else {
|
|
652
|
+
// Fallback to per-gene reading if bulk load failed
|
|
653
|
+
match counts_dataset
|
|
654
|
+
.read_slice_1d::<f64, _>(s![gene_index, ..])
|
|
655
|
+
{
|
|
656
|
+
Ok(gene_expression) => {
|
|
657
|
+
// Create samples map for this gene
|
|
658
|
+
let mut samples_map = Map::new();
|
|
659
|
+
for (i, sample) in samples.iter().enumerate() {
|
|
660
|
+
if i < gene_expression.len() {
|
|
661
|
+
// Handle potential NaN or infinity values
|
|
662
|
+
let value =
|
|
663
|
+
if gene_expression[i].is_finite() {
|
|
664
|
+
Value::from(gene_expression[i])
|
|
665
|
+
} else {
|
|
666
|
+
Value::Null
|
|
667
|
+
};
|
|
668
|
+
|
|
669
|
+
samples_map.insert(
|
|
670
|
+
sample.replace("\\", ""),
|
|
671
|
+
value,
|
|
672
|
+
);
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
|
|
676
|
+
// Create gene data and store it
|
|
677
|
+
let gene_data = json!({
|
|
678
|
+
"dataId": gene_name,
|
|
679
|
+
"samples": samples_map
|
|
680
|
+
});
|
|
681
|
+
|
|
682
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
683
|
+
genes_map.insert(gene_name.clone(), gene_data);
|
|
684
|
+
}
|
|
685
|
+
Err(err1) => {
|
|
686
|
+
let mut error_map = Map::new();
|
|
687
|
+
error_map.insert(
|
|
688
|
+
"error".to_string(),
|
|
689
|
+
Value::String(format!(
|
|
690
|
+
"Failed to read expression values: {:?}",
|
|
691
|
+
err1
|
|
692
|
+
)),
|
|
693
|
+
);
|
|
694
|
+
|
|
695
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
696
|
+
genes_map.insert(
|
|
697
|
+
gene_name.clone(),
|
|
698
|
+
Value::Object(error_map),
|
|
699
|
+
);
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
}
|
|
704
|
+
}
|
|
705
|
+
None => {
|
|
706
|
+
// Gene not found
|
|
707
|
+
let mut error_map = Map::new();
|
|
708
|
+
error_map.insert(
|
|
709
|
+
"error".to_string(),
|
|
710
|
+
Value::String("Gene not found in dataset".to_string()),
|
|
711
|
+
);
|
|
712
|
+
|
|
713
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
714
|
+
genes_map.insert(gene_name.clone(), Value::Object(error_map));
|
|
715
|
+
}
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
// Record timing
|
|
719
|
+
let elapsed_time = gene_start_time.elapsed().as_millis() as u64;
|
|
720
|
+
let mut gene_timings = gene_timings.lock().unwrap();
|
|
721
|
+
gene_timings.insert(gene_name.clone(), Value::from(elapsed_time));
|
|
722
|
+
});
|
|
723
|
+
});
|
|
724
|
+
}
|
|
725
|
+
Err(err) => {
|
|
726
|
+
// If thread pool creation fails, fall back to sequential processing
|
|
727
|
+
timings.insert(
|
|
728
|
+
"thread_pool_error".to_string(),
|
|
729
|
+
Value::String(format!("Failed to create thread pool: {:?}", err)),
|
|
730
|
+
);
|
|
731
|
+
|
|
732
|
+
process_genes_sequentially(
|
|
733
|
+
&gene_names,
|
|
734
|
+
&genes,
|
|
735
|
+
&gene_to_index,
|
|
736
|
+
&counts_dataset,
|
|
737
|
+
&all_gene_data,
|
|
738
|
+
&samples,
|
|
739
|
+
&genes_map
|
|
740
|
+
);
|
|
741
|
+
}
|
|
742
|
+
}
|
|
743
|
+
} else if gene_names.len() == 1 {
|
|
744
|
+
let gene_name = &gene_names[0];
|
|
745
|
+
|
|
746
|
+
match genes.iter().position(|x| *x == *gene_name) {
|
|
747
|
+
Some(gene_index) => {
|
|
748
|
+
if gene_index >= counts_dataset.shape()[0] {
|
|
749
|
+
let mut error_map = Map::new();
|
|
750
|
+
error_map.insert(
|
|
751
|
+
"error".to_string(),
|
|
752
|
+
Value::String("Gene index out of bounds".to_string()),
|
|
753
|
+
);
|
|
754
|
+
|
|
755
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
756
|
+
genes_map.insert(gene_name.clone(), Value::Object(error_map));
|
|
757
|
+
} else {
|
|
758
|
+
// Read just this single gene's data directly
|
|
759
|
+
match counts_dataset.read_slice_1d::<f64, _>(s![gene_index, ..]) {
|
|
760
|
+
Ok(gene_expression) => {
|
|
761
|
+
|
|
762
|
+
// Create samples map for this gene
|
|
763
|
+
let mut samples_map = Map::new();
|
|
764
|
+
for (i, sample) in samples.iter().enumerate() {
|
|
765
|
+
if i < gene_expression.len() {
|
|
766
|
+
// Handle potential NaN or infinity values
|
|
767
|
+
let value = if gene_expression[i].is_finite() {
|
|
768
|
+
Value::from(gene_expression[i])
|
|
769
|
+
} else {
|
|
770
|
+
Value::Null
|
|
771
|
+
};
|
|
772
|
+
|
|
773
|
+
samples_map.insert(sample.replace("\\", ""), value);
|
|
774
|
+
}
|
|
775
|
+
}
|
|
776
|
+
|
|
777
|
+
let gene_data = json!({
|
|
778
|
+
"dataId": gene_name,
|
|
779
|
+
"samples": samples_map
|
|
780
|
+
});
|
|
781
|
+
|
|
782
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
783
|
+
genes_map.insert(gene_name.clone(), gene_data);
|
|
784
|
+
}
|
|
785
|
+
Err(err) => {
|
|
786
|
+
let mut error_map = Map::new();
|
|
787
|
+
error_map.insert(
|
|
788
|
+
"error".to_string(),
|
|
789
|
+
Value::String(format!(
|
|
790
|
+
"Failed to read expression values: {:?}",
|
|
791
|
+
err
|
|
792
|
+
)),
|
|
793
|
+
);
|
|
794
|
+
|
|
795
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
796
|
+
genes_map.insert(gene_name.clone(), Value::Object(error_map));
|
|
797
|
+
}
|
|
798
|
+
}
|
|
799
|
+
}
|
|
800
|
+
}
|
|
801
|
+
None => {
|
|
802
|
+
let mut error_map = Map::new();
|
|
803
|
+
error_map.insert(
|
|
804
|
+
"error".to_string(),
|
|
805
|
+
Value::String("Gene not found in dataset".to_string()),
|
|
806
|
+
);
|
|
807
|
+
|
|
808
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
809
|
+
genes_map.insert(gene_name.clone(), Value::Object(error_map));
|
|
810
|
+
}
|
|
811
|
+
}
|
|
812
|
+
}
|
|
813
|
+
|
|
814
|
+
// Get the final maps from the Arc<Mutex<>>
|
|
815
|
+
let genes_map = Arc::try_unwrap(genes_map).unwrap().into_inner().unwrap();
|
|
816
|
+
|
|
817
|
+
let output_json = json!({
|
|
818
|
+
"genes": genes_map,
|
|
819
|
+
"timings": timings,
|
|
820
|
+
"total_time_ms": overall_start_time.elapsed().as_millis() as u64
|
|
821
|
+
});
|
|
822
|
+
|
|
823
|
+
println!("{}", output_json);
|
|
824
|
+
|
|
825
|
+
Ok(())
|
|
826
|
+
}
|
|
827
|
+
|
|
828
|
+
// Helper function to process genes sequentially with optional HashMap lookup
|
|
829
|
+
fn process_genes_sequentially(
|
|
830
|
+
gene_names: &Vec<String>,
|
|
831
|
+
genes: &Vec<String>,
|
|
832
|
+
gene_to_index: &Option<std::collections::HashMap<String, usize>>,
|
|
833
|
+
counts_dataset: &hdf5::Dataset,
|
|
834
|
+
all_gene_data: &Option<ndarray::ArrayBase<ndarray::OwnedRepr<f64>, ndarray::Dim<[usize; 2]>>>,
|
|
835
|
+
samples: &Vec<String>,
|
|
836
|
+
genes_map: &Arc<std::sync::Mutex<Map<String, Value>>>
|
|
837
|
+
) {
|
|
838
|
+
for gene_name in gene_names {
|
|
839
|
+
// Find the index of the requested gene, using HashMap if available
|
|
840
|
+
let gene_index = match gene_to_index {
|
|
841
|
+
Some(map) => map.get(gene_name).cloned(),
|
|
842
|
+
None => genes.iter().position(|x| *x == *gene_name),
|
|
843
|
+
};
|
|
844
|
+
|
|
845
|
+
match gene_index {
|
|
846
|
+
Some(gene_index) => {
|
|
847
|
+
// Make sure the gene index is valid for this dataset
|
|
848
|
+
if gene_index >= counts_dataset.shape()[0] {
|
|
849
|
+
let mut error_map = Map::new();
|
|
850
|
+
error_map.insert(
|
|
851
|
+
"error".to_string(),
|
|
852
|
+
Value::String("Gene index out of bounds".to_string()),
|
|
853
|
+
);
|
|
854
|
+
|
|
855
|
+
// Store the error result
|
|
856
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
857
|
+
genes_map.insert(gene_name.clone(), Value::Object(error_map));
|
|
858
|
+
} else {
|
|
859
|
+
// Use pre-loaded data if available
|
|
860
|
+
if let Some(ref all_data) = all_gene_data {
|
|
861
|
+
let gene_expression = all_data.slice(s![gene_index, ..]);
|
|
862
|
+
|
|
863
|
+
// Create samples map for this gene
|
|
864
|
+
let mut samples_map = Map::new();
|
|
865
|
+
for (i, sample) in samples.iter().enumerate() {
|
|
866
|
+
if i < gene_expression.len() {
|
|
867
|
+
let value = if gene_expression[i].is_finite() {
|
|
868
|
+
Value::from(gene_expression[i])
|
|
869
|
+
} else {
|
|
870
|
+
Value::Null
|
|
871
|
+
};
|
|
872
|
+
|
|
873
|
+
samples_map.insert(sample.replace("\\", ""), value);
|
|
874
|
+
}
|
|
875
|
+
}
|
|
876
|
+
|
|
877
|
+
let gene_data = json!({
|
|
878
|
+
"dataId": gene_name,
|
|
879
|
+
"samples": samples_map
|
|
880
|
+
});
|
|
881
|
+
|
|
882
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
883
|
+
genes_map.insert(gene_name.clone(), gene_data);
|
|
884
|
+
} else {
|
|
885
|
+
// Fallback to per-gene reading if bulk load failed
|
|
886
|
+
match counts_dataset.read_slice_1d::<f64, _>(s![gene_index, ..]) {
|
|
887
|
+
Ok(gene_expression) => {
|
|
888
|
+
// Create samples map for this gene
|
|
889
|
+
let mut samples_map = Map::new();
|
|
890
|
+
for (i, sample) in samples.iter().enumerate() {
|
|
891
|
+
if i < gene_expression.len() {
|
|
892
|
+
let value = if gene_expression[i].is_finite() {
|
|
893
|
+
Value::from(gene_expression[i])
|
|
894
|
+
} else {
|
|
895
|
+
Value::Null
|
|
896
|
+
};
|
|
897
|
+
|
|
898
|
+
samples_map.insert(sample.replace("\\", ""), value);
|
|
899
|
+
}
|
|
900
|
+
}
|
|
901
|
+
|
|
902
|
+
let gene_data = json!({
|
|
903
|
+
"dataId": gene_name,
|
|
904
|
+
"samples": samples_map
|
|
905
|
+
});
|
|
906
|
+
|
|
907
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
908
|
+
genes_map.insert(gene_name.clone(), gene_data);
|
|
909
|
+
}
|
|
910
|
+
Err(err1) => {
|
|
911
|
+
let mut error_map = Map::new();
|
|
912
|
+
error_map.insert(
|
|
913
|
+
"error".to_string(),
|
|
914
|
+
Value::String(format!(
|
|
915
|
+
"Failed to read expression values: {:?}",
|
|
916
|
+
err1
|
|
917
|
+
)),
|
|
918
|
+
);
|
|
919
|
+
|
|
920
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
921
|
+
genes_map.insert(gene_name.clone(), Value::Object(error_map));
|
|
922
|
+
}
|
|
923
|
+
}
|
|
924
|
+
}
|
|
925
|
+
}
|
|
926
|
+
}
|
|
927
|
+
None => {
|
|
928
|
+
let mut error_map = Map::new();
|
|
929
|
+
error_map.insert(
|
|
930
|
+
"error".to_string(),
|
|
931
|
+
Value::String("Gene not found in dataset".to_string()),
|
|
932
|
+
);
|
|
933
|
+
|
|
934
|
+
let mut genes_map = genes_map.lock().unwrap();
|
|
935
|
+
genes_map.insert(gene_name.clone(), Value::Object(error_map));
|
|
936
|
+
}
|
|
937
|
+
}
|
|
938
|
+
|
|
939
|
+
}
|
|
940
|
+
}
|
|
941
|
+
/// Queries expression data for multiple genes from a sparse format HDF5 file
///
/// This function extracts expression values for multiple specified genes from an HDF5 file
/// that uses a sparse matrix representation (`data/p` per-gene offsets, `data/i`
/// sample indices, `data/x` values). It optimizes the query by reading shared
/// datasets only once, then processes the requested genes in parallel via
/// rayon's `par_iter`.
///
/// # Arguments
///
/// * `hdf5_filename` - Path to the HDF5 file
/// * `gene_names` - Vector of gene names to query
///
/// # Returns
///
/// A result indicating success or error. On success, the function prints a JSON object
/// containing expression data for all requested genes to stdout.
fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -> Result<()> {
    let overall_start_time = Instant::now();

    // Create timing map; reported in the output JSON alongside the results.
    let mut timings = Map::new();
    timings.insert("gene_count".to_string(), Value::from(gene_names.len()));
    timings.insert("format".to_string(), Value::String("sparse".to_string()));

    // Open file and read datasets
    let file_open_start = Instant::now();
    let file = File::open(&hdf5_filename)?;
    timings.insert(
        "file_open_ms".to_string(),
        Value::from(file_open_start.elapsed().as_millis() as u64),
    );

    // Matrix dimensions stored as [num_samples, num_genes].
    let dim_start = Instant::now();
    let ds_dim = file.dataset("data/dim")?;
    let data_dim: Array1<_> = ds_dim.read::<usize, Dim<[usize; 1]>>()?;
    let num_samples = data_dim[0];
    let _num_genes = data_dim[1];
    timings.insert(
        "read_dims_ms".to_string(),
        Value::from(dim_start.elapsed().as_millis() as u64),
    );

    // Gene and sample name tables (fixed-width ASCII, max 104 bytes each).
    let ds_genes = file.dataset("gene_names")?;
    let genes = ds_genes.read_1d::<FixedAscii<104>>()?;

    let ds_samples = file.dataset("sample_names")?;
    let samples = ds_samples.read_1d::<FixedAscii<104>>()?;

    // Read p dataset (contains pointers for all genes):
    // data_p[g]..data_p[g+1] is the range of entries in data/i and data/x
    // belonging to gene g.
    // NOTE(review): data_p[gene_index + 1] below assumes data/p holds a
    // one-past-the-end entry for every gene — confirm against the file writer.
    let p_start_time = Instant::now();
    let ds_p = file.dataset("data/p")?;
    let data_p: Array1<usize> = ds_p.read_1d::<usize>()?;
    timings.insert(
        "read_p_dataset_ms".to_string(),
        Value::from(p_start_time.elapsed().as_millis() as u64),
    );

    // Open i and x datasets; kept as handles so each gene reads only its slice.
    let ds_start_time = Instant::now();
    let ds_i = file.dataset("data/i")?;
    let ds_x = file.dataset("data/x")?;
    timings.insert(
        "open_i_x_datasets_ms".to_string(),
        Value::from(ds_start_time.elapsed().as_millis() as u64),
    );

    // Determine number of threads to use (reported for diagnostics only;
    // rayon manages its own default pool here).
    let num_threads = num_cpus::get();
    timings.insert("num_threads".to_string(), Value::from(num_threads as u64));

    // Thread-safe maps for results
    let genes_map = Arc::new(std::sync::Mutex::new(Map::new()));
    let gene_timings = Arc::new(std::sync::Mutex::new(Map::new()));

    // Use rayon for parallel processing
    gene_names.par_iter().for_each(|gene_name| {
        let gene_start_time = Instant::now();

        // Find the index of the requested gene (linear scan of the name table).
        match genes.iter().position(|&x| x == *gene_name) {
            Some(gene_index) => {
                // Find start and end points for this gene's data
                let array_start_point = data_p[gene_index];
                let array_stop_point = data_p[gene_index + 1];
                let num_populated_cells = array_stop_point - array_start_point;

                if num_populated_cells == 0 {
                    // Gene has no data, create array of zeros
                    let mut samples_map = Map::new();
                    for (_i, sample) in samples.iter().enumerate() {
                        samples_map.insert(sample.to_string().replace("\\", ""), Value::from(0.0));
                    }

                    let gene_data = json!({
                        "dataId": gene_name,
                        "samples": samples_map
                    });

                    let mut genes_map = genes_map.lock().unwrap();
                    genes_map.insert(gene_name.clone(), gene_data);
                } else {
                    // Read data for this gene: sample indices first, then values.
                    match ds_i.read_slice_1d::<usize, _>(array_start_point..array_stop_point) {
                        Ok(populated_column_ids) => {
                            match ds_x.read_slice_1d::<f64, _>(array_start_point..array_stop_point)
                            {
                                Ok(populated_column_values) => {
                                    // Generate the complete array from sparse representation
                                    let mut gene_array: Array1<f64> = Array1::zeros(num_samples);

                                    // Fill in values at populated column indices
                                    for (idx, &col_id) in populated_column_ids.iter().enumerate() {
                                        gene_array[col_id] = populated_column_values[idx];
                                    }

                                    // Create samples map; non-finite values become JSON null.
                                    let mut samples_map = Map::new();
                                    for (_i, sample) in samples.iter().enumerate() {
                                        let value = if gene_array[_i].is_finite() {
                                            Value::from(gene_array[_i])
                                        } else {
                                            Value::Null
                                        };

                                        samples_map
                                            .insert(sample.to_string().replace("\\", ""), value);
                                    }

                                    let gene_data = json!({
                                        "dataId": gene_name,
                                        "samples": samples_map
                                    });

                                    let mut genes_map = genes_map.lock().unwrap();
                                    genes_map.insert(gene_name.clone(), gene_data);
                                }
                                Err(err) => {
                                    // Per-gene failure: record an error object, do not abort the batch.
                                    let mut error_map = Map::new();
                                    error_map.insert(
                                        "error".to_string(),
                                        Value::String(format!(
                                            "Failed to read x dataset: {:?}",
                                            err
                                        )),
                                    );

                                    let mut genes_map = genes_map.lock().unwrap();
                                    genes_map.insert(gene_name.clone(), Value::Object(error_map));
                                }
                            }
                        }
                        Err(err) => {
                            // Per-gene failure: record an error object, do not abort the batch.
                            let mut error_map = Map::new();
                            error_map.insert(
                                "error".to_string(),
                                Value::String(format!("Failed to read i dataset: {:?}", err)),
                            );

                            let mut genes_map = genes_map.lock().unwrap();
                            genes_map.insert(gene_name.clone(), Value::Object(error_map));
                        }
                    }
                }
            }
            None => {
                // Requested gene missing from the name table.
                let mut error_map = Map::new();
                error_map.insert(
                    "error".to_string(),
                    Value::String("Gene not found in dataset".to_string()),
                );

                let mut genes_map = genes_map.lock().unwrap();
                genes_map.insert(gene_name.clone(), Value::Object(error_map));
            }
        }

        // Record timing
        let elapsed_time = gene_start_time.elapsed().as_millis() as u64;
        let mut gene_timings = gene_timings.lock().unwrap();
        gene_timings.insert(gene_name.clone(), Value::from(elapsed_time));
    });

    // Get the final maps from the Arc<Mutex<>>; safe to unwrap because all
    // rayon workers have finished, so this Arc holds the only reference.
    // NOTE(review): `gene_timings` is populated above but never included in
    // the output JSON — confirm whether per-gene timings were meant to be
    // emitted.
    let genes_map = Arc::try_unwrap(genes_map).unwrap().into_inner().unwrap();

    let output_json = json!({
        "genes": genes_map,
        "timings": timings,
        "parallel": true,
        "total_time_ms": overall_start_time.elapsed().as_millis() as u64
    });

    println!("{}", output_json);

    Ok(())
}
|
|
630
1136
|
fn main() -> Result<()> {
|
|
631
1137
|
let mut input = String::new();
|
|
632
1138
|
match io::stdin().read_line(&mut input) {
|
|
@@ -642,12 +1148,75 @@ fn main() -> Result<()> {
|
|
|
642
1148
|
}
|
|
643
1149
|
};
|
|
644
1150
|
|
|
645
|
-
//
|
|
646
|
-
if
|
|
647
|
-
//
|
|
1151
|
+
// Case 1: Check if "genes" field exists and is an array
|
|
1152
|
+
if json_string["genes"].is_array() {
|
|
1153
|
+
// Convert the JsonValue array to a Vec<String>
|
|
1154
|
+
let mut gene_names: Vec<String> = Vec::new();
|
|
1155
|
+
for gene_value in json_string["genes"].members() {
|
|
1156
|
+
if let Some(gene_str) = gene_value.as_str() {
|
|
1157
|
+
gene_names.push(gene_str.to_string());
|
|
1158
|
+
}
|
|
1159
|
+
}
|
|
1160
|
+
|
|
1161
|
+
if !gene_names.is_empty() {
|
|
1162
|
+
match detect_hdf5_format(&hdf5_filename)? {
|
|
1163
|
+
"dense" => query_multiple_genes_dense(hdf5_filename, gene_names)?,
|
|
1164
|
+
"sparse" => query_multiple_genes_sparse(hdf5_filename, gene_names)?,
|
|
1165
|
+
_ => {
|
|
1166
|
+
println!(
|
|
1167
|
+
"{}",
|
|
1168
|
+
serde_json::json!({
|
|
1169
|
+
"status": "failure",
|
|
1170
|
+
"message": "Cannot query genes in unknown file format.",
|
|
1171
|
+
"file_path": hdf5_filename
|
|
1172
|
+
})
|
|
1173
|
+
);
|
|
1174
|
+
}
|
|
1175
|
+
}
|
|
1176
|
+
return Ok(());
|
|
1177
|
+
}
|
|
1178
|
+
}
|
|
1179
|
+
// Case 2: Check if "gene" field exists and is an array (this handles the case we're seeing)
|
|
1180
|
+
else if json_string["gene"].is_array() {
|
|
1181
|
+
// Convert the JsonValue array to a Vec<String>
|
|
1182
|
+
let mut gene_names: Vec<String> = Vec::new();
|
|
1183
|
+
for gene_value in json_string["gene"].members() {
|
|
1184
|
+
if let Some(gene_str) = gene_value.as_str() {
|
|
1185
|
+
gene_names.push(gene_str.to_string());
|
|
1186
|
+
}
|
|
1187
|
+
}
|
|
1188
|
+
|
|
1189
|
+
if !gene_names.is_empty() {
|
|
1190
|
+
// Process multiple genes
|
|
1191
|
+
match detect_hdf5_format(&hdf5_filename)? {
|
|
1192
|
+
"dense" => query_multiple_genes_dense(hdf5_filename, gene_names)?,
|
|
1193
|
+
"sparse" => query_multiple_genes_sparse(hdf5_filename, gene_names)?,
|
|
1194
|
+
_ => {
|
|
1195
|
+
println!(
|
|
1196
|
+
"{}",
|
|
1197
|
+
serde_json::json!({
|
|
1198
|
+
"status": "failure",
|
|
1199
|
+
"message": "Cannot query genes in unknown file format.",
|
|
1200
|
+
"file_path": hdf5_filename
|
|
1201
|
+
})
|
|
1202
|
+
);
|
|
1203
|
+
}
|
|
1204
|
+
}
|
|
1205
|
+
return Ok(());
|
|
1206
|
+
}
|
|
1207
|
+
}
|
|
1208
|
+
// Case 3: Check if "gene" field exists and is a string (original single gene case)
|
|
1209
|
+
else if let Some(gene_name) = json_string["gene"].as_str() {
|
|
648
1210
|
query_gene(hdf5_filename, gene_name.to_string())?;
|
|
649
|
-
|
|
1211
|
+
return Ok(());
|
|
650
1212
|
}
|
|
1213
|
+
println!(
|
|
1214
|
+
"{}",
|
|
1215
|
+
serde_json::json!({
|
|
1216
|
+
"status": "error",
|
|
1217
|
+
"message": "Neither gene nor genes array provided in input"
|
|
1218
|
+
})
|
|
1219
|
+
);
|
|
651
1220
|
}
|
|
652
1221
|
Err(error) => println!("Incorrect json: {}", error),
|
|
653
1222
|
}
|