@sjcrh/proteinpaint-rust 2.111.0 → 2.114.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/readHDF5.rs CHANGED
@@ -1,17 +1,532 @@
1
1
  // Need to set HDF5_DIR and LD_LIBRARY_PATH in ~/.bash_profile
2
2
  // Syntax: HDF5_DIR=/usr/local/Homebrew/Cellar/hdf5/1.14.3_1 && echo $HDF5_DIR && cd .. && cargo build --release && json='{"gene":"TP53","hdf5_file":"matrix_with_na_comp_9.h5"}' && time echo $json | target/release/rust_hdf5
3
3
 
4
- use hdf5::types::FixedAscii;
4
+ // Imports
5
+ use hdf5::types::{FixedAscii, VarLenAscii};
5
6
  use hdf5::{File, Result};
6
- use json;
7
- use ndarray::Array1;
8
7
  use ndarray::Dim;
8
+ use ndarray::{Array1, s};
9
+ use serde_json::{Map, Value, json};
9
10
  use std::io;
10
11
  use std::time::Instant;
11
12
 
12
- fn read_hdf5(hdf5_filename: String, gene_name: String) -> Result<()> {
13
- let file = File::open(&hdf5_filename)?; // open for reading
14
- let ds_dim = file.dataset("data/dim")?; // open the dataset
13
+ /// Determines the format of an HDF5 gene expression file
14
+ ///
15
+ /// This function examines the structure of an HDF5 file to determine its format.
16
+ /// It detects whether the file uses a dense matrix representation, a sparse matrix
17
+ /// representation, or an unknown format by checking for the presence of specific
18
+ /// datasets and groups.
19
+ ///
20
+ /// # HDF5 Format Specifications
21
+ ///
22
+ /// The function identifies the following formats:
23
+ ///
24
+ /// - **Dense format**:
25
+ /// - Contains a "counts" dataset (2D matrix of gene expression values)
26
+ /// - Contains a "gene_names" dataset (gene identifiers)
27
+ /// - Contains a "samples" dataset (sample identifiers)
28
+ ///
29
+ /// - **Sparse format**:
30
+ /// - Contains a "data" group with sparse matrix components
31
+ /// - Contains a "sample_names" dataset
32
+ ///
33
+ /// - **Unknown format**:
34
+ /// - Does not match either the dense or sparse format criteria
35
+ ///
36
+ /// # Arguments
37
+ ///
38
+ /// * `hdf5_filename` - Path to the HDF5 file to analyze
39
+ ///
40
+ /// # Returns
41
+ ///
42
+ /// A result containing one of the following static string values:
43
+ /// - `"dense"` - If the file is in dense matrix format
44
+ /// - `"sparse"` - If the file is in sparse matrix format
45
+ /// - `"unknown"` - If the file format cannot be determined
46
+ ///
47
+ /// # Errors
48
+ ///
49
+ /// This function will return an error if:
50
+ /// - The file cannot be opened
51
+ /// - The file is not a valid HDF5 file
52
+ ///
53
+ /// # Algorithm
54
+ ///
55
+ /// The detection algorithm works by checking for the presence of specific datasets
56
+ /// and groups that are characteristic of each format:
57
+ ///
58
+ /// 1. Opens the HDF5 file
59
+ /// 2. Checks for datasets/groups that indicate dense format
60
+ /// 3. Checks for datasets/groups that indicate sparse format
61
+ /// 4. Returns the detected format or "unknown"
62
+ ///
63
+ /// # Examples
64
+ ///
65
+ /// ```rust
66
+ /// // Example usage (not runnable)
67
+ /// match detect_hdf5_format("expression_data.h5") {
68
+ /// Ok("dense") => println!("Dense format detected"),
69
+ /// Ok("sparse") => println!("Sparse format detected"),
70
+ /// Ok("unknown") => println!("Unknown format detected"),
71
+ /// Err(e) => println!("Error: {}", e),
72
+ /// }
73
+ /// ```
74
+ fn detect_hdf5_format(hdf5_filename: &str) -> Result<&'static str> {
75
+ let file = File::open(hdf5_filename)?;
76
+
77
+ // Check for dense format (has counts, gene_names, and samples datasets)
78
+ let has_counts = file.dataset("counts").is_ok();
79
+ let has_gene_names = file.dataset("gene_names").is_ok();
80
+ let has_samples = file.dataset("samples").is_ok();
81
+
82
+ // Check for sparse matrix format (has data group and sample_names)
83
+ let has_data_group = file.group("data").is_ok();
84
+ let has_sample_names = file.dataset("sample_names").is_ok();
85
+
86
+ if has_counts && has_gene_names && has_samples {
87
+ // eprintln!("Dense format detected");
88
+ Ok("dense")
89
+ } else if has_data_group && has_sample_names {
90
+ //eprintln!("Sparse format detected");
91
+ Ok("sparse")
92
+ } else {
93
+ eprintln!("Unknown format detected");
94
+ Ok("unknown")
95
+ }
96
+ }
97
+
98
+ /// Unified function for querying gene expression data from any supported HDF5 file format
99
+ ///
100
+ /// This function serves as the central entry point for extracting expression values for a specified gene
101
+ /// from an HDF5 file. It automatically detects the format of the provided file (dense or sparse)
102
+ /// and routes the query to the appropriate specialized handler function.
103
+ ///
104
+ /// # Supported HDF5 Formats
105
+ ///
106
+ /// - **Dense format**: Contains explicit "gene_ids", "samples", and "counts" datasets where
107
+ /// the expression matrix is stored as a direct 2D array
108
+ /// - **Sparse format**: Contains a "data" group with "p", "i", "x" datasets using the
109
+ /// Compressed Sparse Column (CSC) representation for the expression matrix
110
+ ///
111
+ /// # Arguments
112
+ ///
113
+ /// * `hdf5_filename` - Path to the HDF5 file containing gene expression data
114
+ /// * `gene_name` - Name of the gene whose expression values to extract
115
+ ///
116
+ /// # Returns
117
+ ///
118
+ /// A result indicating success or error. On success, the function prints the gene
119
+ /// expression data in JSON format to stdout for dense matrix HDF5 files. For spare matrix files it
120
+ /// sends the expression data in JSON format with "output_string:" prefix to stdout.
121
+ ///
122
+ /// # Example Output Format
123
+ ///
124
+ /// ```json
125
+ /// {
126
+ /// "gene": "TP53",
127
+ /// "dataId": "TP53",
128
+ /// "samples": {
129
+ /// "sample1": 10.5,
130
+ /// "sample2": 8.2,
131
+ /// "sample3": 15.7
132
+ /// }
133
+ /// }
134
+ /// ```
135
+ ///
136
+ /// # Error Handling
137
+ ///
138
+ /// The function handles several types of errors:
139
+ /// - File format detection failures
140
+ /// - Unsupported or unknown file formats
141
+ /// - Errors from the format-specific query functions
142
+ ///
143
+ /// When an error occurs, the function returns a structured JSON error message.
144
+ ///
145
+ /// # Processing Flow
146
+ ///
147
+ /// 1. Detects the format of the HDF5 file using `detect_hdf5_format`
148
+ /// 2. Routes to the appropriate specialized function:
149
+ /// - `query_gene_dense` for dense matrix files
150
+ /// - `query_gene_sparse` for sparse matrix files
151
+ /// 3. Returns an error for unsupported formats
152
+ ///
153
+ /// This unified approach allows client code to work with either format without needing
154
+ /// to know the specific structure of the underlying HDF5 file.
155
+ fn query_gene(hdf5_filename: String, gene_name: String) -> Result<()> {
156
+ // First, detect the file format
157
+ let file_format = detect_hdf5_format(&hdf5_filename)?;
158
+
159
+ // Query gene data based on format
160
+ match file_format {
161
+ "dense" => query_gene_dense(hdf5_filename, gene_name),
162
+ "sparse" => query_gene_sparse(hdf5_filename, gene_name),
163
+ _ => {
164
+ // For unknown format, return an error
165
+ println!(
166
+ "{}",
167
+ serde_json::json!({
168
+ "status": "failure",
169
+ "message": "Cannot query gene in unknown file format. Please use .h5 format in either sparse or dense format.",
170
+ "file_path": hdf5_filename,
171
+ "gene": gene_name,
172
+ "format": "unknown"
173
+ })
174
+ );
175
+ Ok(())
176
+ }
177
+ }
178
+ }
179
+
180
+ /// Reads expression data for a specific gene from a dense format HDF5 file
181
+ ///
182
+ /// This function extracts expression values for a specified gene from an HDF5 file
183
+ /// that follows the dense matrix format. The dense format is characterized by:
184
+ /// - A "gene_ids" dataset containing gene identifiers
185
+ /// - A "samples" dataset containing sample identifiers
186
+ /// - A "counts" dataset containing a gene × sample expression matrix
187
+ ///
188
+ /// The function returns the expression values in a JSON format where sample names
189
+ /// are keys and their corresponding expression values are the values.
190
+ ///
191
+ /// # Arguments
192
+ ///
193
+ /// * `hdf5_filename` - Path to the HDF5 file
194
+ /// * `gene_name` - Name of the gene to query
195
+ ///
196
+ /// # Returns
197
+ ///
198
+ /// A result indicating success or error. On success, the function prints the gene
199
+ /// expression data in JSON format to stdout.
200
+ ///
201
+ /// # Output Format
202
+ ///
203
+ /// ```json
204
+ /// {
205
+ /// "gene": "GENE_NAME",
206
+ /// "dataId": "GENE_NAME",
207
+ /// "samples": {
208
+ /// "SAMPLE1": VALUE1,
209
+ /// "SAMPLE2": VALUE2,
210
+ /// ...
211
+ /// }
212
+ /// }
213
+ /// ```
214
+ ///
215
+ /// # Error Handling
216
+ ///
217
+ /// The function handles several potential errors:
218
+ /// - File opening errors
219
+ /// - Missing or inaccessible datasets ("gene_ids", "samples", "counts")
220
+ /// - Gene not found in the dataset
221
+ /// - Out of bounds gene index
222
+ /// - Expression data reading failures
223
+ ///
224
+ /// If an error occurs, the function returns an explanatory error message in JSON format.
225
+ ///
226
+ /// # Reading Strategy
227
+ ///
228
+ /// The function tries two methods to read expression data:
229
+ /// 1. First attempts to read a 1D slice directly from the counts dataset
230
+ /// 2. If that fails, tries reading the entire dataset and extracting the row of interest
231
+ ///
232
+ /// This dual approach ensures compatibility with different HDF5 library implementations
233
+ /// and dataset configurations.
234
+ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
235
+ // let start_time = Instant::now();
236
+
237
+ // Open the HDF5 file
238
+ let file = match File::open(hdf5_filename) {
239
+ Ok(f) => f,
240
+ Err(err) => {
241
+ println!(
242
+ "{}",
243
+ serde_json::json!({
244
+ "status": "error",
245
+ "message": format!("Failed to open HDF5 file: {}", err)
246
+ })
247
+ );
248
+ return Ok(());
249
+ }
250
+ };
251
+
252
+ // Read gene ids using VarLenAscii
253
+ let genes_dataset = match file.dataset("gene_ids") {
254
+ Ok(ds) => ds,
255
+ Err(err) => {
256
+ println!(
257
+ "{}",
258
+ serde_json::json!({
259
+ "status": "error",
260
+ "message": format!("Failed to open gene_ids dataset {}", err)
261
+ })
262
+ );
263
+ return Ok(());
264
+ }
265
+ };
266
+
267
+ // Read genes as VarLenAscii
268
+ let genes_varlen = match genes_dataset.read_1d::<VarLenAscii>() {
269
+ Ok(g) => g,
270
+ Err(err) => {
271
+ println!(
272
+ "{}",
273
+ serde_json::json!({
274
+ "status": "error",
275
+ "message": format!("Failed to read gene names as VarLenAscii: {}", err)
276
+ })
277
+ );
278
+ return Ok(());
279
+ }
280
+ };
281
+
282
+ // Convert to Vec<String> for easier handling
283
+ let genes: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
284
+
285
+ // Read sample names using VarLenAscii
286
+ let samples_dataset = match file.dataset("samples") {
287
+ Ok(ds) => ds,
288
+ Err(err) => {
289
+ println!(
290
+ "{}",
291
+ serde_json::json!({
292
+ "status": "error",
293
+ "message": format!("Failed to open samples dataset{}", err)
294
+ })
295
+ );
296
+ return Ok(());
297
+ }
298
+ };
299
+
300
+ // Read samples as VarLenAscii
301
+ let samples_varlen = match samples_dataset.read_1d::<VarLenAscii>() {
302
+ Ok(s) => s,
303
+ Err(err) => {
304
+ println!(
305
+ "{}",
306
+ serde_json::json!({
307
+ "status": "error",
308
+ "message": format!("Failed to read samples as VarLenAscii: {}", err)
309
+ })
310
+ );
311
+ return Ok(());
312
+ }
313
+ };
314
+
315
+ // Convert to Vec<String> for easier handling
316
+ let samples: Vec<String> = samples_varlen.iter().map(|s| s.to_string()).collect();
317
+
318
+ // Find the index of the requested gene
319
+ let gene_index = match genes.iter().position(|x| *x == gene_name) {
320
+ Some(index) => index,
321
+ None => {
322
+ println!(
323
+ "{}",
324
+ serde_json::json!({
325
+ "status": "error",
326
+ "message": format!("Gene '{}' not found in the dataset", gene_name)
327
+ })
328
+ );
329
+ return Ok(());
330
+ }
331
+ };
332
+
333
+ // Read the expression data for the gene
334
+ let counts_dataset = match file.dataset("counts") {
335
+ Ok(ds) => ds,
336
+ Err(err) => {
337
+ println!(
338
+ "{}",
339
+ serde_json::json!({
340
+ "status": "error",
341
+ "message": format!("Failed to open counts dataset: {}", err)
342
+ })
343
+ );
344
+ return Ok(());
345
+ }
346
+ };
347
+
348
+ // Make sure the gene index is valid for this dataset
349
+ if gene_index >= counts_dataset.shape()[0] {
350
+ println!(
351
+ "{}",
352
+ serde_json::json!({
353
+ "status": "error",
354
+ "message": "Gene index is out of bounds for the dataset"
355
+ })
356
+ );
357
+ return Ok(());
358
+ }
359
+
360
+ // Try to read the expression data
361
+ let gene_expression: Array1<f64>;
362
+
363
+ // Method 1: Try to read a 1D slice directly (for 2D datasets)
364
+ match counts_dataset.read_slice_1d::<f64, _>(s![gene_index, ..]) {
365
+ Ok(data) => {
366
+ gene_expression = data;
367
+ }
368
+ Err(err1) => {
369
+ // Method 2: Try a different approach
370
+
371
+ // First get the dimensions
372
+ let dataset_shape = counts_dataset.shape();
373
+ if dataset_shape.len() != 2 {
374
+ println!(
375
+ "{}",
376
+ serde_json::json!({
377
+ "status": "error",
378
+ "message": "Expected a 2D dataset for counts"
379
+ })
380
+ );
381
+ return Ok(());
382
+ }
383
+
384
+ // Try reading the entire dataset and then extracting the row
385
+ match counts_dataset.read::<f64, Dim<[usize; 2]>>() {
386
+ Ok(all_data) => {
387
+ // Extract just the row we need
388
+ let row = all_data.slice(s![gene_index, ..]).to_owned();
389
+ gene_expression = row;
390
+
391
+ // Start building a flatter JSON structure
392
+ let mut output_string = String::from("{\"samples\":{");
393
+
394
+ // Create direct key-value pairs where sample names are the keys
395
+ for i in 0..gene_expression.len() {
396
+ // Add each sample name as a key pointing directly to its expression value
397
+ output_string += &format!(
398
+ "\"{}\":{}",
399
+ samples[i].to_string(),
400
+ gene_expression[i].to_string()
401
+ );
402
+
403
+ // Add comma if not the last item
404
+ if i < gene_expression.len() - 1 {
405
+ output_string += ",";
406
+ }
407
+ }
408
+
409
+ // Close the JSON object
410
+ output_string += "}}";
411
+
412
+ // println!("{}", output_string);
413
+ }
414
+ Err(err2) => {
415
+ println!(
416
+ "{}",
417
+ serde_json::json!({
418
+ "status": "error",
419
+ "message": format!("Failed to read expression values: {:?}, {:?}", err1, err2)
420
+ })
421
+ );
422
+ return Ok(());
423
+ }
424
+ }
425
+ }
426
+ }
427
+ // Create samples map
428
+ let mut samples_map = Map::new();
429
+ for (i, sample) in samples.iter().enumerate() {
430
+ if i < gene_expression.len() {
431
+ // Add each sample to the map, clean the sample name and convert value to JSON Number
432
+ // Note: We need to handle potential NaN or infinity values that aren't valid in JSON
433
+ let value = if gene_expression[i].is_finite() {
434
+ Value::from(gene_expression[i])
435
+ } else {
436
+ Value::Null // Or choose a different representation for non-finite values
437
+ };
438
+
439
+ samples_map.insert(
440
+ sample.replace("\\", ""), // Clean the sample name
441
+ value,
442
+ );
443
+ }
444
+ }
445
+
446
+ // Build the complete JSON structure
447
+ let output_json = json!({
448
+ "gene": gene_name,
449
+ "dataId": gene_name,
450
+ "samples": samples_map
451
+ });
452
+
453
+ // Output the JSON directly
454
+ println!("{}", output_json);
455
+
456
+ Ok(())
457
+ }
458
+
459
+ /// Reads expression data for a specific gene from a sparse format HDF5 file (from original readHDF5.rs)
460
+ ///
461
+ /// This function extracts expression values for a specified gene from an HDF5 file
462
+ /// that uses a sparse matrix representation. Sparse matrices are efficient for storing
463
+ /// genomic data where many genes have zero expression in many samples. The sparse
464
+ /// format follows the Compressed Sparse Column (CSC) structure with:
465
+ ///
466
+ /// - A "data/dim" dataset containing matrix dimensions
467
+ /// - A "gene_names" dataset containing gene identifiers
468
+ /// - A "sample_names" dataset containing sample identifiers
469
+ /// - A "data/p" dataset containing pointers to where each gene's data starts and ends
470
+ /// - A "data/i" dataset containing column indices for non-zero values
471
+ /// - A "data/x" dataset containing the actual non-zero expression values
472
+ ///
473
+ /// # Arguments
474
+ ///
475
+ /// * `hdf5_filename` - Path to the HDF5 file
476
+ /// * `gene_name` - Name of the gene to query
477
+ ///
478
+ /// # Returns
479
+ ///
480
+ /// A result indicating success or error. On success, the function prints the gene
481
+ /// expression data in JSON format to stdout with "output_string:" prefix.
482
+ ///
483
+ /// # Output Format
484
+ ///
485
+ /// The function outputs a JSON object where sample names are keys and their
486
+ /// corresponding expression values are the values:
487
+ ///
488
+ /// ```json
489
+ /// {
490
+ /// "sample1": 0.0,
491
+ /// "sample2": 4.5,
492
+ /// "sample3": 0.0,
493
+ /// "sample4": 7.2,
494
+ /// ...
495
+ /// }
496
+ /// ```
497
+ ///
498
+ /// # Algorithm
499
+ ///
500
+ /// 1. Opens the HDF5 file and reads matrix dimensions
501
+ /// 2. Reads gene and sample names
502
+ /// 3. Finds the index of the requested gene
503
+ /// 4. Reads the sparse representation:
504
+ /// - Gets pointers from "data/p" to determine which values belong to the gene
505
+ /// - Reads column indices from "data/i" to know which samples have non-zero values
506
+ /// - Reads actual values from "data/x"
507
+ /// 5. Reconstructs a dense vector from the sparse representation
508
+ /// 6. Formats and outputs the result as JSON
509
+ ///
510
+ /// # Performance Tracking
511
+ ///
512
+ /// The function tracks performance at various stages using timestamps:
513
+ /// - Time spent parsing genes
514
+ /// - Time spent parsing samples
515
+ /// - Time spent reading the p, i, and x datasets
516
+ /// - Time spent generating the full array from sparse representation
517
+ ///
518
+ /// # Error Handling
519
+ ///
520
+ /// The function handles several potential errors:
521
+ /// - File opening failures
522
+ /// - Dataset access failures
523
+ /// - Gene not found in the dataset
524
+ /// - Sparse matrix reading failures
525
+ ///
526
+ /// A gene that is not found is reported as a structured JSON message on stdout; file and dataset errors are returned to the caller.
527
+ fn query_gene_sparse(hdf5_filename: String, gene_name: String) -> Result<()> {
528
+ let file = File::open(&hdf5_filename)?;
529
+ let ds_dim = file.dataset("data/dim")?;
15
530
 
16
531
  // Check the data type and read the dataset accordingly
17
532
  let data_dim: Array1<_> = ds_dim.read::<usize, Dim<[usize; 1]>>()?;
@@ -20,203 +535,119 @@ fn read_hdf5(hdf5_filename: String, gene_name: String) -> Result<()> {
20
535
  println!("num_samples:{}", num_samples);
21
536
  println!("num_genes:{}", num_genes);
22
537
 
23
- //let now_partial_i = Instant::now();
24
- //let data_partial_i: Array1<usize> = ds_i.read_slice_1d(0..20)?;
25
- //println!("Data_partial_i: {:?}", data_partial_i);
26
- //println!("Time for partial_i dataset:{:?}", now_partial_i.elapsed());
27
- //
28
- //let now_x = Instant::now();
29
- //let ds_x = file.dataset("data/x")?; // open the dataset
30
- //let data_x: Array1<_> = ds_x.read::<f64, Dim<[usize; 1]>>()?;
31
- //println!("Data_x: {:?}", data_x);
32
- //println!("Time for x dataset:{:?}", now_x.elapsed());
33
-
34
538
  let now_genes = Instant::now();
35
539
  let ds_genes = file.dataset("gene_names")?;
36
540
  let genes = ds_genes.read_1d::<FixedAscii<104>>()?;
37
- //println!("\tgenes = {:?}", genes);
38
- //println!("\tgenes.shape() = {:?}", genes.shape());
39
- //println!("\tgenes.strides() = {:?}", genes.strides());
40
- //println!("\tgenes.ndim() = {:?}", genes.ndim());
41
541
  println!("Time for parsing genes:{:?}", now_genes.elapsed());
42
542
 
43
543
  let now_samples = Instant::now();
44
544
  let ds_samples = file.dataset("sample_names")?;
45
545
  let samples = ds_samples.read_1d::<FixedAscii<104>>()?;
46
- //println!("\tsamples = {:?}", samples);
47
- //println!("\tsamples.shape() = {:?}", samples.shape());
48
- //println!("\tsamples.strides() = {:?}", samples.strides());
49
- //println!("\tsamples.ndim() = {:?}", samples.ndim());
50
546
  println!("Time for parsing samples:{:?}", now_samples.elapsed());
51
547
 
52
- let gene_index;
53
- match genes.iter().position(|&x| x == gene_name) {
548
+ let gene_index = match genes.iter().position(|&x| x == gene_name) {
54
549
  Some(index) => {
55
550
  println!(
56
551
  "The index of '{}' is {} in 0-based format (add 1 to compare with R output)",
57
552
  gene_name, index
58
553
  );
59
- gene_index = index;
554
+ index
60
555
  }
61
- None => panic!(
62
- "Gene '{}' not found in the HDF5 file '{}'",
63
- gene_name, &hdf5_filename
64
- ),
65
- }
556
+ None => {
557
+ println!(
558
+ "{}",
559
+ serde_json::json!({
560
+ "status": "failure",
561
+ "message": format!("Gene '{}' not found in the HDF5 file '{}'", gene_name, &hdf5_filename),
562
+ "file_path": hdf5_filename,
563
+ "gene": gene_name
564
+ })
565
+ );
566
+ return Ok(());
567
+ }
568
+ };
66
569
 
67
570
  // Find the number of columns that are populated for that gene
68
571
  let now_p = Instant::now();
69
- let ds_p = file.dataset("data/p")?; // open the dataset
70
-
71
- //let data_p: Array1<_> = ds_p.read::<usize, Dim<[usize; 1]>>()?;
572
+ let ds_p = file.dataset("data/p")?;
72
573
  let data_partial_p: Array1<usize> = ds_p.read_slice_1d(gene_index..gene_index + 2)?;
73
- //println!("Data_p: {:?}", data_p);
74
574
  println!("Data_partial_p: {:?}", data_partial_p);
75
575
  println!("Time for p dataset:{:?}", now_p.elapsed());
76
576
 
77
577
  let array_start_point = data_partial_p[0];
78
578
  let array_stop_point = data_partial_p[1];
79
- let num_populated_cells = data_partial_p[1] - array_start_point;
579
+ let num_populated_cells = array_stop_point - array_start_point;
80
580
  println!("Number of populated cells:{}", num_populated_cells);
81
581
 
82
- //Find all columns indices that are populated for the given gene
582
+ // Find all columns indices that are populated for the given gene
83
583
  let now_i = Instant::now();
84
- let ds_i = file.dataset("data/i")?; // open the dataset
85
-
86
- //let data_i: Array1<_> = ds_i.read::<f64, Dim<[usize; 1]>>()?;
87
- //println!("Data_i: {:?}", data_i);
584
+ let ds_i = file.dataset("data/i")?;
88
585
  let populated_column_ids: Array1<usize> =
89
- ds_i.read_slice_1d(array_start_point..array_stop_point - 1)?;
90
- println!(
91
- "Length of populated_column_ids:{}",
92
- populated_column_ids.len()
93
- );
94
-
95
- // Do a sanity check (for testing)
96
- //let mut min = 0;
97
- //for i in 0..populated_column_ids.len() {
98
- // if populated_column_ids[i] < min {
99
- // println!("Value is decreasing {},{}", populated_column_ids[i], min);
100
- // } else {
101
- // min = populated_column_ids[i];
102
- // }
103
- //}
104
- println!("Populated cells:{:?}", populated_column_ids);
586
+ ds_i.read_slice_1d(array_start_point..array_stop_point)?;
105
587
  println!("Time for i dataset:{:?}", now_i.elapsed());
106
588
 
107
- //Find all columns values that are populated for the given gene
589
+ // Find all columns values that are populated for the given gene
108
590
  let now_x = Instant::now();
109
- let ds_x = file.dataset("data/x")?; // open the dataset
110
-
111
- //let data_x: Array1<_> = ds_x.read::<f64, Dim<[usize; 1]>>()?;
112
- //println!("Data_x: {:?}", data_x);
591
+ let ds_x = file.dataset("data/x")?;
113
592
  let populated_column_values: Array1<f64> =
114
- ds_x.read_slice_1d(array_start_point..array_stop_point - 1)?;
115
- println!(
116
- "Length of populated_column_ids:{}",
117
- populated_column_values.len()
118
- );
593
+ ds_x.read_slice_1d(array_start_point..array_stop_point)?;
119
594
  println!("Time for x dataset:{:?}", now_x.elapsed());
120
595
 
121
596
  // Generate the complete array from the sparse array
122
-
123
597
  let mut gene_array: Array1<f64> = Array1::zeros(num_samples);
124
598
  let time_generating_full_array = Instant::now();
125
- //let mut gene_array: Vec<f64> = Vec::with_capacity(num_samples);
126
- for index in 0..num_samples {
127
- match populated_column_ids.iter().any(|&x| x == index) {
128
- true => match populated_column_ids.iter().position(|&x| x == index) {
129
- Some(y) => {
130
- gene_array[index] = populated_column_values[y] //gene_array.push(populated_column_values[y]),
131
- }
132
- None => {} // should not happen because if the index is found, its position in the array should also be found
133
- },
134
- false => gene_array[index] = 0.0, //gene_array.push(0.0), // If index not found, it means the value is 0 for that sample
135
- }
599
+
600
+ // Fill in the values at the populated column indices
601
+ for (idx, &col_id) in populated_column_ids.iter().enumerate() {
602
+ gene_array[col_id] = populated_column_values[idx];
136
603
  }
137
604
 
605
+ // Format output as JSON
138
606
  let mut output_string = "{".to_string();
139
607
  for i in 0..gene_array.len() {
140
- //let item_json = "{\"".to_string()
141
- // + &samples[i].to_string()
142
- // + &"\","
143
- // + &gene_array[i].to_string()
144
- // + &"}";
145
-
146
- //let item_json = format!("{{\"{}\"}}", samples[i].to_string());
147
-
148
608
  output_string += &format!(
149
609
  "\"{}\":{}",
150
- samples[i].to_string(),
610
+ samples[i].to_string().replace("\\", ""),
151
611
  gene_array[i].to_string()
152
612
  );
153
- //println!("item_json:{}", item_json);
154
613
 
155
- //let item_json = format!(
156
- // r##"{{"{}",{}}}"##,
157
- // samples[i].to_string().replace("\\", ""),
158
- // gene_array[i].to_string()
159
- //);
160
614
  if i != gene_array.len() - 1 {
161
615
  output_string += &",";
162
616
  }
163
617
  }
164
618
  output_string += &"}".to_string();
165
- output_string = output_string.replace("\\", "");
619
+
166
620
  println!(
167
621
  "Time generating full array:{:?}",
168
622
  time_generating_full_array.elapsed()
169
623
  );
170
624
  println!("output_string:{}", output_string);
171
625
 
172
- // Print individual element in array
173
-
174
- //let arr = v.iter().collect::<Vec<_>>();
175
- //for (idx, val) in arr.iter().enumerate() {
176
- // println!("\tarr[{:?}] = {:?} ({:?})", idx, val.to_string(), val.len());
177
- //}
178
-
179
- //for item in data_i {
180
- // println!("i:{}", item);
181
- //}
182
626
  Ok(())
183
627
  }
184
628
 
629
+ // Main function
185
630
  fn main() -> Result<()> {
186
631
  let mut input = String::new();
187
632
  match io::stdin().read_line(&mut input) {
188
- // Accepting the piped input from nodejs (or command line from testing)
189
633
  Ok(_bytes_read) => {
190
- //println!("{} bytes read", bytes_read);
191
- //println!("{}", input);
192
634
  let input_json = json::parse(&input);
193
635
  match input_json {
194
636
  Ok(json_string) => {
195
- let now = Instant::now();
196
- let hdf5_filename_result = &json_string["hdf5_file"].to_owned();
197
- let hdf5_filename;
198
- match hdf5_filename_result.as_str() {
199
- Some(x) => {
200
- hdf5_filename = x.to_string();
201
- }
637
+ // Extract HDF5 filename
638
+ let hdf5_filename = match json_string["hdf5_file"].as_str() {
639
+ Some(x) => x.to_string(),
202
640
  None => {
203
641
  panic!("HDF5 filename not provided");
204
642
  }
205
- }
643
+ };
206
644
 
207
- let gene_result = &json_string["gene"].to_owned();
208
- let gene_name;
209
- match gene_result.as_str() {
210
- Some(x) => {
211
- gene_name = x.to_string();
212
- }
213
- None => {
214
- panic!("Gene name not provided");
215
- }
645
+ // Then, check if we have a gene to query
646
+ if let Some(gene_name) = json_string["gene"].as_str() {
647
+ // let gene_query_time = Instant::now();
648
+ query_gene(hdf5_filename, gene_name.to_string())?;
649
+ // println!("Time for querying gene: {:?}", gene_query_time.elapsed());
216
650
  }
217
-
218
- read_hdf5(hdf5_filename, gene_name)?;
219
- println!("Time for parsing genes from HDF5:{:?}", now.elapsed());
220
651
  }
221
652
  Err(error) => println!("Incorrect json: {}", error),
222
653
  }