@sjcrh/proteinpaint-rust 2.122.0 → 2.124.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/gdcmaf.rs +107 -107
- package/src/genesetORA.rs +22 -48
- package/src/readHDF5.rs +31 -70
- package/src/test.rs +3 -0
- package/src/test_examples.rs +380 -473
- package/src/validateHDF5.rs +0 -1
package/src/readHDF5.rs
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
//------------------------------------------------------------------------------
|
|
2
2
|
// readHDF5.rs - HDF5 Gene Expression Data Reader
|
|
3
3
|
//------------------------------------------------------------------------------
|
|
4
|
-
//
|
|
4
|
+
//
|
|
5
5
|
// Extracts gene expression values from HDF5 files in dense or sparse formats.
|
|
6
|
-
// Supports single genes with memory optimization and multiple genes with
|
|
6
|
+
// Supports single genes with memory optimization and multiple genes with
|
|
7
7
|
// parallel processing.
|
|
8
8
|
//
|
|
9
9
|
// Features:
|
|
@@ -12,8 +12,8 @@
|
|
|
12
12
|
// - Parallel processing for multiple genes
|
|
13
13
|
// - JSON output with timing metrics
|
|
14
14
|
//
|
|
15
|
-
// Usage:
|
|
16
|
-
// HDF5_DIR=/usr/local/Homebrew/Cellar/hdf5/1.14.3_1 &&
|
|
15
|
+
// Usage:
|
|
16
|
+
// HDF5_DIR=/usr/local/Homebrew/Cellar/hdf5/1.14.3_1 &&
|
|
17
17
|
// echo $json='{"gene":"TP53","hdf5_file":"matrix.h5"}' | target/release/readHDF5
|
|
18
18
|
//------------------------------------------------------------------------------
|
|
19
19
|
use hdf5::types::{FixedAscii, VarLenAscii};
|
|
@@ -259,11 +259,7 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
259
259
|
// Create direct key-value pairs where sample names are the keys
|
|
260
260
|
for i in 0..gene_expression.len() {
|
|
261
261
|
// Add each sample name as a key pointing directly to its expression value
|
|
262
|
-
output_string += &format!(
|
|
263
|
-
"\"{}\":{}",
|
|
264
|
-
samples[i].to_string(),
|
|
265
|
-
gene_expression[i].to_string()
|
|
266
|
-
);
|
|
262
|
+
output_string += &format!("\"{}\":{}", samples[i].to_string(), gene_expression[i].to_string());
|
|
267
263
|
|
|
268
264
|
// Add comma if not the last item
|
|
269
265
|
if i < gene_expression.len() - 1 {
|
|
@@ -296,10 +292,7 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
296
292
|
Value::Null
|
|
297
293
|
};
|
|
298
294
|
|
|
299
|
-
samples_map.insert(
|
|
300
|
-
sample.replace("\\", ""),
|
|
301
|
-
value,
|
|
302
|
-
);
|
|
295
|
+
samples_map.insert(sample.replace("\\", ""), value);
|
|
303
296
|
}
|
|
304
297
|
}
|
|
305
298
|
|
|
@@ -317,7 +310,7 @@ fn query_gene_dense(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
317
310
|
|
|
318
311
|
/// Reads expression data for a specific gene from a sparse format HDF5 file
|
|
319
312
|
///
|
|
320
|
-
/// Extracts expression values from sparse matrix HDF5 files using Compressed
|
|
313
|
+
/// Extracts expression values from sparse matrix HDF5 files using Compressed
|
|
321
314
|
/// Sparse Column (CSC) structure.
|
|
322
315
|
///
|
|
323
316
|
/// # Arguments
|
|
@@ -391,15 +384,13 @@ fn query_gene_sparse(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
391
384
|
// Find all columns indices that are populated for the given gene
|
|
392
385
|
let now_i = Instant::now();
|
|
393
386
|
let ds_i = file.dataset("data/i")?;
|
|
394
|
-
let populated_column_ids: Array1<usize> =
|
|
395
|
-
ds_i.read_slice_1d(array_start_point..array_stop_point)?;
|
|
387
|
+
let populated_column_ids: Array1<usize> = ds_i.read_slice_1d(array_start_point..array_stop_point)?;
|
|
396
388
|
println!("Time for i dataset:{:?}", now_i.elapsed());
|
|
397
389
|
|
|
398
390
|
// Find all columns values that are populated for the given gene
|
|
399
391
|
let now_x = Instant::now();
|
|
400
392
|
let ds_x = file.dataset("data/x")?;
|
|
401
|
-
let populated_column_values: Array1<f64> =
|
|
402
|
-
ds_x.read_slice_1d(array_start_point..array_stop_point)?;
|
|
393
|
+
let populated_column_values: Array1<f64> = ds_x.read_slice_1d(array_start_point..array_stop_point)?;
|
|
403
394
|
println!("Time for x dataset:{:?}", now_x.elapsed());
|
|
404
395
|
|
|
405
396
|
// Generate the complete array from the sparse array
|
|
@@ -425,10 +416,7 @@ fn query_gene_sparse(hdf5_filename: String, gene_name: String) -> Result<()> {
|
|
|
425
416
|
}
|
|
426
417
|
output_string += &"}".to_string();
|
|
427
418
|
|
|
428
|
-
println!(
|
|
429
|
-
"Time generating full array:{:?}",
|
|
430
|
-
time_generating_full_array.elapsed()
|
|
431
|
-
);
|
|
419
|
+
println!("Time generating full array:{:?}", time_generating_full_array.elapsed());
|
|
432
420
|
println!("output_string:{}", output_string);
|
|
433
421
|
|
|
434
422
|
Ok(())
|
|
@@ -465,7 +453,6 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
|
|
|
465
453
|
}
|
|
466
454
|
};
|
|
467
455
|
|
|
468
|
-
|
|
469
456
|
let genes_dataset = match file.dataset("gene_ids") {
|
|
470
457
|
Ok(ds) => ds,
|
|
471
458
|
Err(err) => {
|
|
@@ -495,7 +482,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
|
|
|
495
482
|
};
|
|
496
483
|
|
|
497
484
|
let genes: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
|
|
498
|
-
|
|
485
|
+
|
|
499
486
|
// Only create HashMap for multiple gene queries
|
|
500
487
|
let gene_to_index: Option<std::collections::HashMap<String, usize>> = if gene_names.len() > 1 {
|
|
501
488
|
let hashmap_start_time = Instant::now();
|
|
@@ -504,8 +491,8 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
|
|
|
504
491
|
map.insert(gene.clone(), idx);
|
|
505
492
|
}
|
|
506
493
|
timings.insert(
|
|
507
|
-
"build_hashmap_ms".to_string(),
|
|
508
|
-
Value::from(hashmap_start_time.elapsed().as_millis() as u64)
|
|
494
|
+
"build_hashmap_ms".to_string(),
|
|
495
|
+
Value::from(hashmap_start_time.elapsed().as_millis() as u64),
|
|
509
496
|
);
|
|
510
497
|
Some(map)
|
|
511
498
|
} else {
|
|
@@ -586,14 +573,11 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
|
|
|
586
573
|
};
|
|
587
574
|
|
|
588
575
|
// Configurable thread count for testing
|
|
589
|
-
let thread_count = 2;
|
|
576
|
+
let thread_count = 2;
|
|
590
577
|
timings.insert("thread_count".to_string(), Value::from(thread_count));
|
|
591
578
|
|
|
592
579
|
// Create a scoped thread pool with specified number of threads
|
|
593
|
-
match rayon::ThreadPoolBuilder::new()
|
|
594
|
-
.num_threads(thread_count)
|
|
595
|
-
.build()
|
|
596
|
-
{
|
|
580
|
+
match rayon::ThreadPoolBuilder::new().num_threads(thread_count).build() {
|
|
597
581
|
Ok(pool) => {
|
|
598
582
|
// Use the pool for this specific work
|
|
599
583
|
pool.install(|| {
|
|
@@ -650,26 +634,20 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
|
|
|
650
634
|
genes_map.insert(gene_name.clone(), gene_data);
|
|
651
635
|
} else {
|
|
652
636
|
// Fallback to per-gene reading if bulk load failed
|
|
653
|
-
match counts_dataset
|
|
654
|
-
.read_slice_1d::<f64, _>(s![gene_index, ..])
|
|
655
|
-
{
|
|
637
|
+
match counts_dataset.read_slice_1d::<f64, _>(s![gene_index, ..]) {
|
|
656
638
|
Ok(gene_expression) => {
|
|
657
639
|
// Create samples map for this gene
|
|
658
640
|
let mut samples_map = Map::new();
|
|
659
641
|
for (i, sample) in samples.iter().enumerate() {
|
|
660
642
|
if i < gene_expression.len() {
|
|
661
643
|
// Handle potential NaN or infinity values
|
|
662
|
-
let value =
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
samples_map.insert(
|
|
670
|
-
sample.replace("\\", ""),
|
|
671
|
-
value,
|
|
672
|
-
);
|
|
644
|
+
let value = if gene_expression[i].is_finite() {
|
|
645
|
+
Value::from(gene_expression[i])
|
|
646
|
+
} else {
|
|
647
|
+
Value::Null
|
|
648
|
+
};
|
|
649
|
+
|
|
650
|
+
samples_map.insert(sample.replace("\\", ""), value);
|
|
673
651
|
}
|
|
674
652
|
}
|
|
675
653
|
|
|
@@ -693,10 +671,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
|
|
|
693
671
|
);
|
|
694
672
|
|
|
695
673
|
let mut genes_map = genes_map.lock().unwrap();
|
|
696
|
-
genes_map.insert(
|
|
697
|
-
gene_name.clone(),
|
|
698
|
-
Value::Object(error_map),
|
|
699
|
-
);
|
|
674
|
+
genes_map.insert(gene_name.clone(), Value::Object(error_map));
|
|
700
675
|
}
|
|
701
676
|
}
|
|
702
677
|
}
|
|
@@ -736,7 +711,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
|
|
|
736
711
|
&counts_dataset,
|
|
737
712
|
&all_gene_data,
|
|
738
713
|
&samples,
|
|
739
|
-
&genes_map
|
|
714
|
+
&genes_map,
|
|
740
715
|
);
|
|
741
716
|
}
|
|
742
717
|
}
|
|
@@ -758,7 +733,6 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
|
|
|
758
733
|
// Read just this single gene's data directly
|
|
759
734
|
match counts_dataset.read_slice_1d::<f64, _>(s![gene_index, ..]) {
|
|
760
735
|
Ok(gene_expression) => {
|
|
761
|
-
|
|
762
736
|
// Create samples map for this gene
|
|
763
737
|
let mut samples_map = Map::new();
|
|
764
738
|
for (i, sample) in samples.iter().enumerate() {
|
|
@@ -786,10 +760,7 @@ fn query_multiple_genes_dense(hdf5_filename: String, gene_names: Vec<String>) ->
|
|
|
786
760
|
let mut error_map = Map::new();
|
|
787
761
|
error_map.insert(
|
|
788
762
|
"error".to_string(),
|
|
789
|
-
Value::String(format!(
|
|
790
|
-
"Failed to read expression values: {:?}",
|
|
791
|
-
err
|
|
792
|
-
)),
|
|
763
|
+
Value::String(format!("Failed to read expression values: {:?}", err)),
|
|
793
764
|
);
|
|
794
765
|
|
|
795
766
|
let mut genes_map = genes_map.lock().unwrap();
|
|
@@ -833,7 +804,7 @@ fn process_genes_sequentially(
|
|
|
833
804
|
counts_dataset: &hdf5::Dataset,
|
|
834
805
|
all_gene_data: &Option<ndarray::ArrayBase<ndarray::OwnedRepr<f64>, ndarray::Dim<[usize; 2]>>>,
|
|
835
806
|
samples: &Vec<String>,
|
|
836
|
-
genes_map: &Arc<std::sync::Mutex<Map<String, Value
|
|
807
|
+
genes_map: &Arc<std::sync::Mutex<Map<String, Value>>>,
|
|
837
808
|
) {
|
|
838
809
|
for gene_name in gene_names {
|
|
839
810
|
// Find the index of the requested gene, using HashMap if available
|
|
@@ -911,10 +882,7 @@ fn process_genes_sequentially(
|
|
|
911
882
|
let mut error_map = Map::new();
|
|
912
883
|
error_map.insert(
|
|
913
884
|
"error".to_string(),
|
|
914
|
-
Value::String(format!(
|
|
915
|
-
"Failed to read expression values: {:?}",
|
|
916
|
-
err1
|
|
917
|
-
)),
|
|
885
|
+
Value::String(format!("Failed to read expression values: {:?}", err1)),
|
|
918
886
|
);
|
|
919
887
|
|
|
920
888
|
let mut genes_map = genes_map.lock().unwrap();
|
|
@@ -935,7 +903,6 @@ fn process_genes_sequentially(
|
|
|
935
903
|
genes_map.insert(gene_name.clone(), Value::Object(error_map));
|
|
936
904
|
}
|
|
937
905
|
}
|
|
938
|
-
|
|
939
906
|
}
|
|
940
907
|
}
|
|
941
908
|
/// Queries expression data for multiple genes from a sparse format HDF5 file
|
|
@@ -1006,7 +973,6 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -
|
|
|
1006
973
|
let num_threads = num_cpus::get();
|
|
1007
974
|
timings.insert("num_threads".to_string(), Value::from(num_threads as u64));
|
|
1008
975
|
|
|
1009
|
-
|
|
1010
976
|
// Thread-safe maps for results
|
|
1011
977
|
let genes_map = Arc::new(std::sync::Mutex::new(Map::new()));
|
|
1012
978
|
let gene_timings = Arc::new(std::sync::Mutex::new(Map::new()));
|
|
@@ -1041,8 +1007,7 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -
|
|
|
1041
1007
|
// Read data for this gene
|
|
1042
1008
|
match ds_i.read_slice_1d::<usize, _>(array_start_point..array_stop_point) {
|
|
1043
1009
|
Ok(populated_column_ids) => {
|
|
1044
|
-
match ds_x.read_slice_1d::<f64, _>(array_start_point..array_stop_point)
|
|
1045
|
-
{
|
|
1010
|
+
match ds_x.read_slice_1d::<f64, _>(array_start_point..array_stop_point) {
|
|
1046
1011
|
Ok(populated_column_values) => {
|
|
1047
1012
|
// Generate the complete array from sparse representation
|
|
1048
1013
|
let mut gene_array: Array1<f64> = Array1::zeros(num_samples);
|
|
@@ -1061,8 +1026,7 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -
|
|
|
1061
1026
|
Value::Null
|
|
1062
1027
|
};
|
|
1063
1028
|
|
|
1064
|
-
samples_map
|
|
1065
|
-
.insert(sample.to_string().replace("\\", ""), value);
|
|
1029
|
+
samples_map.insert(sample.to_string().replace("\\", ""), value);
|
|
1066
1030
|
}
|
|
1067
1031
|
|
|
1068
1032
|
let gene_data = json!({
|
|
@@ -1077,10 +1041,7 @@ fn query_multiple_genes_sparse(hdf5_filename: String, gene_names: Vec<String>) -
|
|
|
1077
1041
|
let mut error_map = Map::new();
|
|
1078
1042
|
error_map.insert(
|
|
1079
1043
|
"error".to_string(),
|
|
1080
|
-
Value::String(format!(
|
|
1081
|
-
"Failed to read x dataset: {:?}",
|
|
1082
|
-
err
|
|
1083
|
-
)),
|
|
1044
|
+
Value::String(format!("Failed to read x dataset: {:?}", err)),
|
|
1084
1045
|
);
|
|
1085
1046
|
|
|
1086
1047
|
let mut genes_map = genes_map.lock().unwrap();
|
package/src/test.rs
ADDED