@sjcrh/proteinpaint-rust 2.141.0 → 2.143.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.141.0",
2
+ "version": "2.143.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "type": "module",
5
5
  "description": "Rust-based utilities for proteinpaint",
@@ -8,8 +8,12 @@ Various JSON parameters:
8
8
  filter_extreme_values: boolean (true/false). When true, genes are filtered according to the filterByExpr logic in edgeR, which removes genes with very low counts.
9
9
  num_genes: The top num_genes (e.g. 10) that need to be reported in the output.
10
10
  rank_type: var/iqr . This parameter decides whether to sort genes using variance or interquartile range. There is an article which states that it's better to use interquartile range than variance for selecting genes for clustering https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
11
+ newformat?: bool. Optional. When true, the input HDF5 file is read using the new HDF5 format layout.
11
12
 
12
13
  Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","min_count":30,"min_total_count":20,"input_file":"/path/to/input/file.h5","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' && time echo $json | target/release/gene_variance
14
+
15
+ Usage for the new HDF5 format:
16
+ echo '{"samples":"sample1,sample2,sample3","newformat":true,"min_count":30,"min_total_count":20,"input_file":"/path/to/input/file.h5","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' | ./target/release/topGeneByExpressionVariance
13
17
  */
14
18
  #![allow(non_snake_case)]
15
19
  use bgzip::BGZFReader;
@@ -30,7 +34,7 @@ use std::io;
30
34
  use std::io::Read;
31
35
  use std::str::FromStr;
32
36
  // use std::time::Instant;
33
- use hdf5::types::VarLenAscii;
37
+ use hdf5::types::{VarLenAscii, VarLenUnicode};
34
38
  use hdf5::{File, Result};
35
39
  use ndarray::Dim;
36
40
 
@@ -53,10 +57,7 @@ use ndarray::Dim;
53
57
  fn input_data_hdf5(
54
58
  filename: &String,
55
59
  sample_list: &Vec<&str>,
56
- ) -> Result<(
57
- Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
58
- Vec<String>,
59
- )> {
60
+ ) -> Result<(Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<String>)> {
60
61
  // let now = Instant::now();
61
62
  // eprintln!("Reading HDF5 file: {}", filename);
62
63
 
@@ -73,10 +74,7 @@ fn input_data_hdf5(
73
74
  // "file_path": filename
74
75
  // })
75
76
  // );
76
- return Err(hdf5::Error::Internal(format!(
77
- "Failed to open HDF5 file: {}",
78
- err
79
- )));
77
+ return Err(hdf5::Error::Internal(format!("Failed to open HDF5 file: {}", err)));
80
78
  }
81
79
  };
82
80
 
@@ -113,10 +111,7 @@ fn input_data_hdf5(
113
111
  // "file_path": filename
114
112
  // })
115
113
  // );
116
- return Err(hdf5::Error::Internal(format!(
117
- "Failed to read gene symbols: {}",
118
- err
119
- )));
114
+ return Err(hdf5::Error::Internal(format!("Failed to read gene symbols: {}", err)));
120
115
  }
121
116
  };
122
117
 
@@ -158,10 +153,7 @@ fn input_data_hdf5(
158
153
  "file_path": filename
159
154
  })
160
155
  );
161
- return Err(hdf5::Error::Internal(format!(
162
- "Failed to read sample names: {}",
163
- err
164
- )));
156
+ return Err(hdf5::Error::Internal(format!("Failed to read sample names: {}", err)));
165
157
  }
166
158
  };
167
159
 
@@ -205,10 +197,7 @@ fn input_data_hdf5(
205
197
  // "file_path": filename
206
198
  // })
207
199
  // );
208
- return Err(hdf5::Error::Internal(format!(
209
- "Failed to open counts dataset: {}",
210
- err
211
- )));
200
+ return Err(hdf5::Error::Internal(format!("Failed to open counts dataset: {}", err)));
212
201
  }
213
202
  };
214
203
 
@@ -225,9 +214,7 @@ fn input_data_hdf5(
225
214
  // "actual_shape": dataset_shape
226
215
  // })
227
216
  // );
228
- return Err(hdf5::Error::Internal(
229
- "Expected a 2D dataset for counts".to_string(),
230
- ));
217
+ return Err(hdf5::Error::Internal("Expected a 2D dataset for counts".to_string()));
231
218
  }
232
219
 
233
220
  // Check dimensions match expected values
@@ -319,14 +306,154 @@ fn input_data_hdf5(
319
306
  Ok((dm, gene_names))
320
307
  }
321
308
 
309
+ // Similar to input_data_hdf5, but specifically for new H5 format
310
+ fn input_data_hdf5_newformat(
311
+ filename: &String,
312
+ sample_list: &Vec<&str>,
313
+ ) -> Result<(Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<String>)> {
314
+ // Open the HDF5 file
315
+ let file = match File::open(filename) {
316
+ Ok(f) => f,
317
+ Err(err) => {
318
+ return Err(hdf5::Error::Internal(format!("Failed to open HDF5 file: {}", err)));
319
+ }
320
+ };
321
+
322
+ // Read gene symbols dataset
323
+ let genes_dataset = match file.dataset("item") {
324
+ Ok(ds) => ds,
325
+ Err(err) => {
326
+ return Err(hdf5::Error::Internal(format!(
327
+ "Failed to open gene_names dataset: {}",
328
+ err
329
+ )));
330
+ }
331
+ };
332
+
333
+ // Read genes as VarLenUnicode
334
+ let genes_varlen = match genes_dataset.read_1d::<VarLenUnicode>() {
335
+ Ok(g) => g,
336
+ Err(err) => {
337
+ return Err(hdf5::Error::Internal(format!("Failed to read gene symbols: {}", err)));
338
+ }
339
+ };
340
+
341
+ // Convert to Vec<String> for easier handling
342
+ let gene_names: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
343
+ let num_genes = gene_names.len();
344
+
345
+ // Read sample names
346
+ let samples_dataset = match file.dataset("samples") {
347
+ Ok(ds) => ds,
348
+ Err(err) => {
349
+ println!(
350
+ "{}",
351
+ serde_json::json!({
352
+ "status": "error",
353
+ "message": format!("Failed to open samples dataset: {}", err),
354
+ "file_path": filename
355
+ })
356
+ );
357
+ return Err(hdf5::Error::Internal(format!(
358
+ "Failed to open samples dataset: {}",
359
+ err
360
+ )));
361
+ }
362
+ };
363
+
364
+ // Read samples as VarLenUnicode
365
+ let samples_varlen = match samples_dataset.read_1d::<VarLenUnicode>() {
366
+ Ok(s) => s,
367
+ Err(err) => {
368
+ // eprintln!("Failed to read sample names: {}", err);
369
+ println!(
370
+ "{}",
371
+ serde_json::json!({
372
+ "status": "error",
373
+ "message": format!("Failed to read sample names: {}", err),
374
+ "file_path": filename
375
+ })
376
+ );
377
+ return Err(hdf5::Error::Internal(format!("Failed to read sample names: {}", err)));
378
+ }
379
+ };
380
+
381
+ // Convert to Vec<String> for easier handling
382
+ let all_samples: Vec<String> = samples_varlen.iter().map(|s| s.to_string()).collect();
383
+
384
+ // Find indices of requested samples
385
+ let mut column_indices: Vec<usize> = Vec::with_capacity(sample_list.len());
386
+ for sample in sample_list {
387
+ if let Some(index) = all_samples.iter().position(|s| s == sample) {
388
+ column_indices.push(index);
389
+ } else {
390
+ return Err(hdf5::Error::Internal(format!(
391
+ "Sample '{}' not found in the dataset",
392
+ sample
393
+ )));
394
+ }
395
+ }
396
+
397
+ // Read the counts dataset
398
+ let counts_dataset = match file.dataset("matrix") {
399
+ Ok(ds) => ds,
400
+ Err(err) => {
401
+ return Err(hdf5::Error::Internal(format!("Failed to open counts dataset: {}", err)));
402
+ }
403
+ };
404
+
405
+ // Get dataset dimensions for validation
406
+ let dataset_shape = counts_dataset.shape();
407
+ if dataset_shape.len() != 2 {
408
+ return Err(hdf5::Error::Internal("Expected a 2D dataset for counts".to_string()));
409
+ };
410
+
411
+ // Check dimensions match expected values
412
+ if dataset_shape[0] != num_genes {
413
+ return Err(hdf5::Error::Internal(format!(
414
+ "Counts dataset first dimension ({}) doesn't match number of genes ({})",
415
+ dataset_shape[0], num_genes
416
+ )));
417
+ };
418
+
419
+ if dataset_shape[1] != all_samples.len() {
420
+ return Err(hdf5::Error::Internal(format!(
421
+ "Counts dataset second dimension ({}) doesn't match number of samples ({})",
422
+ dataset_shape[1],
423
+ all_samples.len()
424
+ )));
425
+ };
426
+
427
+ // Read the counts dataset
428
+ let all_counts = match counts_dataset.read::<f64, Dim<[usize; 2]>>() {
429
+ Ok(data) => data,
430
+ Err(err) => {
431
+ return Err(hdf5::Error::Internal(format!(
432
+ "Failed to read expression data: {}",
433
+ err
434
+ )));
435
+ }
436
+ };
437
+
438
+ let mut input_vector: Vec<f64> = Vec::with_capacity(num_genes * sample_list.len());
439
+
440
+ for gene_idx in 0..num_genes {
441
+ for &col_idx in &column_indices {
442
+ input_vector.push(all_counts[[gene_idx, col_idx]]);
443
+ }
444
+ }
445
+
446
+ // Create matrix from the extracted data
447
+ let dm = DMatrix::from_row_slice(num_genes, sample_list.len(), &input_vector);
448
+
449
+ Ok((dm, gene_names))
450
+ }
451
+
322
452
  // The original input_data function for text files is kept as is
323
453
  fn input_data(
324
454
  filename: &String,
325
455
  sample_list: &Vec<&str>,
326
- ) -> (
327
- Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
328
- Vec<String>,
329
- ) {
456
+ ) -> (Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<String>) {
330
457
  // Build the CSV reader and iterate over each record.
331
458
  let mut reader = BGZFReader::new(fs::File::open(filename).unwrap()).unwrap();
332
459
  let mut num_lines: usize = 0;
@@ -472,10 +599,7 @@ fn calculate_variance(
472
599
  if rank_type == "var" {
473
600
  // Calculating variance
474
601
  if gene_counts.clone().variance().is_nan() == true {
475
- } else if filter_extreme_values == true
476
- && keep_cpm_bool == true
477
- && keep_total_bool == true
478
- {
602
+ } else if filter_extreme_values == true && keep_cpm_bool == true && keep_total_bool == true {
479
603
  gene_infos.push(GeneInfo {
480
604
  rank_type: gene_counts.variance(),
481
605
  gene_symbol: gene_names[row].clone(),
@@ -490,10 +614,7 @@ fn calculate_variance(
490
614
  // Calculating interquartile region
491
615
  let mut gene_counts_data = Data::new(gene_counts);
492
616
  if gene_counts_data.clone().interquartile_range().is_nan() == true {
493
- } else if filter_extreme_values == true
494
- && keep_cpm_bool == true
495
- && keep_total_bool == true
496
- {
617
+ } else if filter_extreme_values == true && keep_cpm_bool == true && keep_total_bool == true {
497
618
  gene_infos.push(GeneInfo {
498
619
  rank_type: gene_counts_data.interquartile_range(),
499
620
  gene_symbol: gene_names[row].clone(),
@@ -506,11 +627,9 @@ fn calculate_variance(
506
627
  }
507
628
  }
508
629
  }
509
- gene_infos.as_mut_slice().sort_by(|a, b| {
510
- (a.rank_type)
511
- .partial_cmp(&b.rank_type)
512
- .unwrap_or(Ordering::Equal)
513
- });
630
+ gene_infos
631
+ .as_mut_slice()
632
+ .sort_by(|a, b| (a.rank_type).partial_cmp(&b.rank_type).unwrap_or(Ordering::Equal));
514
633
  gene_infos
515
634
  }
516
635
 
@@ -527,8 +646,7 @@ fn cpm(
527
646
  for col in 0..input_matrix.ncols() {
528
647
  let norm_factor = column_sums[(0, col)];
529
648
  for row in 0..input_matrix.nrows() {
530
- output_matrix[(row, col)] =
531
- (input_matrix[(row, col)] as f64 * 1000000.0) / norm_factor as f64;
649
+ output_matrix[(row, col)] = (input_matrix[(row, col)] as f64 * 1000000.0) / norm_factor as f64;
532
650
  }
533
651
  }
534
652
  //println!("output_matrix:{:?}", output_matrix);
@@ -605,6 +723,14 @@ fn main() {
605
723
  // eprintln!("Using default text file format (no .h5 extension found)");
606
724
  }
607
725
 
726
+ // Determine if the H5 file is new format
727
+ let new_format: bool = match &json_string {
728
+ json::JsonValue::Object(ref obj) => {
729
+ obj.get("newformat").and_then(|v| v.as_bool()).map_or(false, |b| b)
730
+ }
731
+ _ => false,
732
+ };
733
+
608
734
  let rank_type = &json_string["rank_type"] // Value provide must be either "var" or "iqr"
609
735
  .to_owned()
610
736
  .as_str()
@@ -691,15 +817,25 @@ fn main() {
691
817
  // eprintln!("Reading data from {} file: {}", file_type, file_name);
692
818
  let (input_matrix, gene_names) = if file_type == "hdf5" {
693
819
  // eprintln!("Using HDF5 reader function...");
694
- match input_data_hdf5(&file_name, &samples_list) {
695
- Ok(result) => {
696
- // eprintln!("Successfully read HDF5 data");
697
- result
820
+ if new_format {
821
+ match input_data_hdf5_newformat(&file_name, &samples_list) {
822
+ Ok(result) => result,
823
+ Err(err) => {
824
+ eprintln!("ERROR in HDF5 new format reader: {:?}", err);
825
+ return;
826
+ }
698
827
  }
699
- Err(err) => {
700
- eprintln!("ERROR in HDF5 reader: {:?}", err);
701
- // Error has already been printed to stdout in JSON format by the function
702
- return;
828
+ } else {
829
+ match input_data_hdf5(&file_name, &samples_list) {
830
+ Ok(result) => {
831
+ // eprintln!("Successfully read HDF5 data");
832
+ result
833
+ }
834
+ Err(err) => {
835
+ eprintln!("ERROR in HDF5 reader: {:?}", err);
836
+ // Error has already been printed to stdout in JSON format by the function
837
+ return;
838
+ }
703
839
  }
704
840
  }
705
841
  } else {