@sjcrh/proteinpaint-rust 2.140.1 → 2.142.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/topGeneByExpressionVariance.rs +187 -51
package/package.json
CHANGED
|
@@ -8,8 +8,12 @@ Various JSON parameters:
|
|
|
8
8
|
filter_extreme_values: boolean (true/false). When true, genes are filtered according to the filterByExpr logic in edgeR, which removes genes that have very low counts.
|
|
9
9
|
num_genes: The number of top-ranked genes (e.g. 10) to report in the output.
|
|
10
10
|
rank_type: var/iqr. This parameter decides whether to rank genes by variance or by interquartile range (IQR). There is an article which states that it's better to use the interquartile range than the variance when selecting genes for clustering: https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
|
|
11
|
+
newformat?: bool. Used to support new format HDF5
|
|
11
12
|
|
|
12
13
|
Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","min_count":30,"min_total_count":20,"input_file":"/path/to/input/file.h5","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' && time echo $json | target/release/gene_variance
|
|
14
|
+
|
|
15
|
+
Usage for new format HDF5
|
|
16
|
+
echo '{"samples":"sample1,sample2,sample3","newformat":true,"min_count":30,"min_total_count":20,"input_file":"/path/to/input/file.h5","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' | ./target/release/topGeneByExpressionVariance
|
|
13
17
|
*/
|
|
14
18
|
#![allow(non_snake_case)]
|
|
15
19
|
use bgzip::BGZFReader;
|
|
@@ -30,7 +34,7 @@ use std::io;
|
|
|
30
34
|
use std::io::Read;
|
|
31
35
|
use std::str::FromStr;
|
|
32
36
|
// use std::time::Instant;
|
|
33
|
-
use hdf5::types::VarLenAscii;
|
|
37
|
+
use hdf5::types::{VarLenAscii, VarLenUnicode};
|
|
34
38
|
use hdf5::{File, Result};
|
|
35
39
|
use ndarray::Dim;
|
|
36
40
|
|
|
@@ -53,10 +57,7 @@ use ndarray::Dim;
|
|
|
53
57
|
fn input_data_hdf5(
|
|
54
58
|
filename: &String,
|
|
55
59
|
sample_list: &Vec<&str>,
|
|
56
|
-
) -> Result<(
|
|
57
|
-
Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
58
|
-
Vec<String>,
|
|
59
|
-
)> {
|
|
60
|
+
) -> Result<(Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<String>)> {
|
|
60
61
|
// let now = Instant::now();
|
|
61
62
|
// eprintln!("Reading HDF5 file: {}", filename);
|
|
62
63
|
|
|
@@ -73,10 +74,7 @@ fn input_data_hdf5(
|
|
|
73
74
|
// "file_path": filename
|
|
74
75
|
// })
|
|
75
76
|
// );
|
|
76
|
-
return Err(hdf5::Error::Internal(format!(
|
|
77
|
-
"Failed to open HDF5 file: {}",
|
|
78
|
-
err
|
|
79
|
-
)));
|
|
77
|
+
return Err(hdf5::Error::Internal(format!("Failed to open HDF5 file: {}", err)));
|
|
80
78
|
}
|
|
81
79
|
};
|
|
82
80
|
|
|
@@ -113,10 +111,7 @@ fn input_data_hdf5(
|
|
|
113
111
|
// "file_path": filename
|
|
114
112
|
// })
|
|
115
113
|
// );
|
|
116
|
-
return Err(hdf5::Error::Internal(format!(
|
|
117
|
-
"Failed to read gene symbols: {}",
|
|
118
|
-
err
|
|
119
|
-
)));
|
|
114
|
+
return Err(hdf5::Error::Internal(format!("Failed to read gene symbols: {}", err)));
|
|
120
115
|
}
|
|
121
116
|
};
|
|
122
117
|
|
|
@@ -158,10 +153,7 @@ fn input_data_hdf5(
|
|
|
158
153
|
"file_path": filename
|
|
159
154
|
})
|
|
160
155
|
);
|
|
161
|
-
return Err(hdf5::Error::Internal(format!(
|
|
162
|
-
"Failed to read sample names: {}",
|
|
163
|
-
err
|
|
164
|
-
)));
|
|
156
|
+
return Err(hdf5::Error::Internal(format!("Failed to read sample names: {}", err)));
|
|
165
157
|
}
|
|
166
158
|
};
|
|
167
159
|
|
|
@@ -205,10 +197,7 @@ fn input_data_hdf5(
|
|
|
205
197
|
// "file_path": filename
|
|
206
198
|
// })
|
|
207
199
|
// );
|
|
208
|
-
return Err(hdf5::Error::Internal(format!(
|
|
209
|
-
"Failed to open counts dataset: {}",
|
|
210
|
-
err
|
|
211
|
-
)));
|
|
200
|
+
return Err(hdf5::Error::Internal(format!("Failed to open counts dataset: {}", err)));
|
|
212
201
|
}
|
|
213
202
|
};
|
|
214
203
|
|
|
@@ -225,9 +214,7 @@ fn input_data_hdf5(
|
|
|
225
214
|
// "actual_shape": dataset_shape
|
|
226
215
|
// })
|
|
227
216
|
// );
|
|
228
|
-
return Err(hdf5::Error::Internal(
|
|
229
|
-
"Expected a 2D dataset for counts".to_string(),
|
|
230
|
-
));
|
|
217
|
+
return Err(hdf5::Error::Internal("Expected a 2D dataset for counts".to_string()));
|
|
231
218
|
}
|
|
232
219
|
|
|
233
220
|
// Check dimensions match expected values
|
|
@@ -319,14 +306,154 @@ fn input_data_hdf5(
|
|
|
319
306
|
Ok((dm, gene_names))
|
|
320
307
|
}
|
|
321
308
|
|
|
309
|
+
// Similar to input_data_hdf5, but specifically for new H5 format
|
|
310
|
+
fn input_data_hdf5_newformat(
|
|
311
|
+
filename: &String,
|
|
312
|
+
sample_list: &Vec<&str>,
|
|
313
|
+
) -> Result<(Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<String>)> {
|
|
314
|
+
// Open the HDF5 file
|
|
315
|
+
let file = match File::open(filename) {
|
|
316
|
+
Ok(f) => f,
|
|
317
|
+
Err(err) => {
|
|
318
|
+
return Err(hdf5::Error::Internal(format!("Failed to open HDF5 file: {}", err)));
|
|
319
|
+
}
|
|
320
|
+
};
|
|
321
|
+
|
|
322
|
+
// Read gene symbols dataset
|
|
323
|
+
let genes_dataset = match file.dataset("item") {
|
|
324
|
+
Ok(ds) => ds,
|
|
325
|
+
Err(err) => {
|
|
326
|
+
return Err(hdf5::Error::Internal(format!(
|
|
327
|
+
"Failed to open gene_names dataset: {}",
|
|
328
|
+
err
|
|
329
|
+
)));
|
|
330
|
+
}
|
|
331
|
+
};
|
|
332
|
+
|
|
333
|
+
// Read genes as VarLenAscii
|
|
334
|
+
let genes_varlen = match genes_dataset.read_1d::<VarLenUnicode>() {
|
|
335
|
+
Ok(g) => g,
|
|
336
|
+
Err(err) => {
|
|
337
|
+
return Err(hdf5::Error::Internal(format!("Failed to read gene symbols: {}", err)));
|
|
338
|
+
}
|
|
339
|
+
};
|
|
340
|
+
|
|
341
|
+
// Convert to Vec<String> for easier handling
|
|
342
|
+
let gene_names: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
|
|
343
|
+
let num_genes = gene_names.len();
|
|
344
|
+
|
|
345
|
+
// Read sample names
|
|
346
|
+
let samples_dataset = match file.dataset("samples") {
|
|
347
|
+
Ok(ds) => ds,
|
|
348
|
+
Err(err) => {
|
|
349
|
+
println!(
|
|
350
|
+
"{}",
|
|
351
|
+
serde_json::json!({
|
|
352
|
+
"status": "error",
|
|
353
|
+
"message": format!("Failed to open samples dataset: {}", err),
|
|
354
|
+
"file_path": filename
|
|
355
|
+
})
|
|
356
|
+
);
|
|
357
|
+
return Err(hdf5::Error::Internal(format!(
|
|
358
|
+
"Failed to open samples dataset: {}",
|
|
359
|
+
err
|
|
360
|
+
)));
|
|
361
|
+
}
|
|
362
|
+
};
|
|
363
|
+
|
|
364
|
+
// Read samples as VarLenAscii
|
|
365
|
+
let samples_varlen = match samples_dataset.read_1d::<VarLenUnicode>() {
|
|
366
|
+
Ok(s) => s,
|
|
367
|
+
Err(err) => {
|
|
368
|
+
// eprintln!("Failed to read sample names: {}", err);
|
|
369
|
+
println!(
|
|
370
|
+
"{}",
|
|
371
|
+
serde_json::json!({
|
|
372
|
+
"status": "error",
|
|
373
|
+
"message": format!("Failed to read sample names: {}", err),
|
|
374
|
+
"file_path": filename
|
|
375
|
+
})
|
|
376
|
+
);
|
|
377
|
+
return Err(hdf5::Error::Internal(format!("Failed to read sample names: {}", err)));
|
|
378
|
+
}
|
|
379
|
+
};
|
|
380
|
+
|
|
381
|
+
// Convert to Vec<String> for easier handling
|
|
382
|
+
let all_samples: Vec<String> = samples_varlen.iter().map(|s| s.to_string()).collect();
|
|
383
|
+
|
|
384
|
+
// Find indices of requested samples
|
|
385
|
+
let mut column_indices: Vec<usize> = Vec::with_capacity(sample_list.len());
|
|
386
|
+
for sample in sample_list {
|
|
387
|
+
if let Some(index) = all_samples.iter().position(|s| s == sample) {
|
|
388
|
+
column_indices.push(index);
|
|
389
|
+
} else {
|
|
390
|
+
return Err(hdf5::Error::Internal(format!(
|
|
391
|
+
"Sample '{}' not found in the dataset",
|
|
392
|
+
sample
|
|
393
|
+
)));
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// Read the counts dataset
|
|
398
|
+
let counts_dataset = match file.dataset("matrix") {
|
|
399
|
+
Ok(ds) => ds,
|
|
400
|
+
Err(err) => {
|
|
401
|
+
return Err(hdf5::Error::Internal(format!("Failed to open counts dataset: {}", err)));
|
|
402
|
+
}
|
|
403
|
+
};
|
|
404
|
+
|
|
405
|
+
// Get dataset dimensions for validation
|
|
406
|
+
let dataset_shape = counts_dataset.shape();
|
|
407
|
+
if dataset_shape.len() != 2 {
|
|
408
|
+
return Err(hdf5::Error::Internal("Expected a 2D dataset for counts".to_string()));
|
|
409
|
+
};
|
|
410
|
+
|
|
411
|
+
// Check dimensions match expected values
|
|
412
|
+
if dataset_shape[0] != num_genes {
|
|
413
|
+
return Err(hdf5::Error::Internal(format!(
|
|
414
|
+
"Counts dataset first dimension ({}) doesn't match number of genes ({})",
|
|
415
|
+
dataset_shape[0], num_genes
|
|
416
|
+
)));
|
|
417
|
+
};
|
|
418
|
+
|
|
419
|
+
if dataset_shape[1] != all_samples.len() {
|
|
420
|
+
return Err(hdf5::Error::Internal(format!(
|
|
421
|
+
"Counts dataset second dimension ({}) doesn't match number of samples ({})",
|
|
422
|
+
dataset_shape[1],
|
|
423
|
+
all_samples.len()
|
|
424
|
+
)));
|
|
425
|
+
};
|
|
426
|
+
|
|
427
|
+
// Read the counts dataset
|
|
428
|
+
let all_counts = match counts_dataset.read::<f64, Dim<[usize; 2]>>() {
|
|
429
|
+
Ok(data) => data,
|
|
430
|
+
Err(err) => {
|
|
431
|
+
return Err(hdf5::Error::Internal(format!(
|
|
432
|
+
"Failed to read expression data: {}",
|
|
433
|
+
err
|
|
434
|
+
)));
|
|
435
|
+
}
|
|
436
|
+
};
|
|
437
|
+
|
|
438
|
+
let mut input_vector: Vec<f64> = Vec::with_capacity(num_genes * sample_list.len());
|
|
439
|
+
|
|
440
|
+
for gene_idx in 0..num_genes {
|
|
441
|
+
for &col_idx in &column_indices {
|
|
442
|
+
input_vector.push(all_counts[[gene_idx, col_idx]]);
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
// Create matrix from the extracted data
|
|
447
|
+
let dm = DMatrix::from_row_slice(num_genes, sample_list.len(), &input_vector);
|
|
448
|
+
|
|
449
|
+
Ok((dm, gene_names))
|
|
450
|
+
}
|
|
451
|
+
|
|
322
452
|
// The original input_data function for text files is kept as is
|
|
323
453
|
fn input_data(
|
|
324
454
|
filename: &String,
|
|
325
455
|
sample_list: &Vec<&str>,
|
|
326
|
-
) -> (
|
|
327
|
-
Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
328
|
-
Vec<String>,
|
|
329
|
-
) {
|
|
456
|
+
) -> (Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<String>) {
|
|
330
457
|
// Build the CSV reader and iterate over each record.
|
|
331
458
|
let mut reader = BGZFReader::new(fs::File::open(filename).unwrap()).unwrap();
|
|
332
459
|
let mut num_lines: usize = 0;
|
|
@@ -472,10 +599,7 @@ fn calculate_variance(
|
|
|
472
599
|
if rank_type == "var" {
|
|
473
600
|
// Calculating variance
|
|
474
601
|
if gene_counts.clone().variance().is_nan() == true {
|
|
475
|
-
} else if filter_extreme_values == true
|
|
476
|
-
&& keep_cpm_bool == true
|
|
477
|
-
&& keep_total_bool == true
|
|
478
|
-
{
|
|
602
|
+
} else if filter_extreme_values == true && keep_cpm_bool == true && keep_total_bool == true {
|
|
479
603
|
gene_infos.push(GeneInfo {
|
|
480
604
|
rank_type: gene_counts.variance(),
|
|
481
605
|
gene_symbol: gene_names[row].clone(),
|
|
@@ -490,10 +614,7 @@ fn calculate_variance(
|
|
|
490
614
|
// Calculating interquartile region
|
|
491
615
|
let mut gene_counts_data = Data::new(gene_counts);
|
|
492
616
|
if gene_counts_data.clone().interquartile_range().is_nan() == true {
|
|
493
|
-
} else if filter_extreme_values == true
|
|
494
|
-
&& keep_cpm_bool == true
|
|
495
|
-
&& keep_total_bool == true
|
|
496
|
-
{
|
|
617
|
+
} else if filter_extreme_values == true && keep_cpm_bool == true && keep_total_bool == true {
|
|
497
618
|
gene_infos.push(GeneInfo {
|
|
498
619
|
rank_type: gene_counts_data.interquartile_range(),
|
|
499
620
|
gene_symbol: gene_names[row].clone(),
|
|
@@ -506,11 +627,9 @@ fn calculate_variance(
|
|
|
506
627
|
}
|
|
507
628
|
}
|
|
508
629
|
}
|
|
509
|
-
gene_infos
|
|
510
|
-
(
|
|
511
|
-
|
|
512
|
-
.unwrap_or(Ordering::Equal)
|
|
513
|
-
});
|
|
630
|
+
gene_infos
|
|
631
|
+
.as_mut_slice()
|
|
632
|
+
.sort_by(|a, b| (a.rank_type).partial_cmp(&b.rank_type).unwrap_or(Ordering::Equal));
|
|
514
633
|
gene_infos
|
|
515
634
|
}
|
|
516
635
|
|
|
@@ -527,8 +646,7 @@ fn cpm(
|
|
|
527
646
|
for col in 0..input_matrix.ncols() {
|
|
528
647
|
let norm_factor = column_sums[(0, col)];
|
|
529
648
|
for row in 0..input_matrix.nrows() {
|
|
530
|
-
output_matrix[(row, col)] =
|
|
531
|
-
(input_matrix[(row, col)] as f64 * 1000000.0) / norm_factor as f64;
|
|
649
|
+
output_matrix[(row, col)] = (input_matrix[(row, col)] as f64 * 1000000.0) / norm_factor as f64;
|
|
532
650
|
}
|
|
533
651
|
}
|
|
534
652
|
//println!("output_matrix:{:?}", output_matrix);
|
|
@@ -605,6 +723,14 @@ fn main() {
|
|
|
605
723
|
// eprintln!("Using default text file format (no .h5 extension found)");
|
|
606
724
|
}
|
|
607
725
|
|
|
726
|
+
// Determine if the H5 file is new format
|
|
727
|
+
let new_format: bool = match &json_string {
|
|
728
|
+
json::JsonValue::Object(ref obj) => {
|
|
729
|
+
obj.get("newformat").and_then(|v| v.as_bool()).map_or(false, |b| b)
|
|
730
|
+
}
|
|
731
|
+
_ => false,
|
|
732
|
+
};
|
|
733
|
+
|
|
608
734
|
let rank_type = &json_string["rank_type"] // Value provide must be either "var" or "iqr"
|
|
609
735
|
.to_owned()
|
|
610
736
|
.as_str()
|
|
@@ -691,15 +817,25 @@ fn main() {
|
|
|
691
817
|
// eprintln!("Reading data from {} file: {}", file_type, file_name);
|
|
692
818
|
let (input_matrix, gene_names) = if file_type == "hdf5" {
|
|
693
819
|
// eprintln!("Using HDF5 reader function...");
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
820
|
+
if new_format {
|
|
821
|
+
match input_data_hdf5_newformat(&file_name, &samples_list) {
|
|
822
|
+
Ok(result) => result,
|
|
823
|
+
Err(err) => {
|
|
824
|
+
eprintln!("ERROR in HDF5 new format reader: {:?}", err);
|
|
825
|
+
return;
|
|
826
|
+
}
|
|
698
827
|
}
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
828
|
+
} else {
|
|
829
|
+
match input_data_hdf5(&file_name, &samples_list) {
|
|
830
|
+
Ok(result) => {
|
|
831
|
+
// eprintln!("Successfully read HDF5 data");
|
|
832
|
+
result
|
|
833
|
+
}
|
|
834
|
+
Err(err) => {
|
|
835
|
+
eprintln!("ERROR in HDF5 reader: {:?}", err);
|
|
836
|
+
// Error has already been printed to stdout in JSON format by the function
|
|
837
|
+
return;
|
|
838
|
+
}
|
|
703
839
|
}
|
|
704
840
|
}
|
|
705
841
|
} else {
|