@sjcrh/proteinpaint-rust 2.84.0 → 2.99.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/DEanalysis.rs +76 -73
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.84.0",
|
|
2
|
+
"version": "2.99.0",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"description": "Rust-based utilities for proteinpaint",
|
|
5
5
|
"main": "index.js",
|
|
@@ -38,5 +38,5 @@
|
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"tape": "^5.2.2"
|
|
40
40
|
},
|
|
41
|
-
"pp_release_tag": "v2.84.0"
|
|
41
|
+
"pp_release_tag": "v2.99.0"
|
|
42
42
|
}
|
package/src/DEanalysis.rs
CHANGED
|
@@ -11,7 +11,7 @@ use nalgebra::base::Matrix;
|
|
|
11
11
|
use nalgebra::base::VecStorage;
|
|
12
12
|
use nalgebra::DMatrix;
|
|
13
13
|
use nalgebra::ViewStorage;
|
|
14
|
-
use ndarray::Array1;
|
|
14
|
+
//use ndarray::Array1;
|
|
15
15
|
use ndarray::Array2;
|
|
16
16
|
use ndarray::Dim;
|
|
17
17
|
use serde::{Deserialize, Serialize};
|
|
@@ -25,7 +25,7 @@ use std::io::Read;
|
|
|
25
25
|
use std::str::FromStr;
|
|
26
26
|
use std::sync::{Arc, Mutex}; // Multithreading library
|
|
27
27
|
use std::thread;
|
|
28
|
-
use std::time::Instant;
|
|
28
|
+
//use std::time::Instant;
|
|
29
29
|
//use std::cmp::Ordering;
|
|
30
30
|
//use std::env;
|
|
31
31
|
use std::io;
|
|
@@ -73,43 +73,45 @@ fn input_data_from_HDF5(
|
|
|
73
73
|
Vec<String>,
|
|
74
74
|
) {
|
|
75
75
|
let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
|
|
76
|
-
|
|
76
|
+
|
|
77
|
+
//let ds_dim = file.dataset("dims").unwrap(); // open the dataset
|
|
77
78
|
let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
|
|
78
79
|
let mut case_indexes: Vec<usize> = Vec::with_capacity(case_list.len());
|
|
79
80
|
let mut control_indexes: Vec<usize> = Vec::with_capacity(control_list.len());
|
|
80
81
|
// Check the data type and read the dataset accordingly
|
|
81
|
-
let data_dim: Array1<_> = ds_dim.read::<usize, Dim<[usize; 1]>>().unwrap();
|
|
82
|
-
let num_samples = data_dim[0]; // Number of total columns in the dataset
|
|
83
|
-
let num_genes = data_dim[1]; // Number of total rows in the dataset
|
|
84
|
-
|
|
85
|
-
println!("
|
|
82
|
+
//let data_dim: Array1<_> = ds_dim.read::<usize, Dim<[usize; 1]>>().unwrap();
|
|
83
|
+
//let num_samples = data_dim[0]; // Number of total columns in the dataset
|
|
84
|
+
//let num_genes = data_dim[1]; // Number of total rows in the dataset
|
|
85
|
+
|
|
86
|
+
//println!("num_samples bulk:{}", num_samples);
|
|
87
|
+
//println!("num_genes bulk:{}", num_genes);
|
|
86
88
|
|
|
87
|
-
let now_gene_names = Instant::now();
|
|
89
|
+
//let now_gene_names = Instant::now();
|
|
88
90
|
let ds_gene_names = file.dataset("gene_names").unwrap();
|
|
89
|
-
println!("ds_gene_names:{:?}", ds_gene_names);
|
|
91
|
+
//println!("ds_gene_names:{:?}", ds_gene_names);
|
|
90
92
|
let gene_names = ds_gene_names
|
|
91
93
|
.read::<VarLenAscii, Dim<[usize; 1]>>()
|
|
92
94
|
.unwrap();
|
|
93
|
-
println!("\tgene_names = {:?}", gene_names);
|
|
94
|
-
println!("\tgene_names.shape() = {:?}", gene_names.shape());
|
|
95
|
-
println!("\tgene_names.strides() = {:?}", gene_names.strides());
|
|
96
|
-
println!("\tgene_names.ndim() = {:?}", gene_names.ndim());
|
|
97
|
-
println!("Time for parsing gene names:{:?}", now_gene_names.elapsed());
|
|
95
|
+
//println!("\tgene_names = {:?}", gene_names);
|
|
96
|
+
//println!("\tgene_names.shape() = {:?}", gene_names.shape());
|
|
97
|
+
//println!("\tgene_names.strides() = {:?}", gene_names.strides());
|
|
98
|
+
//println!("\tgene_names.ndim() = {:?}", gene_names.ndim());
|
|
99
|
+
//println!("Time for parsing gene names:{:?}", now_gene_names.elapsed());
|
|
98
100
|
|
|
99
|
-
let now_gene_symbols = Instant::now();
|
|
101
|
+
//let now_gene_symbols = Instant::now();
|
|
100
102
|
let ds_gene_symbols = file.dataset("gene_symbols").unwrap();
|
|
101
|
-
println!("ds_gene_symbols:{:?}", ds_gene_symbols);
|
|
103
|
+
//println!("ds_gene_symbols:{:?}", ds_gene_symbols);
|
|
102
104
|
let gene_symbols = ds_gene_symbols
|
|
103
105
|
.read::<VarLenAscii, Dim<[usize; 1]>>()
|
|
104
106
|
.unwrap();
|
|
105
|
-
println!("\tgene_symbols = {:?}", gene_symbols);
|
|
106
|
-
println!("\tgene_symbols.shape() = {:?}", gene_symbols.shape());
|
|
107
|
-
println!("\tgene_symbols.strides() = {:?}", gene_symbols.strides());
|
|
108
|
-
println!("\tgene_symbols.ndim() = {:?}", gene_symbols.ndim());
|
|
109
|
-
println!(
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
);
|
|
107
|
+
//println!("\tgene_symbols = {:?}", gene_symbols);
|
|
108
|
+
//println!("\tgene_symbols.shape() = {:?}", gene_symbols.shape());
|
|
109
|
+
//println!("\tgene_symbols.strides() = {:?}", gene_symbols.strides());
|
|
110
|
+
//println!("\tgene_symbols.ndim() = {:?}", gene_symbols.ndim());
|
|
111
|
+
//println!(
|
|
112
|
+
// "Time for parsing gene symbols:{:?}",
|
|
113
|
+
// now_gene_symbols.elapsed()
|
|
114
|
+
//);
|
|
113
115
|
|
|
114
116
|
let mut gene_names_string: Vec<String> = Vec::with_capacity(gene_names.len());
|
|
115
117
|
let mut gene_symbols_string: Vec<String> = Vec::with_capacity(gene_symbols.len());
|
|
@@ -118,17 +120,17 @@ fn input_data_from_HDF5(
|
|
|
118
120
|
gene_symbols_string.push(gene_symbols[i].to_string());
|
|
119
121
|
}
|
|
120
122
|
|
|
121
|
-
let now_samples = Instant::now();
|
|
123
|
+
//let now_samples = Instant::now();
|
|
122
124
|
let ds_samples = file.dataset("samples").unwrap();
|
|
123
125
|
let samples = ds_samples.read::<VarLenAscii, Dim<[usize; 1]>>().unwrap();
|
|
124
|
-
println!("\tsamples = {:?}", samples);
|
|
125
|
-
println!("\tsamples.shape() = {:?}", samples.shape());
|
|
126
|
-
println!("\tsamples.strides() = {:?}", samples.strides());
|
|
127
|
-
println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
128
|
-
println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
126
|
+
//println!("\tsamples = {:?}", samples);
|
|
127
|
+
//println!("\tsamples.shape() = {:?}", samples.shape());
|
|
128
|
+
//println!("\tsamples.strides() = {:?}", samples.strides());
|
|
129
|
+
//println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
130
|
+
//println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
129
131
|
|
|
130
132
|
//Find all columns values that are populated for the given gene
|
|
131
|
-
let now_counts = Instant::now();
|
|
133
|
+
//let now_counts = Instant::now();
|
|
132
134
|
let ds_counts = file.dataset("counts").unwrap(); // open the dataset
|
|
133
135
|
|
|
134
136
|
let mut global_sample_index = 0;
|
|
@@ -189,7 +191,7 @@ fn input_data_from_HDF5(
|
|
|
189
191
|
global_sample_index += 1;
|
|
190
192
|
}
|
|
191
193
|
|
|
192
|
-
println!("Time for parsing HDF5 data:{:?}", now_counts.elapsed());
|
|
194
|
+
//println!("Time for parsing HDF5 data:{:?}", now_counts.elapsed());
|
|
193
195
|
//println!(
|
|
194
196
|
// "case + control length:{}",
|
|
195
197
|
// case_list.len() + control_list.len()
|
|
@@ -221,7 +223,7 @@ fn input_data_from_text(
|
|
|
221
223
|
Vec<String>,
|
|
222
224
|
Vec<String>,
|
|
223
225
|
) {
|
|
224
|
-
let input_time = Instant::now();
|
|
226
|
+
//let input_time = Instant::now();
|
|
225
227
|
let mut file = File::open(filename).unwrap();
|
|
226
228
|
let mut num_lines: usize = 0;
|
|
227
229
|
let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
|
|
@@ -350,7 +352,7 @@ fn input_data_from_text(
|
|
|
350
352
|
let genes_symbols_temp = Arc::new(Mutex::new(Vec::<String>::new()));
|
|
351
353
|
let input_vector_temp = Arc::new(Mutex::new(Vec::<f64>::new()));
|
|
352
354
|
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
353
|
-
|
|
355
|
+
//println!("Number of threads used:{}", max_threads);
|
|
354
356
|
for thread_num in 0..max_threads {
|
|
355
357
|
let case_indexes_original = Arc::clone(&case_indexes_original);
|
|
356
358
|
let control_indexes_original = Arc::clone(&control_indexes_original);
|
|
@@ -485,7 +487,7 @@ fn input_data_from_text(
|
|
|
485
487
|
//println!("num_columns:{}", num_columns);
|
|
486
488
|
//println!("num_lines * num_columns:{}", num_lines * num_columns);
|
|
487
489
|
//println!("input_vector:{:?}", input_vector.len());
|
|
488
|
-
println!("Time for inputting data:{:?}", input_time.elapsed());
|
|
490
|
+
//println!("Time for inputting data:{:?}", input_time.elapsed());
|
|
489
491
|
let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
|
|
490
492
|
//println!("dm:{:?}", dm);
|
|
491
493
|
(dm, case_indexes, control_indexes, gene_names, gene_symbols)
|
|
@@ -513,14 +515,15 @@ struct PValueIndexes {
|
|
|
513
515
|
// Used to get the sample names from HDF5 file at PP server startup
|
|
514
516
|
fn get_DE_samples(hdf5_filename: &String) {
|
|
515
517
|
let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
|
|
516
|
-
|
|
518
|
+
|
|
519
|
+
//let now_samples = Instant::now();
|
|
517
520
|
let ds_samples = file.dataset("samples").unwrap();
|
|
518
521
|
let samples = ds_samples.read::<VarLenAscii, Dim<[usize; 1]>>().unwrap();
|
|
519
|
-
println!("\tsamples = {:?}", samples);
|
|
520
|
-
println!("\tsamples.shape() = {:?}", samples.shape());
|
|
521
|
-
println!("\tsamples.strides() = {:?}", samples.strides());
|
|
522
|
-
println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
523
|
-
println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
522
|
+
//println!("\tsamples = {:?}", samples);
|
|
523
|
+
//println!("\tsamples.shape() = {:?}", samples.shape());
|
|
524
|
+
//println!("\tsamples.strides() = {:?}", samples.strides());
|
|
525
|
+
//println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
526
|
+
//println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
524
527
|
|
|
525
528
|
let mut output_string = "".to_string();
|
|
526
529
|
for i in 0..samples.len() {
|
|
@@ -544,7 +547,7 @@ fn get_DE_samples(hdf5_filename: &String) {
|
|
|
544
547
|
output_string += &",";
|
|
545
548
|
}
|
|
546
549
|
}
|
|
547
|
-
println!("
|
|
550
|
+
println!("{}", output_string);
|
|
548
551
|
}
|
|
549
552
|
|
|
550
553
|
fn main() {
|
|
@@ -559,7 +562,7 @@ fn main() {
|
|
|
559
562
|
let input_json = json::parse(&input);
|
|
560
563
|
match input_json {
|
|
561
564
|
Ok(json_string) => {
|
|
562
|
-
let now = Instant::now();
|
|
565
|
+
//let now = Instant::now();
|
|
563
566
|
let file_name = &json_string["input_file"]
|
|
564
567
|
.to_owned()
|
|
565
568
|
.as_str()
|
|
@@ -567,7 +570,7 @@ fn main() {
|
|
|
567
570
|
.to_string()
|
|
568
571
|
.split(",")
|
|
569
572
|
.collect();
|
|
570
|
-
println!("file_name:{}", file_name);
|
|
573
|
+
//println!("file_name:{}", file_name);
|
|
571
574
|
let data_type_option = json_string["data_type"].as_str().to_owned();
|
|
572
575
|
match data_type_option {
|
|
573
576
|
Some(x) => {
|
|
@@ -643,7 +646,7 @@ fn main() {
|
|
|
643
646
|
gene_symbols,
|
|
644
647
|
) = input_data_from_HDF5(file_name, &case_list, &control_list);
|
|
645
648
|
}
|
|
646
|
-
let filtering_time = Instant::now();
|
|
649
|
+
//let filtering_time = Instant::now();
|
|
647
650
|
let (
|
|
648
651
|
filtered_matrix,
|
|
649
652
|
lib_sizes,
|
|
@@ -658,21 +661,21 @@ fn main() {
|
|
|
658
661
|
gene_names,
|
|
659
662
|
gene_symbols,
|
|
660
663
|
);
|
|
661
|
-
println!("filtering time:{:?}", filtering_time.elapsed());
|
|
664
|
+
//println!("filtering time:{:?}", filtering_time.elapsed());
|
|
662
665
|
//println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
|
|
663
666
|
//println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
|
|
664
|
-
let cpm_normalization_time = Instant::now();
|
|
667
|
+
//let cpm_normalization_time = Instant::now();
|
|
665
668
|
let mut normalized_matrix = cpm(&filtered_matrix);
|
|
666
|
-
println!(
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
);
|
|
670
|
-
let tmm_normalization_time = Instant::now();
|
|
669
|
+
//println!(
|
|
670
|
+
// "cpm normalization time:{:?}",
|
|
671
|
+
// cpm_normalization_time.elapsed()
|
|
672
|
+
//);
|
|
673
|
+
//let tmm_normalization_time = Instant::now();
|
|
671
674
|
let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
|
|
672
|
-
println!(
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
);
|
|
675
|
+
//println!(
|
|
676
|
+
// "tmm normalization time:{:?}",
|
|
677
|
+
// tmm_normalization_time.elapsed()
|
|
678
|
+
//);
|
|
676
679
|
//println!("norm_factors:{:?}", norm_factors);
|
|
677
680
|
|
|
678
681
|
for col in 0..normalized_matrix.ncols() {
|
|
@@ -683,19 +686,19 @@ fn main() {
|
|
|
683
686
|
}
|
|
684
687
|
}
|
|
685
688
|
//println!("normalized_matrix:{:?}", normalized_matrix);
|
|
686
|
-
println!("Number of cases:{}", case_list.len());
|
|
687
|
-
println!("Number of controls:{}", control_list.len());
|
|
688
|
-
println!("Time for pre-processing:{:?}", now.elapsed());
|
|
689
|
+
//println!("Number of cases:{}", case_list.len());
|
|
690
|
+
//println!("Number of controls:{}", control_list.len());
|
|
691
|
+
//println!("Time for pre-processing:{:?}", now.elapsed());
|
|
689
692
|
// Using Wilcoxon test for differential gene expression
|
|
690
693
|
|
|
691
|
-
let now2 = Instant::now();
|
|
694
|
+
//let now2 = Instant::now();
|
|
692
695
|
let mut p_values: Vec<PValueIndexes> =
|
|
693
696
|
Vec::with_capacity(normalized_matrix.nrows());
|
|
694
697
|
const THRESHOLD: usize = 50; // This determines whether the Wilcoxon exact test or the normal test will be used based on sample size.
|
|
695
698
|
|
|
696
699
|
//println!("case_indexes:{:?}", case_indexes);
|
|
697
700
|
//println!("control_indexes:{:?}", control_indexes);
|
|
698
|
-
let num_normalized_rows = normalized_matrix.nrows();
|
|
701
|
+
//let num_normalized_rows = normalized_matrix.nrows();
|
|
699
702
|
if normalized_matrix.nrows() * normalized_matrix.ncols()
|
|
700
703
|
< PAR_CUTOFF
|
|
701
704
|
{
|
|
@@ -857,13 +860,13 @@ fn main() {
|
|
|
857
860
|
p_values.append(&mut *p_values_temp.lock().unwrap());
|
|
858
861
|
}
|
|
859
862
|
//println!("p_values:{:?}", p_values);
|
|
860
|
-
println!(
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
);
|
|
863
|
+
//println!(
|
|
864
|
+
// "Time for running {} wilcoxon tests:{:?}",
|
|
865
|
+
// num_normalized_rows,
|
|
866
|
+
// now2.elapsed()
|
|
867
|
+
//);
|
|
865
868
|
let adjusted_p_values = adjust_p_values(p_values);
|
|
866
|
-
println!("
|
|
869
|
+
println!("{}", adjusted_p_values);
|
|
867
870
|
//let fold_changes =
|
|
868
871
|
// calculate_fold_change(normalized_matrix, case_indexes, control_indexes);
|
|
869
872
|
}
|
|
@@ -873,10 +876,10 @@ fn main() {
|
|
|
873
876
|
}
|
|
874
877
|
}
|
|
875
878
|
}
|
|
876
|
-
Err(error) =>
|
|
879
|
+
Err(error) => panic!("Incorrect json: {}", error),
|
|
877
880
|
}
|
|
878
881
|
}
|
|
879
|
-
Err(error) =>
|
|
882
|
+
Err(error) => panic!("Piping error: {}", error),
|
|
880
883
|
}
|
|
881
884
|
}
|
|
882
885
|
|
|
@@ -1321,7 +1324,7 @@ fn filter_by_expr(
|
|
|
1321
1324
|
positives.push(row);
|
|
1322
1325
|
}
|
|
1323
1326
|
}
|
|
1324
|
-
println!("positives length:{}", positives.len());
|
|
1327
|
+
//println!("positives length:{}", positives.len());
|
|
1325
1328
|
//println!("row_sums:{:?}", row_sums);
|
|
1326
1329
|
//println!("keep_cpm:{:?}", keep_cpm);
|
|
1327
1330
|
//println!("positive_cpm:{}", positive_cpm);
|
|
@@ -1337,8 +1340,8 @@ fn filter_by_expr(
|
|
|
1337
1340
|
let mut filtered_genes: Vec<String> = Vec::with_capacity(positives.len());
|
|
1338
1341
|
let mut filtered_gene_symbols: Vec<String> = Vec::with_capacity(positives.len());
|
|
1339
1342
|
let mut i = 0;
|
|
1340
|
-
println!("filtered_matrix rows:{}", filtered_matrix.nrows());
|
|
1341
|
-
println!("filtered_matrix cols:{}", filtered_matrix.ncols());
|
|
1343
|
+
//println!("filtered_matrix rows:{}", filtered_matrix.nrows());
|
|
1344
|
+
//println!("filtered_matrix cols:{}", filtered_matrix.ncols());
|
|
1342
1345
|
for index in positives {
|
|
1343
1346
|
let row = raw_data.row(index);
|
|
1344
1347
|
filtered_genes.push(gene_names[index].to_owned());
|