@sjcrh/proteinpaint-rust 2.81.5 → 2.99.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/DEanalysis.rs +80 -76
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.81.5",
|
|
2
|
+
"version": "2.99.0",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"description": "Rust-based utilities for proteinpaint",
|
|
5
5
|
"main": "index.js",
|
|
@@ -38,5 +38,5 @@
|
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"tape": "^5.2.2"
|
|
40
40
|
},
|
|
41
|
-
"pp_release_tag": "v2.81.5"
|
|
41
|
+
"pp_release_tag": "v2.99.0"
|
|
42
42
|
}
|
package/src/DEanalysis.rs
CHANGED
|
@@ -11,7 +11,7 @@ use nalgebra::base::Matrix;
|
|
|
11
11
|
use nalgebra::base::VecStorage;
|
|
12
12
|
use nalgebra::DMatrix;
|
|
13
13
|
use nalgebra::ViewStorage;
|
|
14
|
-
use ndarray::Array1;
|
|
14
|
+
//use ndarray::Array1;
|
|
15
15
|
use ndarray::Array2;
|
|
16
16
|
use ndarray::Dim;
|
|
17
17
|
use serde::{Deserialize, Serialize};
|
|
@@ -25,7 +25,7 @@ use std::io::Read;
|
|
|
25
25
|
use std::str::FromStr;
|
|
26
26
|
use std::sync::{Arc, Mutex}; // Multithreading library
|
|
27
27
|
use std::thread;
|
|
28
|
-
use std::time::Instant;
|
|
28
|
+
//use std::time::Instant;
|
|
29
29
|
//use std::cmp::Ordering;
|
|
30
30
|
//use std::env;
|
|
31
31
|
use std::io;
|
|
@@ -73,43 +73,45 @@ fn input_data_from_HDF5(
|
|
|
73
73
|
Vec<String>,
|
|
74
74
|
) {
|
|
75
75
|
let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
|
|
76
|
-
|
|
76
|
+
|
|
77
|
+
//let ds_dim = file.dataset("dims").unwrap(); // open the dataset
|
|
77
78
|
let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
|
|
78
79
|
let mut case_indexes: Vec<usize> = Vec::with_capacity(case_list.len());
|
|
79
80
|
let mut control_indexes: Vec<usize> = Vec::with_capacity(control_list.len());
|
|
80
81
|
// Check the data type and read the dataset accordingly
|
|
81
|
-
let data_dim: Array1<_> = ds_dim.read::<usize, Dim<[usize; 1]>>().unwrap();
|
|
82
|
-
let num_samples = data_dim[0]; // Number of total columns in the dataset
|
|
83
|
-
let num_genes = data_dim[1]; // Number of total rows in the dataset
|
|
84
|
-
|
|
85
|
-
println!("
|
|
82
|
+
//let data_dim: Array1<_> = ds_dim.read::<usize, Dim<[usize; 1]>>().unwrap();
|
|
83
|
+
//let num_samples = data_dim[0]; // Number of total columns in the dataset
|
|
84
|
+
//let num_genes = data_dim[1]; // Number of total rows in the dataset
|
|
85
|
+
|
|
86
|
+
//println!("num_samples bulk:{}", num_samples);
|
|
87
|
+
//println!("num_genes bulk:{}", num_genes);
|
|
86
88
|
|
|
87
|
-
let now_gene_names = Instant::now();
|
|
89
|
+
//let now_gene_names = Instant::now();
|
|
88
90
|
let ds_gene_names = file.dataset("gene_names").unwrap();
|
|
89
|
-
println!("ds_gene_names:{:?}", ds_gene_names);
|
|
91
|
+
//println!("ds_gene_names:{:?}", ds_gene_names);
|
|
90
92
|
let gene_names = ds_gene_names
|
|
91
93
|
.read::<VarLenAscii, Dim<[usize; 1]>>()
|
|
92
94
|
.unwrap();
|
|
93
|
-
println!("\tgene_names = {:?}", gene_names);
|
|
94
|
-
println!("\tgene_names.shape() = {:?}", gene_names.shape());
|
|
95
|
-
println!("\tgene_names.strides() = {:?}", gene_names.strides());
|
|
96
|
-
println!("\tgene_names.ndim() = {:?}", gene_names.ndim());
|
|
97
|
-
println!("Time for parsing gene names:{:?}", now_gene_names.elapsed());
|
|
95
|
+
//println!("\tgene_names = {:?}", gene_names);
|
|
96
|
+
//println!("\tgene_names.shape() = {:?}", gene_names.shape());
|
|
97
|
+
//println!("\tgene_names.strides() = {:?}", gene_names.strides());
|
|
98
|
+
//println!("\tgene_names.ndim() = {:?}", gene_names.ndim());
|
|
99
|
+
//println!("Time for parsing gene names:{:?}", now_gene_names.elapsed());
|
|
98
100
|
|
|
99
|
-
let now_gene_symbols = Instant::now();
|
|
101
|
+
//let now_gene_symbols = Instant::now();
|
|
100
102
|
let ds_gene_symbols = file.dataset("gene_symbols").unwrap();
|
|
101
|
-
println!("ds_gene_symbols:{:?}", ds_gene_symbols);
|
|
103
|
+
//println!("ds_gene_symbols:{:?}", ds_gene_symbols);
|
|
102
104
|
let gene_symbols = ds_gene_symbols
|
|
103
105
|
.read::<VarLenAscii, Dim<[usize; 1]>>()
|
|
104
106
|
.unwrap();
|
|
105
|
-
println!("\tgene_symbols = {:?}", gene_symbols);
|
|
106
|
-
println!("\tgene_symbols.shape() = {:?}", gene_symbols.shape());
|
|
107
|
-
println!("\tgene_symbols.strides() = {:?}", gene_symbols.strides());
|
|
108
|
-
println!("\tgene_symbols.ndim() = {:?}", gene_symbols.ndim());
|
|
109
|
-
println!(
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
);
|
|
107
|
+
//println!("\tgene_symbols = {:?}", gene_symbols);
|
|
108
|
+
//println!("\tgene_symbols.shape() = {:?}", gene_symbols.shape());
|
|
109
|
+
//println!("\tgene_symbols.strides() = {:?}", gene_symbols.strides());
|
|
110
|
+
//println!("\tgene_symbols.ndim() = {:?}", gene_symbols.ndim());
|
|
111
|
+
//println!(
|
|
112
|
+
// "Time for parsing gene symbols:{:?}",
|
|
113
|
+
// now_gene_symbols.elapsed()
|
|
114
|
+
//);
|
|
113
115
|
|
|
114
116
|
let mut gene_names_string: Vec<String> = Vec::with_capacity(gene_names.len());
|
|
115
117
|
let mut gene_symbols_string: Vec<String> = Vec::with_capacity(gene_symbols.len());
|
|
@@ -118,17 +120,17 @@ fn input_data_from_HDF5(
|
|
|
118
120
|
gene_symbols_string.push(gene_symbols[i].to_string());
|
|
119
121
|
}
|
|
120
122
|
|
|
121
|
-
let now_samples = Instant::now();
|
|
123
|
+
//let now_samples = Instant::now();
|
|
122
124
|
let ds_samples = file.dataset("samples").unwrap();
|
|
123
125
|
let samples = ds_samples.read::<VarLenAscii, Dim<[usize; 1]>>().unwrap();
|
|
124
|
-
println!("\tsamples = {:?}", samples);
|
|
125
|
-
println!("\tsamples.shape() = {:?}", samples.shape());
|
|
126
|
-
println!("\tsamples.strides() = {:?}", samples.strides());
|
|
127
|
-
println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
128
|
-
println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
126
|
+
//println!("\tsamples = {:?}", samples);
|
|
127
|
+
//println!("\tsamples.shape() = {:?}", samples.shape());
|
|
128
|
+
//println!("\tsamples.strides() = {:?}", samples.strides());
|
|
129
|
+
//println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
130
|
+
//println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
129
131
|
|
|
130
132
|
//Find all columns values that are populated for the given gene
|
|
131
|
-
let now_counts = Instant::now();
|
|
133
|
+
//let now_counts = Instant::now();
|
|
132
134
|
let ds_counts = file.dataset("counts").unwrap(); // open the dataset
|
|
133
135
|
|
|
134
136
|
let mut global_sample_index = 0;
|
|
@@ -189,7 +191,7 @@ fn input_data_from_HDF5(
|
|
|
189
191
|
global_sample_index += 1;
|
|
190
192
|
}
|
|
191
193
|
|
|
192
|
-
println!("Time for parsing HDF5 data:{:?}", now_counts.elapsed());
|
|
194
|
+
//println!("Time for parsing HDF5 data:{:?}", now_counts.elapsed());
|
|
193
195
|
//println!(
|
|
194
196
|
// "case + control length:{}",
|
|
195
197
|
// case_list.len() + control_list.len()
|
|
@@ -221,7 +223,7 @@ fn input_data_from_text(
|
|
|
221
223
|
Vec<String>,
|
|
222
224
|
Vec<String>,
|
|
223
225
|
) {
|
|
224
|
-
let input_time = Instant::now();
|
|
226
|
+
//let input_time = Instant::now();
|
|
225
227
|
let mut file = File::open(filename).unwrap();
|
|
226
228
|
let mut num_lines: usize = 0;
|
|
227
229
|
let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
|
|
@@ -235,7 +237,8 @@ fn input_data_from_text(
|
|
|
235
237
|
// Check headers for samples
|
|
236
238
|
let lines: Vec<&str> = buffer.split('\n').collect::<Vec<&str>>();
|
|
237
239
|
let total_lines = lines.len();
|
|
238
|
-
let headers: Vec<&str> = lines[0].split('\t').collect::<Vec<&str>>();
|
|
240
|
+
let header_binding = lines[0].replace("\r", "");
|
|
241
|
+
let headers: Vec<&str> = header_binding.split('\t').collect::<Vec<&str>>();
|
|
239
242
|
//println!("headers:{:?}", headers);
|
|
240
243
|
let mut case_indexes_original: Vec<usize> = Vec::with_capacity(case_list.len());
|
|
241
244
|
let mut control_indexes_original: Vec<usize> = Vec::with_capacity(control_list.len());
|
|
@@ -282,7 +285,7 @@ fn input_data_from_text(
|
|
|
282
285
|
let lines_slice = &lines[..];
|
|
283
286
|
for line_iter in 1..lines_slice.len() - 1 {
|
|
284
287
|
// Subtracting 1 from total length of lines_slice because the last one will be empty
|
|
285
|
-
let line = lines_slice[line_iter];
|
|
288
|
+
let line = lines_slice[line_iter].replace("\r", "");
|
|
286
289
|
let mut index = 0;
|
|
287
290
|
for field in line.split('\t').collect::<Vec<&str>>() {
|
|
288
291
|
if index == gene_name_index.unwrap() {
|
|
@@ -349,7 +352,7 @@ fn input_data_from_text(
|
|
|
349
352
|
let genes_symbols_temp = Arc::new(Mutex::new(Vec::<String>::new()));
|
|
350
353
|
let input_vector_temp = Arc::new(Mutex::new(Vec::<f64>::new()));
|
|
351
354
|
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
352
|
-
|
|
355
|
+
//println!("Number of threads used:{}", max_threads);
|
|
353
356
|
for thread_num in 0..max_threads {
|
|
354
357
|
let case_indexes_original = Arc::clone(&case_indexes_original);
|
|
355
358
|
let control_indexes_original = Arc::clone(&control_indexes_original);
|
|
@@ -377,7 +380,7 @@ fn input_data_from_text(
|
|
|
377
380
|
if remainder == thread_num {
|
|
378
381
|
//println!("buffer:{}", buffer);
|
|
379
382
|
// Thread analyzing a particular line must have the same remainder as the thread_num, this avoids multiple threads from parsing the same line
|
|
380
|
-
let line = lines[line_iter];
|
|
383
|
+
let line = lines[line_iter].replace("\r", "");
|
|
381
384
|
let mut index = 0;
|
|
382
385
|
for field in line.split('\t').collect::<Vec<&str>>() {
|
|
383
386
|
if index == gene_name_index.unwrap() {
|
|
@@ -484,7 +487,7 @@ fn input_data_from_text(
|
|
|
484
487
|
//println!("num_columns:{}", num_columns);
|
|
485
488
|
//println!("num_lines * num_columns:{}", num_lines * num_columns);
|
|
486
489
|
//println!("input_vector:{:?}", input_vector.len());
|
|
487
|
-
println!("Time for inputting data:{:?}", input_time.elapsed());
|
|
490
|
+
//println!("Time for inputting data:{:?}", input_time.elapsed());
|
|
488
491
|
let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
|
|
489
492
|
//println!("dm:{:?}", dm);
|
|
490
493
|
(dm, case_indexes, control_indexes, gene_names, gene_symbols)
|
|
@@ -512,14 +515,15 @@ struct PValueIndexes {
|
|
|
512
515
|
// Used to get the sample names from HDF5 file at PP server startup
|
|
513
516
|
fn get_DE_samples(hdf5_filename: &String) {
|
|
514
517
|
let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
|
|
515
|
-
|
|
518
|
+
|
|
519
|
+
//let now_samples = Instant::now();
|
|
516
520
|
let ds_samples = file.dataset("samples").unwrap();
|
|
517
521
|
let samples = ds_samples.read::<VarLenAscii, Dim<[usize; 1]>>().unwrap();
|
|
518
|
-
println!("\tsamples = {:?}", samples);
|
|
519
|
-
println!("\tsamples.shape() = {:?}", samples.shape());
|
|
520
|
-
println!("\tsamples.strides() = {:?}", samples.strides());
|
|
521
|
-
println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
522
|
-
println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
522
|
+
//println!("\tsamples = {:?}", samples);
|
|
523
|
+
//println!("\tsamples.shape() = {:?}", samples.shape());
|
|
524
|
+
//println!("\tsamples.strides() = {:?}", samples.strides());
|
|
525
|
+
//println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
526
|
+
//println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
523
527
|
|
|
524
528
|
let mut output_string = "".to_string();
|
|
525
529
|
for i in 0..samples.len() {
|
|
@@ -543,7 +547,7 @@ fn get_DE_samples(hdf5_filename: &String) {
|
|
|
543
547
|
output_string += &",";
|
|
544
548
|
}
|
|
545
549
|
}
|
|
546
|
-
println!("
|
|
550
|
+
println!("{}", output_string);
|
|
547
551
|
}
|
|
548
552
|
|
|
549
553
|
fn main() {
|
|
@@ -558,7 +562,7 @@ fn main() {
|
|
|
558
562
|
let input_json = json::parse(&input);
|
|
559
563
|
match input_json {
|
|
560
564
|
Ok(json_string) => {
|
|
561
|
-
let now = Instant::now();
|
|
565
|
+
//let now = Instant::now();
|
|
562
566
|
let file_name = &json_string["input_file"]
|
|
563
567
|
.to_owned()
|
|
564
568
|
.as_str()
|
|
@@ -566,7 +570,7 @@ fn main() {
|
|
|
566
570
|
.to_string()
|
|
567
571
|
.split(",")
|
|
568
572
|
.collect();
|
|
569
|
-
println!("file_name:{}", file_name);
|
|
573
|
+
//println!("file_name:{}", file_name);
|
|
570
574
|
let data_type_option = json_string["data_type"].as_str().to_owned();
|
|
571
575
|
match data_type_option {
|
|
572
576
|
Some(x) => {
|
|
@@ -642,7 +646,7 @@ fn main() {
|
|
|
642
646
|
gene_symbols,
|
|
643
647
|
) = input_data_from_HDF5(file_name, &case_list, &control_list);
|
|
644
648
|
}
|
|
645
|
-
let filtering_time = Instant::now();
|
|
649
|
+
//let filtering_time = Instant::now();
|
|
646
650
|
let (
|
|
647
651
|
filtered_matrix,
|
|
648
652
|
lib_sizes,
|
|
@@ -657,21 +661,21 @@ fn main() {
|
|
|
657
661
|
gene_names,
|
|
658
662
|
gene_symbols,
|
|
659
663
|
);
|
|
660
|
-
println!("filtering time:{:?}", filtering_time.elapsed());
|
|
664
|
+
//println!("filtering time:{:?}", filtering_time.elapsed());
|
|
661
665
|
//println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
|
|
662
666
|
//println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
|
|
663
|
-
let cpm_normalization_time = Instant::now();
|
|
667
|
+
//let cpm_normalization_time = Instant::now();
|
|
664
668
|
let mut normalized_matrix = cpm(&filtered_matrix);
|
|
665
|
-
println!(
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
);
|
|
669
|
-
let tmm_normalization_time = Instant::now();
|
|
669
|
+
//println!(
|
|
670
|
+
// "cpm normalization time:{:?}",
|
|
671
|
+
// cpm_normalization_time.elapsed()
|
|
672
|
+
//);
|
|
673
|
+
//let tmm_normalization_time = Instant::now();
|
|
670
674
|
let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
|
|
671
|
-
println!(
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
);
|
|
675
|
+
//println!(
|
|
676
|
+
// "tmm normalization time:{:?}",
|
|
677
|
+
// tmm_normalization_time.elapsed()
|
|
678
|
+
//);
|
|
675
679
|
//println!("norm_factors:{:?}", norm_factors);
|
|
676
680
|
|
|
677
681
|
for col in 0..normalized_matrix.ncols() {
|
|
@@ -682,19 +686,19 @@ fn main() {
|
|
|
682
686
|
}
|
|
683
687
|
}
|
|
684
688
|
//println!("normalized_matrix:{:?}", normalized_matrix);
|
|
685
|
-
println!("Number of cases:{}", case_list.len());
|
|
686
|
-
println!("Number of controls:{}", control_list.len());
|
|
687
|
-
println!("Time for pre-processing:{:?}", now.elapsed());
|
|
689
|
+
//println!("Number of cases:{}", case_list.len());
|
|
690
|
+
//println!("Number of controls:{}", control_list.len());
|
|
691
|
+
//println!("Time for pre-processing:{:?}", now.elapsed());
|
|
688
692
|
// Using Wilcoxon test for differential gene expression
|
|
689
693
|
|
|
690
|
-
let now2 = Instant::now();
|
|
694
|
+
//let now2 = Instant::now();
|
|
691
695
|
let mut p_values: Vec<PValueIndexes> =
|
|
692
696
|
Vec::with_capacity(normalized_matrix.nrows());
|
|
693
697
|
const THRESHOLD: usize = 50; // This determines whether the Wilcoxon exact test or the normal test will be used based on sample size.
|
|
694
698
|
|
|
695
699
|
//println!("case_indexes:{:?}", case_indexes);
|
|
696
700
|
//println!("control_indexes:{:?}", control_indexes);
|
|
697
|
-
let num_normalized_rows = normalized_matrix.nrows();
|
|
701
|
+
//let num_normalized_rows = normalized_matrix.nrows();
|
|
698
702
|
if normalized_matrix.nrows() * normalized_matrix.ncols()
|
|
699
703
|
< PAR_CUTOFF
|
|
700
704
|
{
|
|
@@ -856,13 +860,13 @@ fn main() {
|
|
|
856
860
|
p_values.append(&mut *p_values_temp.lock().unwrap());
|
|
857
861
|
}
|
|
858
862
|
//println!("p_values:{:?}", p_values);
|
|
859
|
-
println!(
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
);
|
|
863
|
+
//println!(
|
|
864
|
+
// "Time for running {} wilcoxon tests:{:?}",
|
|
865
|
+
// num_normalized_rows,
|
|
866
|
+
// now2.elapsed()
|
|
867
|
+
//);
|
|
864
868
|
let adjusted_p_values = adjust_p_values(p_values);
|
|
865
|
-
println!("
|
|
869
|
+
println!("{}", adjusted_p_values);
|
|
866
870
|
//let fold_changes =
|
|
867
871
|
// calculate_fold_change(normalized_matrix, case_indexes, control_indexes);
|
|
868
872
|
}
|
|
@@ -872,10 +876,10 @@ fn main() {
|
|
|
872
876
|
}
|
|
873
877
|
}
|
|
874
878
|
}
|
|
875
|
-
Err(error) =>
|
|
879
|
+
Err(error) => panic!("Incorrect json: {}", error),
|
|
876
880
|
}
|
|
877
881
|
}
|
|
878
|
-
Err(error) =>
|
|
882
|
+
Err(error) => panic!("Piping error: {}", error),
|
|
879
883
|
}
|
|
880
884
|
}
|
|
881
885
|
|
|
@@ -1320,7 +1324,7 @@ fn filter_by_expr(
|
|
|
1320
1324
|
positives.push(row);
|
|
1321
1325
|
}
|
|
1322
1326
|
}
|
|
1323
|
-
println!("positives length:{}", positives.len());
|
|
1327
|
+
//println!("positives length:{}", positives.len());
|
|
1324
1328
|
//println!("row_sums:{:?}", row_sums);
|
|
1325
1329
|
//println!("keep_cpm:{:?}", keep_cpm);
|
|
1326
1330
|
//println!("positive_cpm:{}", positive_cpm);
|
|
@@ -1336,8 +1340,8 @@ fn filter_by_expr(
|
|
|
1336
1340
|
let mut filtered_genes: Vec<String> = Vec::with_capacity(positives.len());
|
|
1337
1341
|
let mut filtered_gene_symbols: Vec<String> = Vec::with_capacity(positives.len());
|
|
1338
1342
|
let mut i = 0;
|
|
1339
|
-
println!("filtered_matrix rows:{}", filtered_matrix.nrows());
|
|
1340
|
-
println!("filtered_matrix cols:{}", filtered_matrix.ncols());
|
|
1343
|
+
//println!("filtered_matrix rows:{}", filtered_matrix.nrows());
|
|
1344
|
+
//println!("filtered_matrix cols:{}", filtered_matrix.ncols());
|
|
1341
1345
|
for index in positives {
|
|
1342
1346
|
let row = raw_data.row(index);
|
|
1343
1347
|
filtered_genes.push(gene_names[index].to_owned());
|