@sjcrh/proteinpaint-rust 2.84.0 → 2.110.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +25 -7
- package/package.json +2 -2
- package/src/DEanalysis.rs +76 -73
- package/src/gdcmaf.rs +102 -28
package/index.js
CHANGED
|
@@ -43,16 +43,22 @@ exports.run_rust = function (binfile, input_data) {
|
|
|
43
43
|
})
|
|
44
44
|
}
|
|
45
45
|
|
|
46
|
-
exports.run_rust_stream = function (binfile, input_data) {
|
|
46
|
+
exports.stream_rust = function (binfile, input_data, emitJson) {
|
|
47
47
|
const binpath = path.join(__dirname, '/target/release/', binfile)
|
|
48
48
|
const ps = spawn(binpath)
|
|
49
|
+
const stderr = []
|
|
49
50
|
try {
|
|
51
|
+
// from GDC API -> ps.stdin -> ps.stdout -> transformed stream
|
|
50
52
|
Readable.from(input_data).pipe(ps.stdin)
|
|
53
|
+
//reader.on('data', ps.stdout.pipe)
|
|
54
|
+
//reader.on('error', ps.stderr.pipe)
|
|
55
|
+
//return reader
|
|
51
56
|
} catch (error) {
|
|
52
57
|
ps.kill()
|
|
53
58
|
let errmsg = error
|
|
54
|
-
if (stderr.length) errmsg += `killed run_rust('${binfile}'), stderr: ${stderr.join('').trim()}`
|
|
55
|
-
reject(errmsg)
|
|
59
|
+
//if (stderr.length) errmsg += `killed run_rust('${binfile}'), stderr: ${stderr.join('').trim()}`
|
|
60
|
+
//reject(errmsg)
|
|
61
|
+
console.log(59, error)
|
|
56
62
|
}
|
|
57
63
|
|
|
58
64
|
const childStream = new Transform({
|
|
@@ -62,11 +68,23 @@ exports.run_rust_stream = function (binfile, input_data) {
|
|
|
62
68
|
}
|
|
63
69
|
})
|
|
64
70
|
ps.stdout.pipe(childStream)
|
|
65
|
-
|
|
66
|
-
|
|
71
|
+
ps.stderr.on('data', data => stderr.push(data))
|
|
72
|
+
ps.on('close', code => { //console.log(72, stderr.length)
|
|
73
|
+
if (stderr.length) {
|
|
74
|
+
// handle rust stderr
|
|
75
|
+
const err = stderr.join('').trim()
|
|
76
|
+
const errmsg = `!!! stream_rust('${binfile}') stderr: !!!\n${err}`
|
|
77
|
+
console.log(errmsg)
|
|
78
|
+
emitJson(err)
|
|
79
|
+
} else {
|
|
80
|
+
emitJson({ ok: true, status: 'ok', message: 'Processing complete' })
|
|
81
|
+
}
|
|
67
82
|
})
|
|
68
|
-
|
|
69
|
-
|
|
83
|
+
ps.on('error', err => {
|
|
84
|
+
console.log(74, `stream_rust().on('error')`, err)
|
|
85
|
+
emitJson(stderr.join('').trim())
|
|
70
86
|
})
|
|
87
|
+
// below will duplicate ps.on('close') event above
|
|
88
|
+
// childStream.on('end', () => console.log(`-- childStream done --`))
|
|
71
89
|
return childStream
|
|
72
90
|
}
|
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.84.0",
|
|
2
|
+
"version": "2.110.0",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"description": "Rust-based utilities for proteinpaint",
|
|
5
5
|
"main": "index.js",
|
|
@@ -38,5 +38,5 @@
|
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"tape": "^5.2.2"
|
|
40
40
|
},
|
|
41
|
-
"pp_release_tag": "v2.84.0"
|
|
41
|
+
"pp_release_tag": "v2.110.0"
|
|
42
42
|
}
|
package/src/DEanalysis.rs
CHANGED
|
@@ -11,7 +11,7 @@ use nalgebra::base::Matrix;
|
|
|
11
11
|
use nalgebra::base::VecStorage;
|
|
12
12
|
use nalgebra::DMatrix;
|
|
13
13
|
use nalgebra::ViewStorage;
|
|
14
|
-
use ndarray::Array1;
|
|
14
|
+
//use ndarray::Array1;
|
|
15
15
|
use ndarray::Array2;
|
|
16
16
|
use ndarray::Dim;
|
|
17
17
|
use serde::{Deserialize, Serialize};
|
|
@@ -25,7 +25,7 @@ use std::io::Read;
|
|
|
25
25
|
use std::str::FromStr;
|
|
26
26
|
use std::sync::{Arc, Mutex}; // Multithreading library
|
|
27
27
|
use std::thread;
|
|
28
|
-
use std::time::Instant;
|
|
28
|
+
//use std::time::Instant;
|
|
29
29
|
//use std::cmp::Ordering;
|
|
30
30
|
//use std::env;
|
|
31
31
|
use std::io;
|
|
@@ -73,43 +73,45 @@ fn input_data_from_HDF5(
|
|
|
73
73
|
Vec<String>,
|
|
74
74
|
) {
|
|
75
75
|
let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
|
|
76
|
-
|
|
76
|
+
|
|
77
|
+
//let ds_dim = file.dataset("dims").unwrap(); // open the dataset
|
|
77
78
|
let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
|
|
78
79
|
let mut case_indexes: Vec<usize> = Vec::with_capacity(case_list.len());
|
|
79
80
|
let mut control_indexes: Vec<usize> = Vec::with_capacity(control_list.len());
|
|
80
81
|
// Check the data type and read the dataset accordingly
|
|
81
|
-
let data_dim: Array1<_> = ds_dim.read::<usize, Dim<[usize; 1]>>().unwrap();
|
|
82
|
-
let num_samples = data_dim[0]; // Number of total columns in the dataset
|
|
83
|
-
let num_genes = data_dim[1]; // Number of total rows in the dataset
|
|
84
|
-
|
|
85
|
-
println!("
|
|
82
|
+
//let data_dim: Array1<_> = ds_dim.read::<usize, Dim<[usize; 1]>>().unwrap();
|
|
83
|
+
//let num_samples = data_dim[0]; // Number of total columns in the dataset
|
|
84
|
+
//let num_genes = data_dim[1]; // Number of total rows in the dataset
|
|
85
|
+
|
|
86
|
+
//println!("num_samples bulk:{}", num_samples);
|
|
87
|
+
//println!("num_genes bulk:{}", num_genes);
|
|
86
88
|
|
|
87
|
-
let now_gene_names = Instant::now();
|
|
89
|
+
//let now_gene_names = Instant::now();
|
|
88
90
|
let ds_gene_names = file.dataset("gene_names").unwrap();
|
|
89
|
-
println!("ds_gene_names:{:?}", ds_gene_names);
|
|
91
|
+
//println!("ds_gene_names:{:?}", ds_gene_names);
|
|
90
92
|
let gene_names = ds_gene_names
|
|
91
93
|
.read::<VarLenAscii, Dim<[usize; 1]>>()
|
|
92
94
|
.unwrap();
|
|
93
|
-
println!("\tgene_names = {:?}", gene_names);
|
|
94
|
-
println!("\tgene_names.shape() = {:?}", gene_names.shape());
|
|
95
|
-
println!("\tgene_names.strides() = {:?}", gene_names.strides());
|
|
96
|
-
println!("\tgene_names.ndim() = {:?}", gene_names.ndim());
|
|
97
|
-
println!("Time for parsing gene names:{:?}", now_gene_names.elapsed());
|
|
95
|
+
//println!("\tgene_names = {:?}", gene_names);
|
|
96
|
+
//println!("\tgene_names.shape() = {:?}", gene_names.shape());
|
|
97
|
+
//println!("\tgene_names.strides() = {:?}", gene_names.strides());
|
|
98
|
+
//println!("\tgene_names.ndim() = {:?}", gene_names.ndim());
|
|
99
|
+
//println!("Time for parsing gene names:{:?}", now_gene_names.elapsed());
|
|
98
100
|
|
|
99
|
-
let now_gene_symbols = Instant::now();
|
|
101
|
+
//let now_gene_symbols = Instant::now();
|
|
100
102
|
let ds_gene_symbols = file.dataset("gene_symbols").unwrap();
|
|
101
|
-
println!("ds_gene_symbols:{:?}", ds_gene_symbols);
|
|
103
|
+
//println!("ds_gene_symbols:{:?}", ds_gene_symbols);
|
|
102
104
|
let gene_symbols = ds_gene_symbols
|
|
103
105
|
.read::<VarLenAscii, Dim<[usize; 1]>>()
|
|
104
106
|
.unwrap();
|
|
105
|
-
println!("\tgene_symbols = {:?}", gene_symbols);
|
|
106
|
-
println!("\tgene_symbols.shape() = {:?}", gene_symbols.shape());
|
|
107
|
-
println!("\tgene_symbols.strides() = {:?}", gene_symbols.strides());
|
|
108
|
-
println!("\tgene_symbols.ndim() = {:?}", gene_symbols.ndim());
|
|
109
|
-
println!(
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
);
|
|
107
|
+
//println!("\tgene_symbols = {:?}", gene_symbols);
|
|
108
|
+
//println!("\tgene_symbols.shape() = {:?}", gene_symbols.shape());
|
|
109
|
+
//println!("\tgene_symbols.strides() = {:?}", gene_symbols.strides());
|
|
110
|
+
//println!("\tgene_symbols.ndim() = {:?}", gene_symbols.ndim());
|
|
111
|
+
//println!(
|
|
112
|
+
// "Time for parsing gene symbols:{:?}",
|
|
113
|
+
// now_gene_symbols.elapsed()
|
|
114
|
+
//);
|
|
113
115
|
|
|
114
116
|
let mut gene_names_string: Vec<String> = Vec::with_capacity(gene_names.len());
|
|
115
117
|
let mut gene_symbols_string: Vec<String> = Vec::with_capacity(gene_symbols.len());
|
|
@@ -118,17 +120,17 @@ fn input_data_from_HDF5(
|
|
|
118
120
|
gene_symbols_string.push(gene_symbols[i].to_string());
|
|
119
121
|
}
|
|
120
122
|
|
|
121
|
-
let now_samples = Instant::now();
|
|
123
|
+
//let now_samples = Instant::now();
|
|
122
124
|
let ds_samples = file.dataset("samples").unwrap();
|
|
123
125
|
let samples = ds_samples.read::<VarLenAscii, Dim<[usize; 1]>>().unwrap();
|
|
124
|
-
println!("\tsamples = {:?}", samples);
|
|
125
|
-
println!("\tsamples.shape() = {:?}", samples.shape());
|
|
126
|
-
println!("\tsamples.strides() = {:?}", samples.strides());
|
|
127
|
-
println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
128
|
-
println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
126
|
+
//println!("\tsamples = {:?}", samples);
|
|
127
|
+
//println!("\tsamples.shape() = {:?}", samples.shape());
|
|
128
|
+
//println!("\tsamples.strides() = {:?}", samples.strides());
|
|
129
|
+
//println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
130
|
+
//println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
129
131
|
|
|
130
132
|
//Find all columns values that are populated for the given gene
|
|
131
|
-
let now_counts = Instant::now();
|
|
133
|
+
//let now_counts = Instant::now();
|
|
132
134
|
let ds_counts = file.dataset("counts").unwrap(); // open the dataset
|
|
133
135
|
|
|
134
136
|
let mut global_sample_index = 0;
|
|
@@ -189,7 +191,7 @@ fn input_data_from_HDF5(
|
|
|
189
191
|
global_sample_index += 1;
|
|
190
192
|
}
|
|
191
193
|
|
|
192
|
-
println!("Time for parsing HDF5 data:{:?}", now_counts.elapsed());
|
|
194
|
+
//println!("Time for parsing HDF5 data:{:?}", now_counts.elapsed());
|
|
193
195
|
//println!(
|
|
194
196
|
// "case + control length:{}",
|
|
195
197
|
// case_list.len() + control_list.len()
|
|
@@ -221,7 +223,7 @@ fn input_data_from_text(
|
|
|
221
223
|
Vec<String>,
|
|
222
224
|
Vec<String>,
|
|
223
225
|
) {
|
|
224
|
-
let input_time = Instant::now();
|
|
226
|
+
//let input_time = Instant::now();
|
|
225
227
|
let mut file = File::open(filename).unwrap();
|
|
226
228
|
let mut num_lines: usize = 0;
|
|
227
229
|
let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
|
|
@@ -350,7 +352,7 @@ fn input_data_from_text(
|
|
|
350
352
|
let genes_symbols_temp = Arc::new(Mutex::new(Vec::<String>::new()));
|
|
351
353
|
let input_vector_temp = Arc::new(Mutex::new(Vec::<f64>::new()));
|
|
352
354
|
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
353
|
-
|
|
355
|
+
//println!("Number of threads used:{}", max_threads);
|
|
354
356
|
for thread_num in 0..max_threads {
|
|
355
357
|
let case_indexes_original = Arc::clone(&case_indexes_original);
|
|
356
358
|
let control_indexes_original = Arc::clone(&control_indexes_original);
|
|
@@ -485,7 +487,7 @@ fn input_data_from_text(
|
|
|
485
487
|
//println!("num_columns:{}", num_columns);
|
|
486
488
|
//println!("num_lines * num_columns:{}", num_lines * num_columns);
|
|
487
489
|
//println!("input_vector:{:?}", input_vector.len());
|
|
488
|
-
println!("Time for inputting data:{:?}", input_time.elapsed());
|
|
490
|
+
//println!("Time for inputting data:{:?}", input_time.elapsed());
|
|
489
491
|
let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
|
|
490
492
|
//println!("dm:{:?}", dm);
|
|
491
493
|
(dm, case_indexes, control_indexes, gene_names, gene_symbols)
|
|
@@ -513,14 +515,15 @@ struct PValueIndexes {
|
|
|
513
515
|
// Used to get the sample names from HDF5 file at PP server startup
|
|
514
516
|
fn get_DE_samples(hdf5_filename: &String) {
|
|
515
517
|
let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
|
|
516
|
-
|
|
518
|
+
|
|
519
|
+
//let now_samples = Instant::now();
|
|
517
520
|
let ds_samples = file.dataset("samples").unwrap();
|
|
518
521
|
let samples = ds_samples.read::<VarLenAscii, Dim<[usize; 1]>>().unwrap();
|
|
519
|
-
println!("\tsamples = {:?}", samples);
|
|
520
|
-
println!("\tsamples.shape() = {:?}", samples.shape());
|
|
521
|
-
println!("\tsamples.strides() = {:?}", samples.strides());
|
|
522
|
-
println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
523
|
-
println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
522
|
+
//println!("\tsamples = {:?}", samples);
|
|
523
|
+
//println!("\tsamples.shape() = {:?}", samples.shape());
|
|
524
|
+
//println!("\tsamples.strides() = {:?}", samples.strides());
|
|
525
|
+
//println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
526
|
+
//println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
524
527
|
|
|
525
528
|
let mut output_string = "".to_string();
|
|
526
529
|
for i in 0..samples.len() {
|
|
@@ -544,7 +547,7 @@ fn get_DE_samples(hdf5_filename: &String) {
|
|
|
544
547
|
output_string += &",";
|
|
545
548
|
}
|
|
546
549
|
}
|
|
547
|
-
println!("
|
|
550
|
+
println!("{}", output_string);
|
|
548
551
|
}
|
|
549
552
|
|
|
550
553
|
fn main() {
|
|
@@ -559,7 +562,7 @@ fn main() {
|
|
|
559
562
|
let input_json = json::parse(&input);
|
|
560
563
|
match input_json {
|
|
561
564
|
Ok(json_string) => {
|
|
562
|
-
let now = Instant::now();
|
|
565
|
+
//let now = Instant::now();
|
|
563
566
|
let file_name = &json_string["input_file"]
|
|
564
567
|
.to_owned()
|
|
565
568
|
.as_str()
|
|
@@ -567,7 +570,7 @@ fn main() {
|
|
|
567
570
|
.to_string()
|
|
568
571
|
.split(",")
|
|
569
572
|
.collect();
|
|
570
|
-
println!("file_name:{}", file_name);
|
|
573
|
+
//println!("file_name:{}", file_name);
|
|
571
574
|
let data_type_option = json_string["data_type"].as_str().to_owned();
|
|
572
575
|
match data_type_option {
|
|
573
576
|
Some(x) => {
|
|
@@ -643,7 +646,7 @@ fn main() {
|
|
|
643
646
|
gene_symbols,
|
|
644
647
|
) = input_data_from_HDF5(file_name, &case_list, &control_list);
|
|
645
648
|
}
|
|
646
|
-
let filtering_time = Instant::now();
|
|
649
|
+
//let filtering_time = Instant::now();
|
|
647
650
|
let (
|
|
648
651
|
filtered_matrix,
|
|
649
652
|
lib_sizes,
|
|
@@ -658,21 +661,21 @@ fn main() {
|
|
|
658
661
|
gene_names,
|
|
659
662
|
gene_symbols,
|
|
660
663
|
);
|
|
661
|
-
println!("filtering time:{:?}", filtering_time.elapsed());
|
|
664
|
+
//println!("filtering time:{:?}", filtering_time.elapsed());
|
|
662
665
|
//println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
|
|
663
666
|
//println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
|
|
664
|
-
let cpm_normalization_time = Instant::now();
|
|
667
|
+
//let cpm_normalization_time = Instant::now();
|
|
665
668
|
let mut normalized_matrix = cpm(&filtered_matrix);
|
|
666
|
-
println!(
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
);
|
|
670
|
-
let tmm_normalization_time = Instant::now();
|
|
669
|
+
//println!(
|
|
670
|
+
// "cpm normalization time:{:?}",
|
|
671
|
+
// cpm_normalization_time.elapsed()
|
|
672
|
+
//);
|
|
673
|
+
//let tmm_normalization_time = Instant::now();
|
|
671
674
|
let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
|
|
672
|
-
println!(
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
);
|
|
675
|
+
//println!(
|
|
676
|
+
// "tmm normalization time:{:?}",
|
|
677
|
+
// tmm_normalization_time.elapsed()
|
|
678
|
+
//);
|
|
676
679
|
//println!("norm_factors:{:?}", norm_factors);
|
|
677
680
|
|
|
678
681
|
for col in 0..normalized_matrix.ncols() {
|
|
@@ -683,19 +686,19 @@ fn main() {
|
|
|
683
686
|
}
|
|
684
687
|
}
|
|
685
688
|
//println!("normalized_matrix:{:?}", normalized_matrix);
|
|
686
|
-
println!("Number of cases:{}", case_list.len());
|
|
687
|
-
println!("Number of controls:{}", control_list.len());
|
|
688
|
-
println!("Time for pre-processing:{:?}", now.elapsed());
|
|
689
|
+
//println!("Number of cases:{}", case_list.len());
|
|
690
|
+
//println!("Number of controls:{}", control_list.len());
|
|
691
|
+
//println!("Time for pre-processing:{:?}", now.elapsed());
|
|
689
692
|
// Using Wilcoxon test for differential gene expression
|
|
690
693
|
|
|
691
|
-
let now2 = Instant::now();
|
|
694
|
+
//let now2 = Instant::now();
|
|
692
695
|
let mut p_values: Vec<PValueIndexes> =
|
|
693
696
|
Vec::with_capacity(normalized_matrix.nrows());
|
|
694
697
|
const THRESHOLD: usize = 50; // This determines whether the Wilcoxon exact test or the normal test will be used based on sample size.
|
|
695
698
|
|
|
696
699
|
//println!("case_indexes:{:?}", case_indexes);
|
|
697
700
|
//println!("control_indexes:{:?}", control_indexes);
|
|
698
|
-
let num_normalized_rows = normalized_matrix.nrows();
|
|
701
|
+
//let num_normalized_rows = normalized_matrix.nrows();
|
|
699
702
|
if normalized_matrix.nrows() * normalized_matrix.ncols()
|
|
700
703
|
< PAR_CUTOFF
|
|
701
704
|
{
|
|
@@ -857,13 +860,13 @@ fn main() {
|
|
|
857
860
|
p_values.append(&mut *p_values_temp.lock().unwrap());
|
|
858
861
|
}
|
|
859
862
|
//println!("p_values:{:?}", p_values);
|
|
860
|
-
println!(
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
);
|
|
863
|
+
//println!(
|
|
864
|
+
// "Time for running {} wilcoxon tests:{:?}",
|
|
865
|
+
// num_normalized_rows,
|
|
866
|
+
// now2.elapsed()
|
|
867
|
+
//);
|
|
865
868
|
let adjusted_p_values = adjust_p_values(p_values);
|
|
866
|
-
println!("
|
|
869
|
+
println!("{}", adjusted_p_values);
|
|
867
870
|
//let fold_changes =
|
|
868
871
|
// calculate_fold_change(normalized_matrix, case_indexes, control_indexes);
|
|
869
872
|
}
|
|
@@ -873,10 +876,10 @@ fn main() {
|
|
|
873
876
|
}
|
|
874
877
|
}
|
|
875
878
|
}
|
|
876
|
-
Err(error) =>
|
|
879
|
+
Err(error) => panic!("Incorrect json: {}", error),
|
|
877
880
|
}
|
|
878
881
|
}
|
|
879
|
-
Err(error) =>
|
|
882
|
+
Err(error) => panic!("Piping error: {}", error),
|
|
880
883
|
}
|
|
881
884
|
}
|
|
882
885
|
|
|
@@ -1321,7 +1324,7 @@ fn filter_by_expr(
|
|
|
1321
1324
|
positives.push(row);
|
|
1322
1325
|
}
|
|
1323
1326
|
}
|
|
1324
|
-
println!("positives length:{}", positives.len());
|
|
1327
|
+
//println!("positives length:{}", positives.len());
|
|
1325
1328
|
//println!("row_sums:{:?}", row_sums);
|
|
1326
1329
|
//println!("keep_cpm:{:?}", keep_cpm);
|
|
1327
1330
|
//println!("positive_cpm:{}", positive_cpm);
|
|
@@ -1337,8 +1340,8 @@ fn filter_by_expr(
|
|
|
1337
1340
|
let mut filtered_genes: Vec<String> = Vec::with_capacity(positives.len());
|
|
1338
1341
|
let mut filtered_gene_symbols: Vec<String> = Vec::with_capacity(positives.len());
|
|
1339
1342
|
let mut i = 0;
|
|
1340
|
-
println!("filtered_matrix rows:{}", filtered_matrix.nrows());
|
|
1341
|
-
println!("filtered_matrix cols:{}", filtered_matrix.ncols());
|
|
1343
|
+
//println!("filtered_matrix rows:{}", filtered_matrix.nrows());
|
|
1344
|
+
//println!("filtered_matrix cols:{}", filtered_matrix.ncols());
|
|
1342
1345
|
for index in positives {
|
|
1343
1346
|
let row = raw_data.row(index);
|
|
1344
1347
|
filtered_genes.push(gene_names[index].to_owned());
|
package/src/gdcmaf.rs
CHANGED
|
@@ -13,17 +13,25 @@
|
|
|
13
13
|
use flate2::read::GzDecoder;
|
|
14
14
|
use flate2::write::GzEncoder;
|
|
15
15
|
use flate2::Compression;
|
|
16
|
-
use serde_json::Value;
|
|
16
|
+
use serde_json::{Value,json};
|
|
17
17
|
use std::path::Path;
|
|
18
18
|
use futures::StreamExt;
|
|
19
19
|
use std::io::{self,Read,Write};
|
|
20
|
+
use std::sync::Mutex;
|
|
20
21
|
|
|
21
22
|
|
|
23
|
+
// Struct to hold error information
|
|
24
|
+
#[derive(serde::Serialize)]
|
|
25
|
+
struct ErrorEntry {
|
|
26
|
+
url: String,
|
|
27
|
+
error: String,
|
|
28
|
+
}
|
|
22
29
|
|
|
23
|
-
fn select_maf_col(d:String,columns:&Vec<String>) -> Vec<u8> {
|
|
30
|
+
fn select_maf_col(d:String,columns:&Vec<String>,url:&str,errors: &Mutex<Vec<ErrorEntry>>) -> (Vec<u8>,i32) {
|
|
24
31
|
let mut maf_str: String = String::new();
|
|
25
32
|
let mut header_indices: Vec<usize> = Vec::new();
|
|
26
33
|
let lines = d.trim_end().split("\n");
|
|
34
|
+
let mut mafrows = 0;
|
|
27
35
|
for line in lines {
|
|
28
36
|
if line.starts_with("#") {
|
|
29
37
|
continue
|
|
@@ -33,6 +41,11 @@ fn select_maf_col(d:String,columns:&Vec<String>) -> Vec<u8> {
|
|
|
33
41
|
if let Some(index) = header.iter().position(|x| x == col) {
|
|
34
42
|
header_indices.push(index);
|
|
35
43
|
} else {
|
|
44
|
+
let error_msg = format!("Column {} was not found", col);
|
|
45
|
+
errors.lock().unwrap().push(ErrorEntry {
|
|
46
|
+
url: url.to_string().clone(),
|
|
47
|
+
error: error_msg.clone(),
|
|
48
|
+
});
|
|
36
49
|
panic!("{} was not found!",col);
|
|
37
50
|
}
|
|
38
51
|
}
|
|
@@ -44,14 +57,17 @@ fn select_maf_col(d:String,columns:&Vec<String>) -> Vec<u8> {
|
|
|
44
57
|
};
|
|
45
58
|
maf_str.push_str(maf_out_lst.join("\t").as_str());
|
|
46
59
|
maf_str.push_str("\n");
|
|
60
|
+
mafrows += 1;
|
|
47
61
|
}
|
|
48
62
|
};
|
|
49
|
-
maf_str.as_bytes().to_vec()
|
|
63
|
+
(maf_str.as_bytes().to_vec(),mafrows)
|
|
50
64
|
}
|
|
51
65
|
|
|
52
66
|
|
|
53
67
|
#[tokio::main]
|
|
54
68
|
async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
69
|
+
// Create a thread-container for errors
|
|
70
|
+
let errors = Mutex::new(Vec::<ErrorEntry>::new());
|
|
55
71
|
// Accepting the piped input json from jodejs and assign to the variable
|
|
56
72
|
// host: GDC host
|
|
57
73
|
// url: urls to download single maf files
|
|
@@ -75,9 +91,17 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
|
75
91
|
.map(|v| v.to_string().replace("\"",""))
|
|
76
92
|
.collect::<Vec<String>>();
|
|
77
93
|
} else {
|
|
94
|
+
errors.lock().unwrap().push(ErrorEntry {
|
|
95
|
+
url: String::new(),
|
|
96
|
+
error: "The columns of arg is not an array".to_string(),
|
|
97
|
+
});
|
|
78
98
|
panic!("Columns is not an array");
|
|
79
99
|
}
|
|
80
100
|
} else {
|
|
101
|
+
errors.lock().unwrap().push(ErrorEntry {
|
|
102
|
+
url: String::new(),
|
|
103
|
+
error: "The key columns is missed from arg".to_string(),
|
|
104
|
+
});
|
|
81
105
|
panic!("Columns was not selected");
|
|
82
106
|
};
|
|
83
107
|
|
|
@@ -85,39 +109,89 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
|
|
|
85
109
|
let download_futures = futures::stream::iter(
|
|
86
110
|
url.into_iter().map(|url|{
|
|
87
111
|
async move {
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
112
|
+
match reqwest::get(&url).await {
|
|
113
|
+
Ok(resp) if resp.status().is_success() => {
|
|
114
|
+
match resp.bytes().await {
|
|
115
|
+
Ok(content) => {
|
|
116
|
+
let mut decoder = GzDecoder::new(&content[..]);
|
|
117
|
+
let mut decompressed_content = Vec::new();
|
|
118
|
+
match decoder.read_to_end(&mut decompressed_content) {
|
|
119
|
+
Ok(_) => {
|
|
120
|
+
let text = String::from_utf8_lossy(&decompressed_content).to_string();
|
|
121
|
+
return Ok((url.clone(),text))
|
|
122
|
+
}
|
|
123
|
+
Err(e) => {
|
|
124
|
+
let error_msg = format!("Decompression failed: {}", e);
|
|
125
|
+
Err((url.clone(), error_msg))
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
Err(e) => {
|
|
130
|
+
let error_msg = format!("Decompression failed: {}", e);
|
|
131
|
+
Err((url.clone(), error_msg))
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
Ok(resp) => {
|
|
136
|
+
let error_msg = format!("HTTP error: {}", resp.status());
|
|
137
|
+
Err((url.clone(), error_msg))
|
|
138
|
+
}
|
|
139
|
+
Err(e) => {
|
|
140
|
+
let error_msg = format!("Server request failed: {}", e);
|
|
141
|
+
Err((url.clone(), error_msg))
|
|
100
142
|
}
|
|
101
|
-
} else {
|
|
102
|
-
let error_msg = "Failed to download: ".to_string() + &url;
|
|
103
|
-
error_msg
|
|
104
143
|
}
|
|
105
144
|
}
|
|
106
145
|
})
|
|
107
146
|
);
|
|
108
147
|
|
|
109
|
-
// output
|
|
148
|
+
// binary output
|
|
110
149
|
let mut encoder = GzEncoder::new(io::stdout(), Compression::default());
|
|
111
150
|
let _ = encoder.write_all(&maf_col.join("\t").as_bytes().to_vec()).expect("Failed to write header");
|
|
112
151
|
let _ = encoder.write_all(b"\n").expect("Failed to write newline");
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
152
|
+
|
|
153
|
+
// Collect all results before processing
|
|
154
|
+
let results = download_futures.buffer_unordered(50).collect::<Vec<_>>().await;
|
|
155
|
+
|
|
156
|
+
// Process results after all downloads are complete
|
|
157
|
+
for result in results {
|
|
158
|
+
match result {
|
|
159
|
+
Ok((url, content)) => {
|
|
160
|
+
let (maf_bit,mafrows) = select_maf_col(content, &maf_col, &url, &errors);
|
|
161
|
+
if mafrows > 0 {
|
|
162
|
+
let _ = encoder.write_all(&maf_bit).expect("Failed to write file");
|
|
163
|
+
} else {
|
|
164
|
+
errors.lock().unwrap().push(ErrorEntry {
|
|
165
|
+
url: url.clone(),
|
|
166
|
+
error: "Empty maf file".to_string(),
|
|
167
|
+
});
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
Err((url, error)) => {
|
|
171
|
+
errors.lock().unwrap().push(ErrorEntry {
|
|
172
|
+
url,
|
|
173
|
+
error,
|
|
174
|
+
})
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
};
|
|
178
|
+
|
|
179
|
+
// Finalize output and printing errors
|
|
180
|
+
encoder.finish().expect("Maf file output error!");
|
|
181
|
+
|
|
182
|
+
// Manually flush stdout
|
|
183
|
+
io::stdout().flush().expect("Failed to flush stdout");
|
|
184
|
+
|
|
185
|
+
// After processing all downloads, output the errors as JSON to stderr
|
|
186
|
+
let errors = errors.lock().unwrap();
|
|
187
|
+
if !errors.is_empty() {
|
|
188
|
+
let error_json = json!({
|
|
189
|
+
"errors": errors.iter().collect::<Vec<&ErrorEntry>>()
|
|
190
|
+
});
|
|
191
|
+
let mut stderr = io::stderr();
|
|
192
|
+
writeln!(stderr, "{}", error_json).expect("Failed to output stderr!");
|
|
193
|
+
io::stderr().flush().expect("Failed to flush stderr");
|
|
194
|
+
};
|
|
195
|
+
|
|
122
196
|
Ok(())
|
|
123
197
|
}
|