@sjcrh/proteinpaint-rust 2.124.0 → 2.126.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +9 -0
- package/package.json +2 -2
- package/src/cerno.rs +341 -0
- package/src/gdcGRIN2.rs +295 -0
- package/src/readHDF5.rs +2 -2
- package/src/test.rs +0 -3
package/Cargo.toml
CHANGED
@@ -34,6 +34,7 @@ reqwest = "0.11"
 flate2 = "1"
 futures = "0.3"
 num_cpus = "1.16.0"
+memchr = "2"
 
 [profile.release]
 lto = "fat"
@@ -100,3 +101,11 @@ path="src/readHDF5.rs"
 [[bin]]
 name="validateHDF5"
 path="src/validateHDF5.rs"
+
+[[bin]]
+name="gdcGRIN2"
+path="src/gdcGRIN2.rs"
+
+[[bin]]
+name="cerno"
+path="src/cerno.rs"
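The one new dependency, memchr = "2", backs the compressed-versus-plain-text probe in the added gdcGRIN2 binary: downloaded GDC payloads are scanned for a NUL byte, which plain TSV text never contains. A minimal standalone sketch of that probe (not part of the package):

    // Sketch of the check gdcGRIN2.rs builds on memchr for: gzip output
    // contains NUL (0x00) bytes, plain TSV text does not.
    use memchr::memchr;

    fn is_plain_text(content: &[u8]) -> bool {
        memchr(0x00, content).is_none()
    }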
package/package.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "version": "2.124.0",
+  "version": "2.126.0",
   "name": "@sjcrh/proteinpaint-rust",
   "type": "module",
   "description": "Rust-based utilities for proteinpaint",
@@ -39,5 +39,5 @@
   "devDependencies": {
     "tape": "^5.2.2"
   },
-  "pp_release_tag": "v2.124.0"
+  "pp_release_tag": "v2.126.0"
 }
package/src/cerno.rs
ADDED
@@ -0,0 +1,341 @@
+// Syntax: cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/cerno
+#![allow(non_snake_case)]
+use json::JsonValue;
+use r_mathlib::chi_squared_cdf;
+use rusqlite::{Connection, Result};
+use serde::{Deserialize, Serialize};
+use serde_json;
+use std::cmp::Ordering;
+use std::collections::HashSet;
+use std::io;
+
+#[allow(non_camel_case_types)]
+#[allow(non_snake_case)]
+#[derive(Debug)]
+struct GO_pathway {
+    GO_id: String,
+}
+
+#[allow(non_camel_case_types)]
+#[allow(non_snake_case)]
+#[derive(Debug, Clone, PartialEq, PartialOrd)]
+struct gene_order {
+    gene_name: String,
+    fold_change: f64,
+    rank: Option<usize>,
+}
+
+#[allow(non_camel_case_types)]
+#[allow(non_snake_case)]
+#[derive(Debug, Serialize, Deserialize)]
+//#[allow(dead_code)]
+struct pathway_p_value {
+    pathway_name: String,
+    p_value_original: f64,
+    p_value_adjusted: Option<f64>,
+    gene_set_hits: String,
+    auc: f64,
+    es: f64,
+    gene_set_size: usize,
+}
+
+#[allow(non_camel_case_types)]
+#[allow(non_snake_case)]
+#[derive(Debug, Serialize, Deserialize)]
+//#[allow(dead_code)]
+struct output_struct {
+    pval: f64,
+    fdr: f64,
+    leading_edge: String,
+    auc: f64,
+    es: f64,
+    geneset_size: usize,
+}
+
+fn main() -> Result<()> {
+    let mut input = String::new();
+    match io::stdin().read_line(&mut input) {
+        // Accepting the piped input from nodejs (or command line from testing)
+        Ok(_n) => {
+            let input_json = json::parse(&input);
+            match input_json {
+                Ok(json_string) => {
+                    let msigdb_input: &JsonValue = &json_string["db"];
+                    let msigdb;
+                    match msigdb_input.as_str() {
+                        Some(db_string) => msigdb = db_string.to_string(),
+                        None => panic!("msigdb file path is missing"),
+                    }
+                    let genesetgroup;
+                    let genesetgroup_input: &JsonValue = &json_string["geneset_group"];
+                    match genesetgroup_input.as_str() {
+                        Some(genesetgroup_string) => genesetgroup = genesetgroup_string.to_string(),
+                        None => panic!("genesetgroup is missing"),
+                    }
+                    let sample_genes_input: &JsonValue = &json_string["genes"];
+                    let mut sample_genes = Vec::<&str>::new();
+                    for iter in 0..sample_genes_input.len() {
+                        let item = sample_genes_input[iter].as_str().unwrap();
+                        sample_genes.push(item);
+                    }
+                    //println!("sample_genes:{:?}", sample_genes);
+
+                    let fold_change_input: &JsonValue = &json_string["fold_change"];
+                    let mut fold_change_f64 = Vec::<f64>::new();
+                    for iter in 0..fold_change_input.len() {
+                        let item = fold_change_input[iter].as_f64().unwrap();
+                        fold_change_f64.push(item);
+                    }
+
+                    if sample_genes.len() == 0 {
+                        panic!("No sample genes provided");
+                    }
+
+                    if sample_genes.len() != fold_change_f64.len() {
+                        panic!("Length of genes array and fold change array are not equal");
+                    }
+
+                    let mut genes_vector: Vec<gene_order> = Vec::with_capacity(sample_genes.len());
+                    for i in 0..sample_genes.len() {
+                        let item: gene_order = gene_order {
+                            gene_name: sample_genes[i].to_string(),
+                            fold_change: fold_change_f64[i],
+                            rank: None, // Will be calculated later
+                        };
+                        genes_vector.push(item)
+                    }
+                    let mut pathway_p_values: Vec<pathway_p_value> = Vec::with_capacity(10000);
+
+                    let genedb_input: &JsonValue = &json_string["genedb"];
+                    let genedb;
+                    match genedb_input.as_str() {
+                        Some(gene_db_string) => genedb = gene_db_string.to_string(),
+                        None => panic!("genedb file path is missing"),
+                    }
+
+                    let filter_non_coding_genes_input: &JsonValue = &json_string["filter_non_coding_genes"];
+                    let filter_non_coding_genes: bool = filter_non_coding_genes_input.as_bool().unwrap();
+
+                    let genedbconn = Connection::open(genedb)?;
+                    let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
+                    let mut sample_coding_genes: Vec<gene_order> = Vec::with_capacity(24000);
+                    match genedb_result {
+                        Ok(mut x) => {
+                            let mut genes = x.query([])?;
+                            while let Some(coding_gene) = genes.next()? {
+                                //println!("coding_gene:{:?}", coding_gene);
+                                for sample_gene in &genes_vector {
+                                    let code_gene: String = coding_gene.get(0).unwrap();
+                                    if filter_non_coding_genes == true && code_gene == *sample_gene.gene_name {
+                                        sample_coding_genes.push(sample_gene.clone());
+                                    } else if filter_non_coding_genes == false {
+                                        sample_coding_genes.push(sample_gene.clone());
+                                    }
+                                }
+                            }
+                        }
+                        Err(_) => {}
+                    }
+
+                    if sample_coding_genes.len() == 0 {
+                        panic!("All query genes are non-coding");
+                    }
+
+                    // Sort sample_coding_gene in descending order
+                    sample_coding_genes
+                        .as_mut_slice()
+                        .sort_by(|a, b| (b.fold_change).partial_cmp(&a.fold_change).unwrap_or(Ordering::Equal));
+
+                    // Assign ranks to each gene
+                    for i in 0..sample_coding_genes.len() {
+                        sample_coding_genes[i].rank = Some(i)
+                    }
+
+                    //println!("sample_genes:{:?}", sample_genes);
+                    //println!("background_genes:{:?}", background_genes);
+
+                    let msigdbconn = Connection::open(msigdb)?;
+                    let stmt_result = msigdbconn
+                        .prepare(&("select id from terms where parent_id='".to_owned() + &genesetgroup + "'"));
+                    match stmt_result {
+                        Ok(mut stmt) => {
+                            #[allow(non_snake_case)]
+                            let GO_iter = stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
+                            #[allow(non_snake_case)]
+                            for GO_term in GO_iter {
+                                match GO_term {
+                                    Ok(n) => {
+                                        //println!("GO term {:?}", n);
+                                        let sql_statement =
+                                            "select genes from term2genes where id='".to_owned() + &n.GO_id + &"'";
+                                        //println!("sql_statement:{}", sql_statement);
+                                        let mut gene_stmt = msigdbconn.prepare(&(sql_statement))?;
+                                        //println!("gene_stmt:{:?}", gene_stmt);
+
+                                        let mut rows = gene_stmt.query([])?;
+                                        let mut names = HashSet::<String>::new();
+                                        while let Some(row) = rows.next()? {
+                                            let a: String = row.get(0)?;
+                                            let input_gene_json = json::parse(&a);
+                                            match input_gene_json {
+                                                Ok(json_genes) => {
+                                                    for json_iter in 0..json_genes.len() {
+                                                        names.insert(json_genes[json_iter]["symbol"].to_string());
+                                                    }
+                                                }
+                                                Err(_) => {
+                                                    panic!("Symbol, ensg, enstCanonical structure is missing!")
+                                                }
+                                            }
+                                        }
+                                        let gene_set_size = names.len();
+                                        let (p_value, auc, es, matches, gene_set_hits) =
+                                            cerno(&sample_coding_genes, names);
+
+                                        if matches >= 1.0
+                                            && p_value.is_nan() == false
+                                            && es.is_nan() == false
+                                            && es != f64::INFINITY
+                                            && auc != f64::INFINITY
+                                            && auc.is_nan() == false
+                                        {
+                                            pathway_p_values.push(pathway_p_value {
+                                                pathway_name: n.GO_id,
+                                                p_value_original: p_value,
+                                                p_value_adjusted: None,
+                                                auc: auc,
+                                                es: es,
+                                                gene_set_hits: gene_set_hits,
+                                                gene_set_size: gene_set_size,
+                                            })
+                                        }
+                                    }
+                                    Err(_) => {
+                                        println!("GO term not found!")
+                                    }
+                                }
+                            }
+                        }
+                        Err(_) => panic!("sqlite database file not found"),
+                    }
+                    let output_string =
+                        "result: {".to_string() + &"\"data\":" + &adjust_p_values(pathway_p_values) + &"}";
+                    println!("{}", output_string);
+                }
+                Err(error) => println!("Incorrect json:{}", error),
+            }
+        }
+        Err(error) => println!("Piping error: {}", error),
+    }
+    Ok(())
+}
+
|
|
234
|
+
// Filter the sample_coding_genes vector to only include those whose gene_names are in the HashSet genes_in_pathway
|
|
235
|
+
let gene_intersections: Vec<&gene_order> = sample_coding_genes
|
|
236
|
+
.iter()
|
|
237
|
+
.filter(|sample_coding_genes| genes_in_pathway.contains(&sample_coding_genes.gene_name)) // Check if name is in the HashSet genes_in_pathway
|
|
238
|
+
.collect(); // Collect the results into a new vector
|
|
239
|
+
|
|
240
|
+
let N1 = gene_intersections.len() as f64;
|
|
241
|
+
let N = sample_coding_genes.len() as f64;
|
|
242
|
+
let mut gene_set_hits: String = "".to_string();
|
|
243
|
+
for gene in &gene_intersections {
|
|
244
|
+
gene_set_hits += &(gene.gene_name.to_string() + &",");
|
|
245
|
+
}
|
|
246
|
+
if gene_intersections.len() > 0 {
|
|
247
|
+
// Remove the last "," in string
|
|
248
|
+
gene_set_hits.pop();
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
let ranks: Vec<usize> = gene_intersections // x <- l %in% mset$gs2gv[[m]] ; ranks <- c(1:N)[x]
|
|
252
|
+
.iter()
|
|
253
|
+
.map(|x| x.rank.unwrap())
|
|
254
|
+
.collect::<Vec<usize>>();
|
|
255
|
+
|
|
256
|
+
let cerno: f64 = ranks // -2 * sum( log(ranks/N) )
|
|
257
|
+
.iter()
|
|
258
|
+
.map(|x| ((*x as f64) / N).ln())
|
|
259
|
+
.collect::<Vec<f64>>()
|
|
260
|
+
.iter()
|
|
261
|
+
.sum::<f64>()
|
|
262
|
+
* (-2.0);
|
|
263
|
+
|
|
264
|
+
let cES: f64 = cerno / (2.0 * (N1 as f64)); // cES <- cerno/(2*N1)
|
|
265
|
+
let N2 = N - N1; // N2 = N - N1
|
|
266
|
+
let R1 = ranks.iter().sum::<usize>() as f64; // R1 <- sum(ranks)
|
|
267
|
+
let U = N1 * N2 + N1 * (N1 + 1.0) / 2.0 - R1; // U <- N1*N2+N1*(N1+1)/2-R1
|
|
268
|
+
let AUC = U / (N1 * N2); // AUC <- U/(N1*N2)
|
|
269
|
+
let p_value = chi_squared_cdf(cerno, 2.0 * N1, false, false); // pchisq(ret$cerno, 2*N1, lower.tail=FALSE)
|
|
270
|
+
(p_value, AUC, cES, N1, gene_set_hits)
|
|
271
|
+
}
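Written out, with $N$ ranked genes of which the $N_1$ gene-set members occupy ranks $r_1,\dots,r_{N_1}$, the function computes the statistics quoted in its R-style comments:

$$\mathrm{CERNO} = -2\sum_{i=1}^{N_1}\ln\frac{r_i}{N}, \qquad \mathrm{cES} = \frac{\mathrm{CERNO}}{2N_1},$$

$$U = N_1 N_2 + \frac{N_1(N_1+1)}{2} - R_1, \qquad \mathrm{AUC} = \frac{U}{N_1 N_2}, \qquad N_2 = N - N_1, \quad R_1 = \sum_{i=1}^{N_1} r_i.$$

Under the null hypothesis each $r_i/N$ is approximately uniform on $(0,1)$, so each $-2\ln(r_i/N)$ term is $\chi^2_2$-distributed and their sum is $\chi^2_{2N_1}$ (Fisher's method); that is why the upper-tail chi_squared_cdf is evaluated with $2N_1$ degrees of freedom. Note that main() assigns zero-based ranks, so a gene-set member at the very top of the list yields $\ln(0/N) = -\infty$ and drives CERNO and cES to infinity; the is_nan/INFINITY guards in main() discard such pathways rather than report them.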
+
+fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>) -> String {
+    // Sorting p-values in ascending order
+    original_p_values.as_mut_slice().sort_by(|a, b| {
+        (a.p_value_original)
+            .partial_cmp(&b.p_value_original)
+            .unwrap_or(Ordering::Equal)
+    });
+
+    let mut adjusted_p_values: Vec<pathway_p_value> = Vec::with_capacity(original_p_values.len());
+    let mut old_p_value: f64 = 0.0;
+    let mut rank: f64 = original_p_values.len() as f64;
+    for j in 0..original_p_values.len() {
+        let i = original_p_values.len() - j - 1;
+
+        //println!("p_val:{}", p_val);
+        let mut adjusted_p_val: f64 = original_p_values[i].p_value_original * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
+        if adjusted_p_val > 1.0 {
+            // p_value should NEVER be greater than 1
+            adjusted_p_val = 1.0;
+        }
+        //println!("Original p_value:{}", original_p_values[i].p_value);
+        //println!("Raw adjusted p_value:{}", adjusted_p_value);
+        if i != original_p_values.len() - 1 {
+            if adjusted_p_val > old_p_value {
+                adjusted_p_val = old_p_value;
+            }
+        }
+        old_p_value = adjusted_p_val;
+        //println!("adjusted_p_value:{}", adjusted_p_val);
+        rank -= 1.0;
+
+        adjusted_p_values.push(pathway_p_value {
+            pathway_name: original_p_values[i].pathway_name.clone(),
+            p_value_original: original_p_values[i].p_value_original,
+            p_value_adjusted: Some(adjusted_p_val),
+            auc: original_p_values[i].auc,
+            es: original_p_values[i].es,
+            gene_set_hits: original_p_values[i].gene_set_hits.clone(),
+            gene_set_size: original_p_values[i].gene_set_size,
+        });
+    }
+    adjusted_p_values.as_mut_slice().sort_by(|a, b| {
+        (a.p_value_adjusted.unwrap())
+            .partial_cmp(&b.p_value_adjusted.unwrap())
+            .unwrap_or(Ordering::Equal)
+    });
+
+    let mut output_string = "{".to_string();
+    for i in 0..adjusted_p_values.len() {
+        let item = output_struct {
+            pval: adjusted_p_values[i].p_value_original,
+            fdr: adjusted_p_values[i].p_value_adjusted.unwrap(),
+            leading_edge: adjusted_p_values[i].gene_set_hits.clone(),
+            geneset_size: adjusted_p_values[i].gene_set_size,
+            es: adjusted_p_values[i].es,
+            auc: adjusted_p_values[i].auc,
+        };
+        output_string += &format!(
+            "\"{}\":{}",
+            adjusted_p_values[i].pathway_name.clone(),
+            serde_json::to_string(&item).unwrap()
+        );
+        if i < adjusted_p_values.len() - 1 {
+            output_string += &",".to_string();
+        }
+    }
+    output_string += &"}".to_string();
+    output_string
+}
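adjust_p_values implements the Benjamini-Hochberg step-up correction: with the $m$ p-values sorted in ascending order, the loop walks from the largest rank down, scales each p-value by $m/\text{rank}$, caps the result at 1, and clamps it to the adjustment already computed for the next-larger rank so the adjusted values stay monotone:

$$\tilde p_{(i)} = \min\Big(1,\ \min_{j\ge i}\frac{m}{j}\,p_{(j)}\Big).$$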
package/src/gdcGRIN2.rs
ADDED
@@ -0,0 +1,295 @@
+use flate2::read::GzDecoder;
+use futures::StreamExt;
+use memchr::memchr;
+use serde::Deserialize;
+use serde_json;
+use std::collections::HashMap;
+use std::io::{self, Read, Write};
+use std::time::Duration;
+use tokio::io::{AsyncReadExt, BufReader};
+use tokio::time::timeout;
+
+// Struct to hold error information
+#[derive(serde::Serialize)]
+struct ErrorEntry {
+    case: String,
+    error: String,
+}
+
+// Define the structure for datadd
+#[derive(Deserialize, Debug)]
+struct DataType {
+    cnv: Option<String>,
+    maf: Option<String>,
+}
+
+// Function to parse TSV content
+// CNV:
+// Select cnv columns ["Chromosome","Start","End","Segment_Mean"]
+// Segment_Mean >= 0.2 => gain; Segment_Mean <= -0.2 => loss
+// MAF:
+// Select MAF columns ["Chromosome","Start_Position","End_Position"]
+fn parse_content(content: &str, case_id: &str, data_type: &str) -> Result<String, (String, String, String)> {
+    let lines = content.lines();
+    //let mut parsed_data = Vec::new();
+    let mut parsed_data: String = String::new();
+    let mut columns_indices: Vec<usize> = Vec::new();
+    let mut header_mk: &str = "";
+    let mut columns = Vec::new(); // columns selected from GDC file
+    if data_type == "cnv" {
+        header_mk = "GDC_Aliquot_ID";
+        columns = vec!["Chromosome", "Start", "End", "Segment_Mean"]
+    } else if data_type == "maf" {
+        header_mk = "Hugo_Symbol";
+        columns = vec!["Chromosome", "Start_Position", "End_Position"]
+    };
+    let mut header: Vec<String> = Vec::new(); // GDC file header
+    for line in lines {
+        if line.starts_with("#") {
+            continue;
+        } else if line.contains(&header_mk) {
+            // header line
+            header = line.split("\t").map(|s| s.to_string()).collect();
+            for col in &columns {
+                match header.iter().position(|x| x == col) {
+                    Some(index) => {
+                        columns_indices.push(index);
+                    }
+                    None => {
+                        let error_msg = format!("Column {} was not found", col);
+                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
+                    }
+                }
+            }
+        } else {
+            let mut keep_ck: bool = true;
+            let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
+            let mut out_lst: Vec<String> = Vec::new();
+            // add sample ID first
+            out_lst.push(case_id.to_string());
+            for x in columns_indices.iter() {
+                let mut element = cont_lst[*x].to_string();
+                if data_type == "cnv" && &header[*x] == "Segment_Mean" {
+                    // convert to f32 (segment_mean)
+                    let seg_mean = match element.parse::<f32>() {
+                        Ok(val) => val,
+                        Err(_e) => {
+                            let error_msg = "Segment_Mean in cnv file is not float".to_string();
+                            return Err((case_id.to_string(), data_type.to_string(), error_msg));
+                        }
+                    };
+                    if seg_mean >= 0.2 {
+                        element = "gain".to_string();
+                    } else if seg_mean <= -0.2 {
+                        element = "loss".to_string();
+                    } else {
+                        keep_ck = false;
+                    }
+                }
+                out_lst.push(element);
+            }
+            // add lsn.type to snv
+            if data_type == "maf" {
+                out_lst.push("mutation".to_string());
+            }
+            if keep_ck {
+                parsed_data.push_str(out_lst.join("\t").as_str());
+                parsed_data.push_str("\n");
+            }
+        }
+    }
+    if columns_indices.is_empty() {
+        return Err((
+            case_id.to_string(),
+            data_type.to_string(),
+            "No matching columns found. Problematic file!".to_string(),
+        ));
+    };
+    Ok(parsed_data)
+}
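An illustrative check of the cnv branch of parse_content (not shipped with the package; the aliquot ID and coordinates are invented): rows whose Segment_Mean falls inside (-0.2, 0.2) are dropped, and surviving rows are re-keyed by case ID with the score recoded as gain or loss.

    #[test]
    fn cnv_rows_are_filtered_and_recoded() {
        // The cnv branch locates its columns from the header line, which it
        // recognizes by the "GDC_Aliquot_ID" marker.
        let tsv = "GDC_Aliquot_ID\tChromosome\tStart\tEnd\tSegment_Mean\n\
                   aliquot-0001\tchr1\t1000\t2000\t0.50\n\
                   aliquot-0001\tchr1\t3000\t4000\t0.05\n";
        let parsed = parse_content(tsv, "CASE-1", "cnv").unwrap();
        // 0.50 >= 0.2 is recoded as "gain"; 0.05 lies inside (-0.2, 0.2)
        // and the row is dropped.
        assert_eq!(parsed, "CASE-1\tchr1\t1000\t2000\tgain\n");
    }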
+
+// Function to download data
+//async fn download_data(data4dl: HashMap<String,DataType>, host: &str) -> Vec<Result<(String, String), (String, String)>> {
+async fn download_data(data4dl: HashMap<String, DataType>, host: &str) -> () {
+    // Generate URLs from data4dl, handling optional cnv and maf
+    let data_urls = data4dl
+        .into_iter()
+        .flat_map(|(case_id, data_types)| {
+            let mut urls = Vec::new();
+            if let Some(cnv_uuid) = &data_types.cnv {
+                urls.push((case_id.clone(), "cnv".to_string(), format!("{}{}", host, cnv_uuid)));
+            }
+            if let Some(maf_uuid) = &data_types.maf {
+                urls.push((case_id.clone(), "maf".to_string(), format!("{}{}", host, maf_uuid)));
+            }
+            urls
+        })
+        .collect::<Vec<_>>();
+    let download_futures = futures::stream::iter(data_urls.into_iter().map(|(case_id, data_type, url)| {
+        async move {
+            //let case_dt = format!("{}/{}",case_id,data_type).to_string();
+            // Build HTTP client with timeouts
+            let client = reqwest::Client::builder()
+                .timeout(Duration::from_secs(60)) // 60-second timeout per request
+                .connect_timeout(Duration::from_secs(30))
+                .build()
+                .map_err(|_e| "Client build error".to_string());
+            // Handle client creation result
+            match client {
+                Ok(client) => {
+                    match client.get(&url).send().await {
+                        Ok(resp) if resp.status().is_success() => {
+                            match resp.bytes().await {
+                                Ok(content) => {
+                                    // if data_type == "cnv" {
+                                    if !memchr(0x00, &content).is_some() {
+                                        // CNV files are plain text
+                                        let text = String::from_utf8_lossy(&content).to_string();
+                                        Ok((case_id.clone(), data_type.clone(), text))
+                                    } else {
+                                        let mut decoder = GzDecoder::new(&content[..]);
+                                        let mut decompressed_content = Vec::new();
+                                        match decoder.read_to_end(&mut decompressed_content) {
+                                            Ok(_) => {
+                                                let text = String::from_utf8_lossy(&decompressed_content).to_string();
+                                                Ok((case_id.clone(), data_type.clone(), text))
+                                            }
+                                            Err(e) => {
+                                                let error_msg = format!(
+                                                    "Failed to decompress {} file for {}: {}",
+                                                    data_type, case_id, e
+                                                );
+                                                Err((case_id.clone(), data_type.clone(), error_msg))
+                                            }
+                                        }
+                                    }
+                                }
+                                Err(e) => {
+                                    let error_msg =
+                                        format!("Failed to read bytes for {} file for {}: {}", data_type, case_id, e);
+                                    Err((case_id.clone(), data_type.clone(), error_msg))
+                                }
+                            }
+                        }
+                        Ok(resp) => {
+                            let error_msg =
+                                format!("HTTP error for {} file for {}: {}", data_type, case_id, resp.status());
+                            Err((case_id.clone(), data_type.clone(), error_msg))
+                        }
+                        Err(e) => {
+                            let error_msg =
+                                format!("Server request failed for {} file for {}: {}", data_type, case_id, e);
+                            Err((case_id.clone(), data_type.clone(), error_msg))
+                        }
+                    }
+                }
+                Err(_e) => {
+                    let error_msg = "Client build error".to_string();
+                    Err((case_id, data_type, error_msg))
+                }
+            }
+        }
+    }));
+
+    // Execute downloads concurrently and collect results
+    download_futures
+        .buffer_unordered(10)
+        .for_each(|result| async {
+            match result {
+                Ok((case_id, data_type, content)) => match parse_content(&content, &case_id, &data_type) {
+                    Ok(parsed_data) => match serde_json::to_string(&parsed_data) {
+                        Ok(json) => println!("{}", json),
+                        Err(e) => {
+                            let error = ErrorEntry {
+                                case: format!("{}: {}", case_id, data_type),
+                                error: format!("Failed to convert data to JSON {}", e),
+                            };
+                            let error_js = serde_json::to_string(&error).unwrap();
+                            eprintln!("{}", error_js);
+                        }
+                    },
+                    Err((cid, dtp, error)) => {
+                        let error = ErrorEntry {
+                            case: format!("{}: {}", cid, dtp),
+                            error,
+                        };
+                        let error_js = serde_json::to_string(&error).unwrap();
+                        eprintln!("{}", error_js);
+                    }
+                },
+                Err((case_id, data_type, error)) => {
+                    let error = ErrorEntry {
+                        case: format!("{}: {}", case_id, data_type),
+                        error,
+                    };
+                    let error_js = serde_json::to_string(&error).unwrap();
+                    eprintln!("{}", error_js);
+                }
+            }
+        })
+        .await;
+}
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    const HOST: &str = "https://api.gdc.cancer.gov/data/";
+
+    // Accepting the piped input json from nodejs
+    let timeout_duration = Duration::from_secs(5); // Set a 5-second timeout
+
+    // Wrap the read operation in a timeout
+    let result = timeout(timeout_duration, async {
+        let mut buffer = String::new(); // Initialize an empty string to store input
+        let mut reader = BufReader::new(tokio::io::stdin()); // Create a buffered reader for stdin
+        reader.read_to_string(&mut buffer).await?; // Read a line asynchronously
+        Ok::<String, io::Error>(buffer) // Return the input as a Result
+    })
+    .await;
+
+    // Handle the result of the input timeout operation
+    let input_js: HashMap<String, DataType> = match result {
+        Ok(Ok(buffer)) => match serde_json::from_str(&buffer) {
+            Ok(js) => js,
+            Err(e) => {
+                let stdin_error = ErrorEntry {
+                    case: String::new(),
+                    error: format!("Input JSON parsing error: {}", e),
+                };
+                writeln!(io::stderr(), "{}", serde_json::to_string(&stdin_error).unwrap()).unwrap();
+                return Err(Box::new(std::io::Error::new(
+                    std::io::ErrorKind::InvalidInput,
+                    "Input JSON parsing Error!",
+                )) as Box<dyn std::error::Error>);
+            }
+        },
+        Ok(Err(_e)) => {
+            let stdin_error = ErrorEntry {
+                case: String::new(),
+                error: "Error reading from stdin.".to_string(),
+            };
+            let stdin_error_js = serde_json::to_string(&stdin_error).unwrap();
+            writeln!(io::stderr(), "{}", stdin_error_js).expect("Failed to output stderr!");
+            return Err(Box::new(std::io::Error::new(
+                std::io::ErrorKind::InvalidInput,
+                "Error reading from stdin!",
+            )) as Box<dyn std::error::Error>);
+        }
+        Err(_) => {
+            let stdin_error = ErrorEntry {
+                case: String::new(),
+                error: "Timeout while reading from stdin.".to_string(),
+            };
+            let stdin_error_js = serde_json::to_string(&stdin_error).unwrap();
+            writeln!(io::stderr(), "{}", stdin_error_js).expect("Failed to output stderr!");
+            return Err(Box::new(std::io::Error::new(
+                std::io::ErrorKind::InvalidInput,
+                "Timeout while reading from stdin.",
+            )) as Box<dyn std::error::Error>);
+        }
+    };
+
+    // Download data
+    download_data(input_js, HOST).await;
+
+    Ok(())
+}
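The stdin contract follows from the HashMap<String, DataType> deserialization above: a JSON object keyed by case ID whose values carry optional cnv and/or maf file UUIDs, each appended to the GDC data endpoint, with the whole payload expected within the 5-second read timeout. A hypothetical input (case IDs and UUIDs are placeholders):

    // Hypothetical stdin payload for gdcGRIN2; every ID below is a
    // placeholder. The cnv and maf keys may each be omitted per case,
    // since both deserialize into Option<String>.
    fn example_input() -> &'static str {
        r#"{
            "CASE-0001": {"cnv": "11111111-aaaa-4bbb-8ccc-222222222222",
                          "maf": "33333333-dddd-4eee-8fff-444444444444"},
            "CASE-0002": {"maf": "55555555-1111-4222-8333-666666666666"}
        }"#
    }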
package/src/readHDF5.rs
CHANGED
@@ -19,9 +19,9 @@
 use hdf5::types::{FixedAscii, VarLenAscii};
 use hdf5::{File, Result};
 use ndarray::Dim;
-use ndarray::{
+use ndarray::{s, Array1};
 use rayon::prelude::*;
-use serde_json::{Map, Value
+use serde_json::{json, Map, Value};
 use std::io;
 use std::sync::Arc;
 use std::time::Instant;
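The import change pulls in ndarray's s slicing macro and Array1 together with serde_json's json! macro. For reference, a generic sketch of what the new json! import enables (the keys and values are illustrative only, not taken from readHDF5.rs):

    // Generic serde_json::json! usage; builds a Value tree inline.
    use serde_json::json;

    fn demo() -> serde_json::Value {
        json!({ "samples": ["s1", "s2"], "values": [1.5, 2.5] })
    }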
package/src/test.rs
DELETED