@sjcrh/proteinpaint-rust 2.125.0 → 2.126.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +5 -0
- package/index.js +1 -1
- package/package.json +2 -2
- package/src/gdcGRIN2.rs +295 -0
package/Cargo.toml
CHANGED
|
@@ -34,6 +34,7 @@ reqwest = "0.11"
|
|
|
34
34
|
flate2 = "1"
|
|
35
35
|
futures = "0.3"
|
|
36
36
|
num_cpus = "1.16.0"
|
|
37
|
+
memchr = "2"
|
|
37
38
|
|
|
38
39
|
[profile.release]
|
|
39
40
|
lto = "fat"
|
|
@@ -100,3 +101,7 @@ path="src/readHDF5.rs"
|
|
|
100
101
|
[[bin]]
|
|
101
102
|
name="validateHDF5"
|
|
102
103
|
path="src/validateHDF5.rs"
|
|
104
|
+
|
|
105
|
+
[[bin]]
|
|
106
|
+
name="gdcGRIN2"
|
|
107
|
+
path="src/gdcGRIN2.rs"
|
package/index.js
CHANGED
|
@@ -5,7 +5,7 @@ import { spawn, exec } from 'child_process'
|
|
|
5
5
|
import { Readable, Transform } from 'stream'
|
|
6
6
|
import { promisify } from 'util'
|
|
7
7
|
|
|
8
|
-
const __dirname = import.meta.dirname
|
|
8
|
+
const __dirname = import.meta.dirname // set __dirname for consistency with cjs code
|
|
9
9
|
|
|
10
10
|
const execPromise = promisify(exec)
|
|
11
11
|
|
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.
|
|
2
|
+
"version": "2.126.2",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Rust-based utilities for proteinpaint",
|
|
@@ -39,5 +39,5 @@
|
|
|
39
39
|
"devDependencies": {
|
|
40
40
|
"tape": "^5.2.2"
|
|
41
41
|
},
|
|
42
|
-
"pp_release_tag": "v2.
|
|
42
|
+
"pp_release_tag": "v2.126.2"
|
|
43
43
|
}
|
package/src/gdcGRIN2.rs
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
use flate2::read::GzDecoder;
|
|
2
|
+
use futures::StreamExt;
|
|
3
|
+
use memchr::memchr;
|
|
4
|
+
use serde::Deserialize;
|
|
5
|
+
use serde_json;
|
|
6
|
+
use std::collections::HashMap;
|
|
7
|
+
use std::io::{self, Read, Write};
|
|
8
|
+
use std::time::Duration;
|
|
9
|
+
use tokio::io::{AsyncReadExt, BufReader};
|
|
10
|
+
use tokio::time::timeout;
|
|
11
|
+
|
|
12
|
+
// Struct to hold error information.
// One error record, serialized as a JSON line to stderr so the Node.js caller
// can distinguish per-case failures from the successful rows on stdout.
#[derive(serde::Serialize)]
struct ErrorEntry {
    // "<case_id>: <data_type>" label; empty string for stdin-stage errors
    case: String,
    // human-readable description of what failed
    error: String,
}
|
|
18
|
+
|
|
19
|
+
// Define the structure for data download ("datadd").
// Per-case descriptor deserialized from the piped input JSON: optional GDC
// file UUIDs for the case's CNV and MAF files (either may be absent).
#[derive(Deserialize, Debug)]
struct DataType {
    cnv: Option<String>, // UUID of the CNV segment file, appended to the host URL
    maf: Option<String>, // UUID of the MAF file, appended to the host URL
}
|
|
25
|
+
|
|
26
|
+
// Parse the TSV content of one downloaded GDC file into GRIN2 lesion rows.
//
// CNV ("cnv"):
//   keeps columns ["Chromosome","Start","End","Segment_Mean"]; a row is kept
//   only when Segment_Mean >= 0.2 (labeled "gain") or <= -0.2 (labeled "loss").
// MAF ("maf"):
//   keeps columns ["Chromosome","Start_Position","End_Position"] and appends a
//   literal "mutation" lesion-type column.
//
// Each output row is: case_id <TAB> selected columns [<TAB> "mutation"] "\n".
// Failures are returned as (case_id, data_type, message) so the caller can
// emit a structured ErrorEntry.
fn parse_content(content: &str, case_id: &str, data_type: &str) -> Result<String, (String, String, String)> {
    let mut parsed_data = String::new();
    let mut columns_indices: Vec<usize> = Vec::new();
    // header_mk is a marker substring that identifies the header line of each
    // file flavor; columns are the column names we extract.
    let (header_mk, columns): (&str, Vec<&str>) = match data_type {
        "cnv" => ("GDC_Aliquot_ID", vec!["Chromosome", "Start", "End", "Segment_Mean"]),
        "maf" => ("Hugo_Symbol", vec!["Chromosome", "Start_Position", "End_Position"]),
        _ => ("", Vec::new()), // unknown type: nothing matches -> error at the end
    };
    let mut header: Vec<String> = Vec::new(); // column names from the GDC file header
    for line in content.lines() {
        if line.starts_with('#') || line.is_empty() {
            // Skip comment pragmas and blank lines (a blank line previously
            // caused an out-of-bounds panic in the data branch below).
            continue;
        } else if line.contains(header_mk) {
            // Header line: resolve each wanted column name to its index.
            header = line.split('\t').map(|s| s.to_string()).collect();
            for col in &columns {
                match header.iter().position(|x| x == col) {
                    Some(index) => columns_indices.push(index),
                    None => {
                        let error_msg = format!("Column {} was not found", col);
                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
                    }
                }
            }
        } else {
            // Data line.
            let mut keep_ck = true; // false drops the row (CNV segment below threshold)
            let cont_lst: Vec<&str> = line.split('\t').collect();
            let mut out_lst: Vec<String> = Vec::new();
            out_lst.push(case_id.to_string()); // sample ID is always the first field
            for x in columns_indices.iter() {
                // Checked lookup: a truncated row used to panic on cont_lst[*x];
                // report it as a per-case error instead.
                let raw = match cont_lst.get(*x) {
                    Some(v) => *v,
                    None => {
                        let error_msg = format!("Data line has fewer columns than expected ({})", line);
                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
                    }
                };
                let mut element = raw.to_string();
                if data_type == "cnv" && header[*x] == "Segment_Mean" {
                    // Classify the segment mean into gain / loss / discard.
                    let seg_mean = match element.parse::<f32>() {
                        Ok(val) => val,
                        Err(_e) => {
                            let error_msg = "Segment_Mean in cnv file is not float".to_string();
                            return Err((case_id.to_string(), data_type.to_string(), error_msg));
                        }
                    };
                    if seg_mean >= 0.2 {
                        element = "gain".to_string();
                    } else if seg_mean <= -0.2 {
                        element = "loss".to_string();
                    } else {
                        keep_ck = false; // below effect threshold: drop row
                    }
                }
                out_lst.push(element);
            }
            // Every MAF row carries the fixed lesion type "mutation".
            if data_type == "maf" {
                out_lst.push("mutation".to_string());
            }
            if keep_ck {
                parsed_data.push_str(&out_lst.join("\t"));
                parsed_data.push('\n');
            }
        }
    }
    if columns_indices.is_empty() {
        // No recognizable header line was ever seen.
        return Err((
            case_id.to_string(),
            data_type.to_string(),
            "No matching columns found. Problematic file!".to_string(),
        ));
    }
    Ok(parsed_data)
}
|
|
110
|
+
|
|
111
|
+
// Function to download data
|
|
112
|
+
//async fn download_data(data4dl: HashMap<String,DataType>, host: &str) -> Vec<Result<(String, String), (String, String)>> {
|
|
113
|
+
async fn download_data(data4dl: HashMap<String, DataType>, host: &str) -> () {
|
|
114
|
+
// Generate URLs from data4dl, handling optional cnv and maf
|
|
115
|
+
let data_urls = data4dl
|
|
116
|
+
.into_iter()
|
|
117
|
+
.flat_map(|(case_id, data_types)| {
|
|
118
|
+
let mut urls = Vec::new();
|
|
119
|
+
if let Some(cnv_uuid) = &data_types.cnv {
|
|
120
|
+
urls.push((case_id.clone(), "cnv".to_string(), format!("{}{}", host, cnv_uuid)));
|
|
121
|
+
}
|
|
122
|
+
if let Some(maf_uuid) = &data_types.maf {
|
|
123
|
+
urls.push((case_id.clone(), "maf".to_string(), format!("{}{}", host, maf_uuid)));
|
|
124
|
+
}
|
|
125
|
+
urls
|
|
126
|
+
})
|
|
127
|
+
.collect::<Vec<_>>();
|
|
128
|
+
let download_futures = futures::stream::iter(data_urls.into_iter().map(|(case_id, data_type, url)| {
|
|
129
|
+
async move {
|
|
130
|
+
//let case_dt = format!("{}/{}",case_id,data_type).to_string();
|
|
131
|
+
// Build HTTP client with timeouts
|
|
132
|
+
let client = reqwest::Client::builder()
|
|
133
|
+
.timeout(Duration::from_secs(60)) // 60-second timeout per request
|
|
134
|
+
.connect_timeout(Duration::from_secs(30))
|
|
135
|
+
.build()
|
|
136
|
+
.map_err(|_e| "Client build error".to_string());
|
|
137
|
+
// Handle client creation result
|
|
138
|
+
match client {
|
|
139
|
+
Ok(client) => {
|
|
140
|
+
match client.get(&url).send().await {
|
|
141
|
+
Ok(resp) if resp.status().is_success() => {
|
|
142
|
+
match resp.bytes().await {
|
|
143
|
+
Ok(content) => {
|
|
144
|
+
// if data_type == "cnv" {
|
|
145
|
+
if !memchr(0x00, &content).is_some() {
|
|
146
|
+
// CNV files are plain text
|
|
147
|
+
let text = String::from_utf8_lossy(&content).to_string();
|
|
148
|
+
Ok((case_id.clone(), data_type.clone(), text))
|
|
149
|
+
} else {
|
|
150
|
+
let mut decoder = GzDecoder::new(&content[..]);
|
|
151
|
+
let mut decompressed_content = Vec::new();
|
|
152
|
+
match decoder.read_to_end(&mut decompressed_content) {
|
|
153
|
+
Ok(_) => {
|
|
154
|
+
let text = String::from_utf8_lossy(&decompressed_content).to_string();
|
|
155
|
+
Ok((case_id.clone(), data_type.clone(), text))
|
|
156
|
+
}
|
|
157
|
+
Err(e) => {
|
|
158
|
+
let error_msg = format!(
|
|
159
|
+
"Failed to decompress {} file for {}: {}",
|
|
160
|
+
data_type, case_id, e
|
|
161
|
+
);
|
|
162
|
+
Err((case_id.clone(), data_type.clone(), error_msg))
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
Err(e) => {
|
|
168
|
+
let error_msg =
|
|
169
|
+
format!("Failed to read bytes for {} file for {}: {}", data_type, case_id, e);
|
|
170
|
+
Err((case_id.clone(), data_type.clone(), error_msg))
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
Ok(resp) => {
|
|
175
|
+
let error_msg =
|
|
176
|
+
format!("HTTP error for {} file for {}: {}", data_type, case_id, resp.status());
|
|
177
|
+
Err((case_id.clone(), data_type.clone(), error_msg))
|
|
178
|
+
}
|
|
179
|
+
Err(e) => {
|
|
180
|
+
let error_msg =
|
|
181
|
+
format!("Server request failed for {} file for {}: {}", data_type, case_id, e);
|
|
182
|
+
Err((case_id.clone(), data_type.clone(), error_msg))
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
Err(_e) => {
|
|
187
|
+
let error_msg = "Client build error".to_string();
|
|
188
|
+
Err((case_id, data_type, error_msg))
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
}));
|
|
193
|
+
|
|
194
|
+
// Execute downloads concurrently and collect results
|
|
195
|
+
download_futures
|
|
196
|
+
.buffer_unordered(10)
|
|
197
|
+
.for_each(|result| async {
|
|
198
|
+
match result {
|
|
199
|
+
Ok((case_id, data_type, content)) => match parse_content(&content, &case_id, &data_type) {
|
|
200
|
+
Ok(parsed_data) => match serde_json::to_string(&parsed_data) {
|
|
201
|
+
Ok(json) => println!("{}", json),
|
|
202
|
+
Err(e) => {
|
|
203
|
+
let error = ErrorEntry {
|
|
204
|
+
case: format!("{}: {}", case_id, data_type),
|
|
205
|
+
error: format!("Failed to convert data to JSON {}", e),
|
|
206
|
+
};
|
|
207
|
+
let error_js = serde_json::to_string(&error).unwrap();
|
|
208
|
+
eprintln!("{}", error_js);
|
|
209
|
+
}
|
|
210
|
+
},
|
|
211
|
+
Err((cid, dtp, error)) => {
|
|
212
|
+
let error = ErrorEntry {
|
|
213
|
+
case: format!("{}: {}", cid, dtp),
|
|
214
|
+
error,
|
|
215
|
+
};
|
|
216
|
+
let error_js = serde_json::to_string(&error).unwrap();
|
|
217
|
+
eprintln!("{}", error_js);
|
|
218
|
+
}
|
|
219
|
+
},
|
|
220
|
+
Err((case_id, data_type, error)) => {
|
|
221
|
+
let error = ErrorEntry {
|
|
222
|
+
case: format!("{}: {}", case_id, data_type),
|
|
223
|
+
error,
|
|
224
|
+
};
|
|
225
|
+
let error_js = serde_json::to_string(&error).unwrap();
|
|
226
|
+
eprintln!("{}", error_js);
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
})
|
|
230
|
+
.await;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
#[tokio::main]
|
|
234
|
+
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
235
|
+
const HOST: &str = "https://api.gdc.cancer.gov/data/";
|
|
236
|
+
|
|
237
|
+
// Accepting the piped input json from nodejs
|
|
238
|
+
let timeout_duration = Duration::from_secs(5); // Set a 5-second timeout
|
|
239
|
+
|
|
240
|
+
// Wrap the read operation in a timeout
|
|
241
|
+
let result = timeout(timeout_duration, async {
|
|
242
|
+
let mut buffer = String::new(); // Initialize an empty string to store input
|
|
243
|
+
let mut reader = BufReader::new(tokio::io::stdin()); // Create a buffered reader for stdin
|
|
244
|
+
reader.read_to_string(&mut buffer).await?; // Read a line asynchronously
|
|
245
|
+
Ok::<String, io::Error>(buffer) // Return the input as a Result
|
|
246
|
+
})
|
|
247
|
+
.await;
|
|
248
|
+
|
|
249
|
+
// Handle the result of the input timeout operation
|
|
250
|
+
let input_js: HashMap<String, DataType> = match result {
|
|
251
|
+
Ok(Ok(buffer)) => match serde_json::from_str(&buffer) {
|
|
252
|
+
Ok(js) => js,
|
|
253
|
+
Err(e) => {
|
|
254
|
+
let stdin_error = ErrorEntry {
|
|
255
|
+
case: String::new(),
|
|
256
|
+
error: format!("Input JSON parsing error: {}", e),
|
|
257
|
+
};
|
|
258
|
+
writeln!(io::stderr(), "{}", serde_json::to_string(&stdin_error).unwrap()).unwrap();
|
|
259
|
+
return Err(Box::new(std::io::Error::new(
|
|
260
|
+
std::io::ErrorKind::InvalidInput,
|
|
261
|
+
"Input JSON parsing Error!",
|
|
262
|
+
)) as Box<dyn std::error::Error>);
|
|
263
|
+
}
|
|
264
|
+
},
|
|
265
|
+
Ok(Err(_e)) => {
|
|
266
|
+
let stdin_error = ErrorEntry {
|
|
267
|
+
case: String::new(),
|
|
268
|
+
error: "Error reading from stdin.".to_string(),
|
|
269
|
+
};
|
|
270
|
+
let stdin_error_js = serde_json::to_string(&stdin_error).unwrap();
|
|
271
|
+
writeln!(io::stderr(), "{}", stdin_error_js).expect("Failed to output stderr!");
|
|
272
|
+
return Err(Box::new(std::io::Error::new(
|
|
273
|
+
std::io::ErrorKind::InvalidInput,
|
|
274
|
+
"Error reading from stdin!",
|
|
275
|
+
)) as Box<dyn std::error::Error>);
|
|
276
|
+
}
|
|
277
|
+
Err(_) => {
|
|
278
|
+
let stdin_error = ErrorEntry {
|
|
279
|
+
case: String::new(),
|
|
280
|
+
error: "Timeout while reading from stdin.".to_string(),
|
|
281
|
+
};
|
|
282
|
+
let stdin_error_js = serde_json::to_string(&stdin_error).unwrap();
|
|
283
|
+
writeln!(io::stderr(), "{}", stdin_error_js).expect("Failed to output stderr!");
|
|
284
|
+
return Err(Box::new(std::io::Error::new(
|
|
285
|
+
std::io::ErrorKind::InvalidInput,
|
|
286
|
+
"Timeout while reading from stdin.",
|
|
287
|
+
)) as Box<dyn std::error::Error>);
|
|
288
|
+
}
|
|
289
|
+
};
|
|
290
|
+
|
|
291
|
+
// Download data
|
|
292
|
+
download_data(input_js, HOST).await;
|
|
293
|
+
|
|
294
|
+
Ok(())
|
|
295
|
+
}
|