@sjcrh/proteinpaint-rust 2.125.0 → 2.126.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -34,6 +34,7 @@ reqwest = "0.11"
34
34
  flate2 = "1"
35
35
  futures = "0.3"
36
36
  num_cpus = "1.16.0"
37
+ memchr = "2"
37
38
 
38
39
  [profile.release]
39
40
  lto = "fat"
@@ -100,3 +101,7 @@ path="src/readHDF5.rs"
100
101
  [[bin]]
101
102
  name="validateHDF5"
102
103
  path="src/validateHDF5.rs"
104
+
105
+ [[bin]]
106
+ name="gdcGRIN2"
107
+ path="src/gdcGRIN2.rs"
package/index.js CHANGED
@@ -5,7 +5,7 @@ import { spawn, exec } from 'child_process'
5
5
  import { Readable, Transform } from 'stream'
6
6
  import { promisify } from 'util'
7
7
 
8
- const __dirname = import.meta.dirname
8
+ const __dirname = import.meta.dirname // set __dirname for consistency with cjs code
9
9
 
10
10
  const execPromise = promisify(exec)
11
11
 
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.125.0",
2
+ "version": "2.126.2",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "type": "module",
5
5
  "description": "Rust-based utilities for proteinpaint",
@@ -39,5 +39,5 @@
39
39
  "devDependencies": {
40
40
  "tape": "^5.2.2"
41
41
  },
42
- "pp_release_tag": "v2.125.0"
42
+ "pp_release_tag": "v2.126.2"
43
43
  }
@@ -0,0 +1,295 @@
1
+ use flate2::read::GzDecoder;
2
+ use futures::StreamExt;
3
+ use memchr::memchr;
4
+ use serde::Deserialize;
5
+ use serde_json;
6
+ use std::collections::HashMap;
7
+ use std::io::{self, Read, Write};
8
+ use std::time::Duration;
9
+ use tokio::io::{AsyncReadExt, BufReader};
10
+ use tokio::time::timeout;
11
+
12
// Struct to hold error information
// Serialized to JSON and written to stderr, one object per failed case/file
// (or with an empty `case` for stdin-level failures in main()).
#[derive(serde::Serialize)]
struct ErrorEntry {
    case: String, // "<case_id>: <data_type>"; empty string for stdin errors
    error: String, // human-readable description of what failed
}
18
+
19
// Define the structure for datadd
// Per-case input parsed from the piped-in JSON: optional GDC file UUIDs that
// are appended to the API host URL to form each download URL.
#[derive(Deserialize, Debug)]
struct DataType {
    cnv: Option<String>, // CNV file UUID, if the case has one
    maf: Option<String>, // MAF file UUID, if the case has one
}
25
+
26
// Function to parse TSV content downloaded from GDC.
// CNV:
//   Select cnv columns ["Chromosome","Start","End","Segment_Mean"]
//   Segment_Mean >= 0.2 => gain; Segment_Mean <= -0.2 => loss; rows in the
//   neutral band in between are dropped.
// MAF:
//   Select MAF columns ["Chromosome","Start_Position","End_Position"]
//
// Returns one tab-separated line per kept row (each prefixed with `case_id`,
// MAF rows suffixed with "mutation"), joined with '\n'. On malformed input
// returns Err((case_id, data_type, message)) instead of panicking.
fn parse_content(content: &str, case_id: &str, data_type: &str) -> Result<String, (String, String, String)> {
    // Header marker used to recognize the header line, plus the columns to
    // extract, per supported data type.
    let (header_mk, columns): (&str, Vec<&str>) = match data_type {
        "cnv" => ("GDC_Aliquot_ID", vec!["Chromosome", "Start", "End", "Segment_Mean"]),
        "maf" => ("Hugo_Symbol", vec!["Chromosome", "Start_Position", "End_Position"]),
        other => {
            // Previously an unknown type matched every line as a "header";
            // fail fast with an explicit message instead.
            return Err((
                case_id.to_string(),
                other.to_string(),
                format!("Unsupported data type: {}", other),
            ));
        }
    };
    let mut parsed_data = String::new();
    let mut columns_indices: Vec<usize> = Vec::new();
    let mut header: Vec<String> = Vec::new(); // GDC file header
    for line in content.lines() {
        if line.is_empty() || line.starts_with('#') {
            // Skip comment and blank lines (a blank line would otherwise
            // panic when its fields are indexed below).
            continue;
        } else if line.contains(header_mk) {
            // Header line: locate the indices of the columns we keep.
            header = line.split('\t').map(|s| s.to_string()).collect();
            columns_indices.clear();
            for col in &columns {
                match header.iter().position(|x| x == col) {
                    Some(index) => columns_indices.push(index),
                    None => {
                        let error_msg = format!("Column {} was not found", col);
                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
                    }
                }
            }
        } else {
            if columns_indices.is_empty() {
                // Data row seen before any header line: it cannot be
                // interpreted, so skip it rather than emit a bare case id.
                continue;
            }
            let mut keep_ck: bool = true; // false drops neutral-band CNV rows
            let cont_lst: Vec<&str> = line.split('\t').collect();
            let mut out_lst: Vec<String> = Vec::new();
            // add sample ID first
            out_lst.push(case_id.to_string());
            for x in columns_indices.iter() {
                // Checked access: a truncated row yields an error, not a panic.
                let mut element = match cont_lst.get(*x) {
                    Some(v) => v.to_string(),
                    None => {
                        let error_msg = format!("Data row has fewer than {} columns", x + 1);
                        return Err((case_id.to_string(), data_type.to_string(), error_msg));
                    }
                };
                if data_type == "cnv" && header[*x] == "Segment_Mean" {
                    // convert to f32 (segment_mean) and classify as gain/loss
                    let seg_mean = match element.parse::<f32>() {
                        Ok(val) => val,
                        Err(_e) => {
                            let error_msg = "Segment_Mean in cnv file is not float".to_string();
                            return Err((case_id.to_string(), data_type.to_string(), error_msg));
                        }
                    };
                    if seg_mean >= 0.2 {
                        element = "gain".to_string();
                    } else if seg_mean <= -0.2 {
                        element = "loss".to_string();
                    } else {
                        keep_ck = false;
                    }
                }
                out_lst.push(element);
            }
            // add lsn.type to snv
            if data_type == "maf" {
                out_lst.push("mutation".to_string());
            }
            if keep_ck {
                parsed_data.push_str(&out_lst.join("\t"));
                parsed_data.push('\n');
            }
        }
    }
    if columns_indices.is_empty() {
        return Err((
            case_id.to_string(),
            data_type.to_string(),
            "No matching columns found. Problematic file!".to_string(),
        ));
    }
    Ok(parsed_data)
}
110
+
111
+ // Function to download data
112
+ //async fn download_data(data4dl: HashMap<String,DataType>, host: &str) -> Vec<Result<(String, String), (String, String)>> {
113
+ async fn download_data(data4dl: HashMap<String, DataType>, host: &str) -> () {
114
+ // Generate URLs from data4dl, handling optional cnv and maf
115
+ let data_urls = data4dl
116
+ .into_iter()
117
+ .flat_map(|(case_id, data_types)| {
118
+ let mut urls = Vec::new();
119
+ if let Some(cnv_uuid) = &data_types.cnv {
120
+ urls.push((case_id.clone(), "cnv".to_string(), format!("{}{}", host, cnv_uuid)));
121
+ }
122
+ if let Some(maf_uuid) = &data_types.maf {
123
+ urls.push((case_id.clone(), "maf".to_string(), format!("{}{}", host, maf_uuid)));
124
+ }
125
+ urls
126
+ })
127
+ .collect::<Vec<_>>();
128
+ let download_futures = futures::stream::iter(data_urls.into_iter().map(|(case_id, data_type, url)| {
129
+ async move {
130
+ //let case_dt = format!("{}/{}",case_id,data_type).to_string();
131
+ // Build HTTP client with timeouts
132
+ let client = reqwest::Client::builder()
133
+ .timeout(Duration::from_secs(60)) // 60-second timeout per request
134
+ .connect_timeout(Duration::from_secs(30))
135
+ .build()
136
+ .map_err(|_e| "Client build error".to_string());
137
+ // Handle client creation result
138
+ match client {
139
+ Ok(client) => {
140
+ match client.get(&url).send().await {
141
+ Ok(resp) if resp.status().is_success() => {
142
+ match resp.bytes().await {
143
+ Ok(content) => {
144
+ // if data_type == "cnv" {
145
+ if !memchr(0x00, &content).is_some() {
146
+ // CNV files are plain text
147
+ let text = String::from_utf8_lossy(&content).to_string();
148
+ Ok((case_id.clone(), data_type.clone(), text))
149
+ } else {
150
+ let mut decoder = GzDecoder::new(&content[..]);
151
+ let mut decompressed_content = Vec::new();
152
+ match decoder.read_to_end(&mut decompressed_content) {
153
+ Ok(_) => {
154
+ let text = String::from_utf8_lossy(&decompressed_content).to_string();
155
+ Ok((case_id.clone(), data_type.clone(), text))
156
+ }
157
+ Err(e) => {
158
+ let error_msg = format!(
159
+ "Failed to decompress {} file for {}: {}",
160
+ data_type, case_id, e
161
+ );
162
+ Err((case_id.clone(), data_type.clone(), error_msg))
163
+ }
164
+ }
165
+ }
166
+ }
167
+ Err(e) => {
168
+ let error_msg =
169
+ format!("Failed to read bytes for {} file for {}: {}", data_type, case_id, e);
170
+ Err((case_id.clone(), data_type.clone(), error_msg))
171
+ }
172
+ }
173
+ }
174
+ Ok(resp) => {
175
+ let error_msg =
176
+ format!("HTTP error for {} file for {}: {}", data_type, case_id, resp.status());
177
+ Err((case_id.clone(), data_type.clone(), error_msg))
178
+ }
179
+ Err(e) => {
180
+ let error_msg =
181
+ format!("Server request failed for {} file for {}: {}", data_type, case_id, e);
182
+ Err((case_id.clone(), data_type.clone(), error_msg))
183
+ }
184
+ }
185
+ }
186
+ Err(_e) => {
187
+ let error_msg = "Client build error".to_string();
188
+ Err((case_id, data_type, error_msg))
189
+ }
190
+ }
191
+ }
192
+ }));
193
+
194
+ // Execute downloads concurrently and collect results
195
+ download_futures
196
+ .buffer_unordered(10)
197
+ .for_each(|result| async {
198
+ match result {
199
+ Ok((case_id, data_type, content)) => match parse_content(&content, &case_id, &data_type) {
200
+ Ok(parsed_data) => match serde_json::to_string(&parsed_data) {
201
+ Ok(json) => println!("{}", json),
202
+ Err(e) => {
203
+ let error = ErrorEntry {
204
+ case: format!("{}: {}", case_id, data_type),
205
+ error: format!("Failed to convert data to JSON {}", e),
206
+ };
207
+ let error_js = serde_json::to_string(&error).unwrap();
208
+ eprintln!("{}", error_js);
209
+ }
210
+ },
211
+ Err((cid, dtp, error)) => {
212
+ let error = ErrorEntry {
213
+ case: format!("{}: {}", cid, dtp),
214
+ error,
215
+ };
216
+ let error_js = serde_json::to_string(&error).unwrap();
217
+ eprintln!("{}", error_js);
218
+ }
219
+ },
220
+ Err((case_id, data_type, error)) => {
221
+ let error = ErrorEntry {
222
+ case: format!("{}: {}", case_id, data_type),
223
+ error,
224
+ };
225
+ let error_js = serde_json::to_string(&error).unwrap();
226
+ eprintln!("{}", error_js);
227
+ }
228
+ }
229
+ })
230
+ .await;
231
+ }
232
+
233
+ #[tokio::main]
234
+ async fn main() -> Result<(), Box<dyn std::error::Error>> {
235
+ const HOST: &str = "https://api.gdc.cancer.gov/data/";
236
+
237
+ // Accepting the piped input json from nodejs
238
+ let timeout_duration = Duration::from_secs(5); // Set a 5-second timeout
239
+
240
+ // Wrap the read operation in a timeout
241
+ let result = timeout(timeout_duration, async {
242
+ let mut buffer = String::new(); // Initialize an empty string to store input
243
+ let mut reader = BufReader::new(tokio::io::stdin()); // Create a buffered reader for stdin
244
+ reader.read_to_string(&mut buffer).await?; // Read a line asynchronously
245
+ Ok::<String, io::Error>(buffer) // Return the input as a Result
246
+ })
247
+ .await;
248
+
249
+ // Handle the result of the input timeout operation
250
+ let input_js: HashMap<String, DataType> = match result {
251
+ Ok(Ok(buffer)) => match serde_json::from_str(&buffer) {
252
+ Ok(js) => js,
253
+ Err(e) => {
254
+ let stdin_error = ErrorEntry {
255
+ case: String::new(),
256
+ error: format!("Input JSON parsing error: {}", e),
257
+ };
258
+ writeln!(io::stderr(), "{}", serde_json::to_string(&stdin_error).unwrap()).unwrap();
259
+ return Err(Box::new(std::io::Error::new(
260
+ std::io::ErrorKind::InvalidInput,
261
+ "Input JSON parsing Error!",
262
+ )) as Box<dyn std::error::Error>);
263
+ }
264
+ },
265
+ Ok(Err(_e)) => {
266
+ let stdin_error = ErrorEntry {
267
+ case: String::new(),
268
+ error: "Error reading from stdin.".to_string(),
269
+ };
270
+ let stdin_error_js = serde_json::to_string(&stdin_error).unwrap();
271
+ writeln!(io::stderr(), "{}", stdin_error_js).expect("Failed to output stderr!");
272
+ return Err(Box::new(std::io::Error::new(
273
+ std::io::ErrorKind::InvalidInput,
274
+ "Error reading from stdin!",
275
+ )) as Box<dyn std::error::Error>);
276
+ }
277
+ Err(_) => {
278
+ let stdin_error = ErrorEntry {
279
+ case: String::new(),
280
+ error: "Timeout while reading from stdin.".to_string(),
281
+ };
282
+ let stdin_error_js = serde_json::to_string(&stdin_error).unwrap();
283
+ writeln!(io::stderr(), "{}", stdin_error_js).expect("Failed to output stderr!");
284
+ return Err(Box::new(std::io::Error::new(
285
+ std::io::ErrorKind::InvalidInput,
286
+ "Timeout while reading from stdin.",
287
+ )) as Box<dyn std::error::Error>);
288
+ }
289
+ };
290
+
291
+ // Download data
292
+ download_data(input_js, HOST).await;
293
+
294
+ Ok(())
295
+ }