@sjcrh/proteinpaint-rust 2.111.0 → 2.112.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/index.js +142 -32
  2. package/package.json +2 -2
  3. package/src/gdcmaf.rs +130 -46
package/index.js CHANGED
@@ -1,7 +1,10 @@
1
1
  const path = require('path'),
2
- spawn = require('child_process').spawn,
2
+ { spawn, exec } = require('child_process'),
3
3
  Readable = require('stream').Readable,
4
- Transform = require('stream').Transform
4
+ Transform = require('stream').Transform,
5
+ { promisify } = require('util')
6
+
7
+ const execPromise = promisify(exec)
5
8
 
6
9
  exports.run_rust = function (binfile, input_data) {
7
10
  return new Promise((resolve, reject) => {
@@ -45,47 +48,154 @@ exports.run_rust = function (binfile, input_data) {
45
48
 
46
49
  exports.stream_rust = function (binfile, input_data, emitJson) {
47
50
  const binpath = path.join(__dirname, '/target/release/', binfile)
48
- const ps = spawn(binpath)
49
- const stderr = []
50
- try {
51
- // from GDC API -> ps.stdin -> ps.stdout -> transformed stream
52
- Readable.from(input_data).pipe(ps.stdin)
53
- //reader.on('data', ps.stdout.pipe)
54
- //reader.on('error', ps.stderr.pipe)
55
- //return reader
56
- } catch (error) {
57
- ps.kill()
58
- let errmsg = error
59
- //if (stderr.length) errmsg += `killed run_rust('${binfile}'), stderr: ${stderr.join('').trim()}`
60
- //reject(errmsg)
61
- console.log(59, error)
62
- }
63
51
 
52
+ const ps = spawn(binpath)
64
53
  const childStream = new Transform({
65
54
  transform(chunk, encoding, callback) {
66
55
  this.push(chunk)
67
56
  callback()
68
57
  }
69
58
  })
70
- ps.stdout.pipe(childStream)
59
+ // only track gdcmaf processes for expiration; this interval loop is intended for container use, not dev/test CI
60
+ if (binfile == 'gdcmaf') trackByPid(ps.pid, binfile)
61
+ const stderr = []
62
+ try {
63
+ // from route handler -> input_data -> ps.stdin -> ps.stdout -> transformed stream -> express response.pipe()
64
+ Readable.from(input_data)
65
+ .pipe(ps.stdin)
66
+ .on('error', err => {
67
+ emitErrors({ error: `error piping input data to spawned ${binfile} process` })
68
+ })
69
+ } catch (error) {
70
+ console.log(`Error piping input_data into ${binfile}`, error)
71
+ return
72
+ }
73
+
74
+ // uncomment to trigger childStream.destroy()
75
+ // setTimeout(() => { console.log(74, 'childStream.destroy()'); childStream.destroy();}, 1000)
76
+ // childStream.destroy() does not seem to trigger ps.stdout.pipe('...').on('error') callback,
77
+ // which is okay as long as the server doesn't crash and ps gets killed eventually
78
+ ps.stdout.pipe(childStream).on('error', err => console.log('ps.stdout.pipe(childStream) error', err))
79
+
71
80
  ps.stderr.on('data', data => stderr.push(data))
72
- ps.on('close', code => { //console.log(72, stderr.length)
73
- if (stderr.length) {
74
- // handle rust stderr
75
- const errors = stderr.join('').trim().split('\n').map(JSON.parse)
76
- //const errmsg = `!!! stream_rust('${binfile}') stderr: !!!`
77
- //console.log(errmsg, errors)
78
- emitJson({errors})
81
+
82
+ ps.on('close', code => {
83
+ if (trackedPids.has(ps.pid)) trackedPids.delete(ps.pid)
84
+ if (stderr.length || killedPids.has(ps.pid) || code !== 0) {
85
+ emitErrors(null, ps.pid, code)
79
86
  } else {
80
- emitJson({ ok: true, status: 'ok', message: 'Processing complete' })
87
+ emitJson()
81
88
  }
82
89
  })
83
90
  ps.on('error', err => {
84
- //console.log(74, `stream_rust().on('error')`, err)
85
- const errors = stderr.join('').trim().split('\n').map(JSON.parse)
86
- emitJson({errors})
91
+ if (trackedPids.has(ps.pid)) trackedPids.delete(ps.pid)
92
+ // console.log(74, `stream_rust().on('error')`, err)
93
+ emitErrors(null, ps.pid)
94
+ })
95
+ ps.on('SIGTERM', err => {
96
+ console.log(err)
97
+ })
98
+
99
+ function emitErrors(error, pid, code = 0) {
100
+ // concatenate stderr uint8arr into a string
101
+ let errors = stderr.join('').trim()
102
+ if (error) errors += `\n` + error
103
+ if (pid && killedPids.has(ps.pid) && !trackedPids.has(ps.pid)) {
104
+ errors += '\n' + JSON.stringify({ error: `server error: MAF file processing terminated (expired process)` })
105
+ killedPids.delete(pid)
106
+ } else if (pid && code !== 0) {
107
+ // may result from errors in spawned process code, or external signal (like `kill -9` in terminal)
108
+ errors += '\n' + JSON.stringify({ error: `server error: MAF file processing terminated (code=${code})` })
109
+ }
110
+ emitJson(errors)
111
+ }
112
+
113
+ // on('end') will duplicate ps.on('close') event above
114
+ // childStream.on('end', () => console.log(`childStream.on(end)`))
115
+
116
+ // this may duplicate ps.on('error'), unless the error happened within the transform
117
+ childStream.on('error', err => {
118
+ console.log('stream_rust childStream.on(error)', err)
119
+ try {
120
+ childStream.destroy(err)
121
+ } catch (e) {
122
+ console.log(e)
123
+ }
87
124
  })
88
- // below will duplicate ps.on('close') event above
89
- // childStream.on('end', () => console.log(`-- childStream done --`))
90
- return childStream
125
+
126
+ function endStream() {
127
+ try {
128
+ if (!childStream.writableEnded) {
129
+ console.log('trigger childStream.destroy() in endStream()')
130
+ childStream.destroy()
131
+ }
132
+ } catch (e) {
133
+ console.log('error triggering childStream.destroy()', e)
134
+ }
135
+ try {
136
+ if (!ps.killed) {
137
+ console.log('trigger ps.kill() in endStream()')
138
+ ps.kill()
139
+ }
140
+ if (trackedPids.has(ps.pid)) trackedPids.delete(ps.pid)
141
+ } catch (e) {
142
+ console.log('error triggering ps.kill()', e)
143
+ }
144
+ }
145
+
146
+ return { rustStream: childStream, endStream }
147
+ }
148
+
149
+ const trackedPids = new Map() // will be used to monitor expired processes
150
+ const killedPids = new Set() // will be used to detect killed processes, to help with error detection
151
+ const PSKILL_INTERVAL_MS = 30000 // every 30 seconds
152
+ let psKillInterval
153
+
154
+ // default maxElapsed = 5 * 60 * 1000 millisecond = 300000 or 5 minutes, change to 0 to test
155
+ // may allow configuration of maxElapsed by dataset/argument
156
+ function trackByPid(pid, name, maxElapsed = 300000) {
157
+ if (!pid) return
158
+ // only track by value (integer, string), not reference object
159
+ // NOTE: a reused/reassigned process.pid will be replaced by the most recent process
160
+ trackedPids.set(pid, { name, expires: Date.now() + maxElapsed })
161
+ if (!psKillInterval) psKillInterval = setInterval(killExpiredProcesses, PSKILL_INTERVAL_MS)
162
+ // uncomment below to test
163
+ // console.log([...trackedPids.entries()])
164
+ // if (maxElapsed < 10000) setTimeout(killExpiredProcesses, 1000) // uncomment for testing only
165
+ }
166
+
167
+ //
168
+ // Use one setInterval() to monitor >= 1 process,
169
+ // instead of a separate setTimeout() for each process.
170
+ // This is more reliable as setTimeout would use spawned ps.kill(),
171
+ // which may not exist when the timeout callback is executed and
172
+ // thus would require clearTimeout(closured_variable). Tracking by
173
+ // pid does not rely on a usable 'ps' variable to kill itself.
174
+ //
175
+ function killExpiredProcesses() {
176
+ //console.log(149, 'killExpiredProcesses()')
177
+ killedPids.clear()
178
+ const time = Date.now()
179
+ for (const [pid, info] of trackedPids.entries()) {
180
+ if (info.expires > time) continue
181
+ try {
182
+ // true if process exists
183
+ process.kill(pid, 0)
184
+ } catch (_) {
185
+ // no need to kill, but remove from tracking
186
+ trackedPids.delete(pid)
187
+ // prevent misleading logs of 'unable to kill ...'
188
+ continue
189
+ }
190
+ const label = `rust process ${info.name} (pid=${pid})`
191
+ try {
192
+ // detect if process exists before killing it
193
+ process.kill(pid, 'SIGTERM')
194
+ trackedPids.delete(pid)
195
+ killedPids.add(pid)
196
+ console.log(`killed ${label}`)
197
+ } catch (err) {
198
+ console.log(`unable to kill ${label}`, err)
199
+ }
200
+ }
91
201
  }
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.111.0",
2
+ "version": "2.112.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
@@ -38,5 +38,5 @@
38
38
  "devDependencies": {
39
39
  "tape": "^5.2.2"
40
40
  },
41
- "pp_release_tag": "v2.111.0"
41
+ "pp_release_tag": "v2.112.0"
42
42
  }
package/src/gdcmaf.rs CHANGED
@@ -1,22 +1,25 @@
1
1
  /*
2
- This script download cohort maf files from GDC, concatenate them into a single file that includes user specified columns.
2
+ This script downloads cohort MAF files from GDC and concatenates them into a single file that includes user-specified columns.
3
3
 
4
- Input JSON:
5
- host: GDC host
6
- fileIdLst: An array of uuid
7
- Output gzip compressed maf file to stdout.
4
+ Input JSON:
5
+ host: GDC host
6
+ fileIdLst: An array of uuid
7
+ Output gzip compressed maf file to stdout.
8
8
 
9
- Example of usage:
10
- echo '{"host": "https://api.gdc.cancer.gov/data/","columns": ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome", "Start_Position"], "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
9
+ Example of usage:
10
+ echo '{"host": "https://api.gdc.cancer.gov/data/","columns": ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome", "Start_Position"], "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
11
11
  */
12
12
 
13
13
  use flate2::read::GzDecoder;
14
14
  use flate2::write::GzEncoder;
15
15
  use flate2::Compression;
16
16
  use serde_json::{Value};
17
- use std::path::Path;
18
17
  use futures::StreamExt;
19
18
  use std::io::{self,Read,Write};
19
+ use std::time::Duration;
20
+ use tokio::io::{AsyncReadExt, BufReader};
21
+ use tokio::time::timeout;
22
+ use std::sync::{Arc, Mutex};
20
23
 
21
24
  // Struct to hold error information
22
25
  #[derive(serde::Serialize)]
@@ -45,6 +48,9 @@ fn select_maf_col(d:String,columns:&Vec<String>,url:&str) -> Result<(Vec<u8>,i32
45
48
  return Err((url.to_string(), error_msg));
46
49
  }
47
50
  }
51
+ };
52
+ if header_indices.is_empty() {
53
+ return Err((url.to_string(), "No matching columns found".to_string()));
48
54
  }
49
55
  } else {
50
56
  let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
@@ -67,16 +73,67 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
67
73
  // Accepting the piped input json from jodejs and assign to the variable
68
74
  // host: GDC host
69
75
  // url: urls to download single maf files
70
- let mut buffer = String::new();
71
- io::stdin().read_line(&mut buffer)?;
76
+ let timeout_duration = Duration::from_secs(5); // Set a 5-second timeout
77
+
78
+ // Wrap the read operation in a timeout
79
+ let result = timeout(timeout_duration, async {
80
+ let mut buffer = String::new(); // Initialize an empty string to store input
81
+ let mut reader = BufReader::new(tokio::io::stdin()); // Create a buffered reader for stdin
82
+ reader.read_to_string(&mut buffer).await?; // Read all piped input asynchronously
83
+ Ok::<String, io::Error>(buffer) // Return the input as a Result
84
+ })
85
+ .await;
86
+ // Handle the result of the timeout operation
87
+ let file_id_lst_js: Value = match result {
88
+ Ok(Ok(buffer)) => {
89
+ match serde_json::from_str(&buffer) {
90
+ Ok(js) => js,
91
+ Err(e) => {
92
+ let stdin_error = ErrorEntry {
93
+ url: String::new(),
94
+ error: format!("JSON parsing error: {}", e),
95
+ };
96
+ writeln!(io::stderr(), "{}", serde_json::to_string(&stdin_error).unwrap()).unwrap();
97
+ return Err(Box::new(std::io::Error::new(
98
+ std::io::ErrorKind::InvalidInput,
99
+ "JSON parsing error!",
100
+ )) as Box<dyn std::error::Error>);
101
+ }
102
+ }
103
+ }
104
+ Ok(Err(_e)) => {
105
+ let stdin_error = ErrorEntry {
106
+ url: String::new(),
107
+ error: "Error reading from stdin.".to_string(),
108
+ };
109
+ let stdin_error_js = serde_json::to_string(&stdin_error).unwrap();
110
+ writeln!(io::stderr(), "{}", stdin_error_js).expect("Failed to output stderr!");
111
+ return Err(Box::new(std::io::Error::new(
112
+ std::io::ErrorKind::InvalidInput,
113
+ "Failed to output stderr!",
114
+ )) as Box<dyn std::error::Error>);
115
+ }
116
+ Err(_) => {
117
+ let stdin_error = ErrorEntry {
118
+ url: String::new(),
119
+ error: "Timeout while reading from stdin.".to_string(),
120
+ };
121
+ let stdin_error_js = serde_json::to_string(&stdin_error).unwrap();
122
+ writeln!(io::stderr(), "{}", stdin_error_js).expect("Failed to output stderr!");
123
+ return Err(Box::new(std::io::Error::new(
124
+ std::io::ErrorKind::InvalidInput,
125
+ "The columns in arg is not an array",
126
+ )) as Box<dyn std::error::Error>);
127
+ }
128
+ };
72
129
 
73
130
  // reading the input from PP
74
- let file_id_lst_js = serde_json::from_str::<Value>(&buffer).expect("Error reading input and serializing to JSON");
75
131
  let host = file_id_lst_js.get("host").expect("Host was not provided").as_str().expect("Host is not a string");
76
132
  let mut url: Vec<String> = Vec::new();
77
133
  let file_id_lst = file_id_lst_js.get("fileIdLst").expect("File ID list is missed!").as_array().expect("File ID list is not an array");
78
134
  for v in file_id_lst {
79
- url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
135
+ //url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
136
+ url.push(format!("{}/{}",host.trim_end_matches('/'), v.as_str().unwrap()));
80
137
  };
81
138
 
82
139
  // read columns as array from input json and convert data type from Vec<Value> to Vec<String>
@@ -117,7 +174,19 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
117
174
  let download_futures = futures::stream::iter(
118
175
  url.into_iter().map(|url|{
119
176
  async move {
120
- match reqwest::get(&url).await {
177
+ let client = reqwest::Client::builder()
178
+ .timeout(Duration::from_secs(60)) // 60-second timeout per request
179
+ .connect_timeout(Duration::from_secs(15))
180
+ .build()
181
+ .map_err(|_e| {
182
+ let client_error = ErrorEntry{
183
+ url: url.clone(),
184
+ error: "Client build error".to_string(),
185
+ };
186
+ let client_error_js = serde_json::to_string(&client_error).unwrap();
187
+ writeln!(io::stderr(), "{}", client_error_js).expect("Failed to build reqwest client!");
188
+ });
189
+ match client.unwrap().get(&url).send().await {
121
190
  Ok(resp) if resp.status().is_success() => {
122
191
  match resp.bytes().await {
123
192
  Ok(content) => {
@@ -154,50 +223,65 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
154
223
  );
155
224
 
156
225
  // binary output
157
- let mut encoder = GzEncoder::new(io::stdout(), Compression::default());
158
- let _ = encoder.write_all(&maf_col.join("\t").as_bytes().to_vec()).expect("Failed to write header");
159
- let _ = encoder.write_all(b"\n").expect("Failed to write newline");
226
+ let encoder = Arc::new(Mutex::new(GzEncoder::new(io::stdout(), Compression::default())));
160
227
 
161
- download_futures.buffer_unordered(20).for_each(|result| {
162
- match result {
163
- Ok((url, content)) => {
164
- match select_maf_col(content, &maf_col, &url) {
165
- Ok((maf_bit,mafrows)) => {
166
- if mafrows > 0 {
167
- encoder.write_all(&maf_bit).expect("Failed to write file");
168
- } else {
228
+ // Write the header
229
+ {
230
+ let mut encoder_guard = encoder.lock().unwrap(); // Lock the Mutex to get access to the inner GzEncoder
231
+ encoder_guard.write_all(&maf_col.join("\t").as_bytes().to_vec()).expect("Failed to write header");
232
+ encoder_guard.write_all(b"\n").expect("Failed to write newline");
233
+ }
234
+
235
+ download_futures.buffer_unordered(20).for_each( |result| {
236
+ let encoder = Arc::clone(&encoder); // Clone the Arc for each task
237
+ let maf_col_cp = maf_col.clone();
238
+ async move {
239
+ match result {
240
+ Ok((url, content)) => {
241
+ match select_maf_col(content, &maf_col_cp, &url) {
242
+ Ok((maf_bit,mafrows)) => {
243
+ if mafrows > 0 {
244
+ let mut encoder_guard = encoder.lock().unwrap();
245
+ encoder_guard.write_all(&maf_bit).expect("Failed to write file");
246
+ } else {
247
+ let error = ErrorEntry {
248
+ url: url.clone(),
249
+ error: "Empty maf file".to_string(),
250
+ };
251
+ let error_js = serde_json::to_string(&error).unwrap();
252
+ writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
253
+ }
254
+ }
255
+ Err((url,error)) => {
169
256
  let error = ErrorEntry {
170
- url: url.clone(),
171
- error: "Empty maf file".to_string(),
257
+ url,
258
+ error,
172
259
  };
173
260
  let error_js = serde_json::to_string(&error).unwrap();
174
261
  writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
175
262
  }
176
263
  }
177
- Err((url,error)) => {
178
- let error = ErrorEntry {
179
- url,
180
- error,
181
- };
182
- let error_js = serde_json::to_string(&error).unwrap();
183
- writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
184
- }
185
264
  }
186
- }
187
- Err((url, error)) => {
188
- let error = ErrorEntry {
189
- url,
190
- error,
191
- };
192
- let error_js = serde_json::to_string(&error).unwrap();
193
- writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
194
- }
195
- };
196
- async {}
265
+ Err((url, error)) => {
266
+ let error = ErrorEntry {
267
+ url,
268
+ error,
269
+ };
270
+ let error_js = serde_json::to_string(&error).unwrap();
271
+ writeln!(io::stderr(), "{}", error_js).expect("Failed to output stderr!");
272
+ }
273
+ };
274
+ }
197
275
  }).await;
198
276
 
199
- // Finalize output and printing errors
277
+ // Finalize output
278
+
279
+ // Replace the value inside the Mutex with a dummy value (e.g., None)
280
+ let mut encoder_guard = encoder.lock().unwrap();
281
+ let encoder = std::mem::replace(&mut *encoder_guard, GzEncoder::new(io::stdout(), Compression::default()));
282
+ // Finalize the encoder
200
283
  encoder.finish().expect("Maf file output error!");
284
+
201
285
  // Manually flush stdout and stderr
202
286
  io::stdout().flush().expect("Failed to flush stdout");
203
287
  io::stderr().flush().expect("Failed to flush stderr");