@sjcrh/proteinpaint-rust 2.129.6 → 2.130.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -3
- package/package.json +1 -1
- package/src/gdcGRIN2.rs +109 -32
package/README.md
CHANGED
@@ -36,13 +36,18 @@ const out = await run_rust('indel', input_data)
 
 ## Test
 
-
+For running the tests written in nodejs, from the `proteinpaint` directory run,
 
 ```bash
-
-npx tsc
+npm run test:unit --workspace="rust"
 ```
 
+For running the tests written in native rust, from the `proteinpaint/rust` directory run.
+```bash
+cargo test
+```
+
+
 ## Build
 
 ```bash
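As a side note, `cargo test` picks up any function annotated with `#[test]` in the crate. A minimal sketch of the kind of native-Rust unit test it would discover; the helper and module below are hypothetical, not taken from this package:

```rust
// Hypothetical helper plus a unit test that `cargo test` would discover.
// Nothing here is from the proteinpaint-rust crate itself.

/// Split a tab-separated line into owned fields.
fn split_tsv(line: &str) -> Vec<String> {
    line.split('\t').map(|s| s.to_string()).collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn splits_on_tabs() {
        assert_eq!(split_tsv("a\tb\tc"), vec!["a", "b", "c"]);
    }
}
```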
package/package.json
CHANGED
package/src/gdcGRIN2.rs
CHANGED
@@ -19,12 +19,13 @@
 use flate2::read::GzDecoder;
 use futures::StreamExt;
 use memchr::memchr;
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};
 use serde_json;
 use std::collections::HashMap;
 use std::io::{self, Read};
 use std::sync::Arc;
 use std::sync::atomic::{AtomicUsize, Ordering};
+use std::thread::sleep;
 use std::time::Duration;
 use tokio::io::{AsyncReadExt, BufReader};
 use tokio::sync::Mutex;
@@ -78,6 +79,30 @@ struct SuccessfulFileOutput {
     data: Vec<Vec<String>>,
 }
 
+// struct for MAF filter details
+#[derive(Clone, Serialize, Default)]
+struct FilteredMafDetails {
+    invalid_consequences: usize,
+    t_alt_count: usize,
+    t_depth: usize,
+    invalid_rows: usize,
+}
+
+// struct for CNV filter details
+#[derive(Clone, Serialize, Default)]
+struct FilteredCnvDetails {
+    segment_mean: usize,
+    seg_length: usize,
+    invalid_rows: usize,
+}
+
+// struct for per-case filter details
+#[derive(Clone, Serialize)]
+struct FilteredCaseDetails {
+    maf: FilteredMafDetails,
+    cnv: FilteredCnvDetails,
+}
+
 // Final summary output (JSONL format)
 #[derive(serde::Serialize)]
 struct FinalSummary {
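Because the new structs derive `Serialize`, they can be embedded directly in the JSONL summary. A small sketch of the serialized shape, with made-up counts (the struct definition mirrors the one added above):

```rust
// Sketch: how FilteredMafDetails serializes with serde_json; counts are invented.
use serde::Serialize;

#[derive(Clone, Serialize, Default)]
struct FilteredMafDetails {
    invalid_consequences: usize,
    t_alt_count: usize,
    t_depth: usize,
    invalid_rows: usize,
}

fn main() {
    let maf = FilteredMafDetails { invalid_consequences: 2, ..Default::default() };
    // Prints: {"invalid_consequences":2,"t_alt_count":0,"t_depth":0,"invalid_rows":0}
    println!("{}", serde_json::to_string(&maf).unwrap());
}
```

Deriving `Default` on the two leaf structs is what lets call sites initialize zeroed counters without spelling out every field.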
@@ -87,6 +112,10 @@ struct FinalSummary {
     successful_files: usize,
     failed_files: usize,
     errors: Vec<ErrorEntry>,
+    filtered_records: usize,
+    filtered_maf_records: usize,
+    filtered_cnv_records: usize,
+    filtered_records_by_case: HashMap<String, FilteredCaseDetails>,
 }
 
 // Define the top-level input structure
@@ -107,24 +136,8 @@ struct DataTypeConfig {
     output_columns: Vec<&'static str>,
 }
 
-// Function to check if CNV file has Segment_Mean column
-fn has_segment_mean_column(content: &str) -> bool {
-    for line in content.lines() {
-        // Check if this line contains Segment_Mean (likely the header)
-        if line.contains("Segment_Mean") {
-            return true;
-        }
-        // Stop checking after a few non-comment lines to avoid parsing entire file
-        if !line.trim().is_empty() {
-            break;
-        }
-    }
-    false
-}
-
 // Function to parse TSV content
-
-fn parse_content(
+async fn parse_content(
     content: &str,
     case_id: &str,
     data_type: &str,
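Turning `parse_content` into an `async fn` ripples outward: every call site must now `.await` it, which is why later hunks in this diff add `.await` at the `process_row` and `parse_content` call sites. A self-contained sketch of that pattern, with illustrative names:

```rust
// Illustration of how an async fn forces its callers to await it
// (and therefore to be async themselves). Names are hypothetical.
async fn parse(line: &str) -> usize {
    line.len()
}

async fn process_all(lines: &[&str]) -> usize {
    let mut total = 0;
    for line in lines {
        total += parse(line).await; // caller must await
    }
    total
}

#[tokio::main]
async fn main() {
    println!("{}", process_all(&["a", "bb"]).await);
}
```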
@@ -134,12 +147,10 @@ fn parse_content(
     gain_threshold: f32,
     loss_threshold: f32,
     seg_length: i32,
+    filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
+    filtered_maf_records: &AtomicUsize,
+    filtered_cnv_records: &AtomicUsize,
 ) -> Result<Vec<Vec<String>>, (String, String, String)> {
-    // Early filter for CNV files - only process files with Segment_Mean
-    if data_type == "cnv" && !has_segment_mean_column(content) {
-        return Ok(Vec::new()); // Return empty result, no error
-    }
-
     let config = match data_type {
         "cnv" => DataTypeConfig {
             header_marker: "Segment_Mean",
@@ -199,7 +210,11 @@ fn parse_content(
                 gain_threshold,
                 loss_threshold,
                 seg_length,
-            )?;
+                filtered_records,
+                filtered_maf_records,
+                filtered_cnv_records,
+            )
+            .await?;
 
             if let Some(out_lst) = row {
                 parsed_data.push(out_lst);
@@ -254,7 +269,7 @@ fn setup_columns(
 }
 
 // Process a single row of data
-fn process_row(
+async fn process_row(
     line: &str,
     case_id: &str,
     data_type: &str,
@@ -267,18 +282,41 @@ fn process_row(
     gain_threshold: f32,
     loss_threshold: f32,
     seg_length: i32,
+    filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
+    filtered_maf_records: &AtomicUsize,
+    filtered_cnv_records: &AtomicUsize,
 ) -> Result<Option<Vec<String>>, (String, String, String)> {
     let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
     let mut out_lst = vec![case_id.to_string()];
 
+    // Initialize or update case details
+    let mut filtered_map = filtered_records.lock().await;
+    filtered_map
+        .entry(case_id.to_string())
+        .or_insert_with(|| FilteredCaseDetails {
+            maf: FilteredMafDetails::default(),
+            cnv: FilteredCnvDetails::default(),
+        });
+
+    let case_details = filtered_map.get_mut(case_id).unwrap();
+
     // Check consequence filtering for MAF files
     if data_type == "maf" && !is_valid_consequence(&cont_lst, variant_classification_index, consequences) {
+        case_details.maf.invalid_consequences += 1;
+        filtered_maf_records.fetch_add(1, Ordering::Relaxed);
         return Ok(None);
     }
 
     // Extract relevant columns
     for &x in columns_indices {
         if x >= cont_lst.len() {
+            if data_type == "maf" {
+                case_details.maf.invalid_rows += 1;
+                filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            } else if data_type == "cnv" {
+                case_details.cnv.invalid_rows += 1;
+                filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+            }
            return Ok(None); // Invalid row
         }
 
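One design note on the initialization above: `entry(...).or_insert_with(...)` already returns `&mut FilteredCaseDetails`, so the follow-up `get_mut(case_id).unwrap()` is avoidable; and if `FilteredCaseDetails` also derived `Default`, `or_default()` would collapse this to a single lookup. A sketch of that form, with a simplified stand-in struct:

```rust
use std::collections::HashMap;

// Simplified stand-in for FilteredCaseDetails; the real struct nests maf/cnv.
#[derive(Default)]
struct CaseDetails {
    invalid_rows: usize,
}

fn main() {
    let mut map: HashMap<String, CaseDetails> = HashMap::new();
    // or_default() inserts a zeroed entry if missing and returns
    // &mut CaseDetails in a single lookup.
    let details = map.entry("case-1".to_string()).or_default();
    details.invalid_rows += 1;
    assert_eq!(map["case-1"].invalid_rows, 1);
}
```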
@@ -286,6 +324,8 @@ fn process_row(
         if data_type == "cnv" && header[x] == "Segment_Mean" {
             element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
             if element.is_empty() {
+                case_details.cnv.segment_mean += 1;
+                filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
                 return Ok(None);
             }
         }
@@ -295,10 +335,14 @@ fn process_row(
     // Additional MAF-specific processing
     if data_type == "maf" {
         if out_lst.len() < 6 {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             return Ok(None); // Not enough columns
         }
 
         let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -307,6 +351,8 @@ fn process_row(
         })?;
 
         let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -314,7 +360,14 @@ fn process_row(
             )
         })?;
 
-        if alle_depth < min_total_depth
+        if alle_depth < min_total_depth {
+            case_details.maf.t_depth += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None);
+        }
+        if alt_count < min_alt_allele_count {
+            case_details.maf.t_alt_count += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             return Ok(None);
         }
 
@@ -323,10 +376,12 @@ fn process_row(
         out_lst.push("mutation".to_string());
     }
 
-    // filter cnvs based on segment length. Default:
+    // filter cnvs based on segment length. Default: 0 (no filtering)
     if data_type == "cnv" {
         // calculate segment length (End_Position - Start_Position)
         let end_position = out_lst[3].parse::<i32>().map_err(|_| {
+            case_details.cnv.invalid_rows += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -335,6 +390,8 @@ fn process_row(
         })?;
 
         let start_position = out_lst[2].parse::<i32>().map_err(|_| {
+            case_details.cnv.invalid_rows += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -342,7 +399,9 @@ fn process_row(
             )
         })?;
         let cnv_length = end_position - start_position;
-        if cnv_length > seg_length {
+        if seg_length > 0 && cnv_length > seg_length {
+            case_details.cnv.seg_length += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
             return Ok(None);
         }
     }
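The new `seg_length > 0 &&` guard makes zero a sentinel meaning "length filter disabled", which matches the new default of `0` set in `main`. A tiny sketch of the same convention, with illustrative values:

```rust
// Sentinel convention: a max length of 0 disables the filter entirely.
fn keep_segment(length: i32, max_len: i32) -> bool {
    max_len == 0 || length <= max_len
}

fn main() {
    assert!(keep_segment(5_000_000, 0)); // 0 => no filtering
    assert!(!keep_segment(5_000_000, 1_000_000)); // longer than cap, dropped
    assert!(keep_segment(500_000, 1_000_000)); // within cap, kept
}
```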
@@ -517,7 +576,7 @@ async fn download_single_file(
     ))
 }
 
-///
+/// Streaming download function
 /// Outputs JSONL format: one JSON object per line
 /// Node.js will read this line-by-line but still wait for completion
 async fn download_data_streaming(
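The JSONL contract documented here is: one JSON object per line on stdout, flushed after each record so the Node.js reader sees results as they arrive, with the summary object last. A minimal sketch of that output discipline (the record strings are placeholders):

```rust
use std::io::Write;

fn main() {
    let mut out = std::io::stdout();
    // One JSON object per line; flush so a line-by-line consumer
    // (here, Node.js) sees each record as soon as it is written.
    for record in ["{\"type\":\"data\"}", "{\"type\":\"summary\"}"] {
        writeln!(out, "{record}").unwrap();
        out.flush().unwrap();
    }
}
```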
@@ -549,6 +608,9 @@ async fn download_data_streaming(
     // Counters for final summary
     let successful_downloads = Arc::new(AtomicUsize::new(0));
     let failed_downloads = Arc::new(AtomicUsize::new(0));
+    let filtered_maf_records = Arc::new(AtomicUsize::new(0));
+    let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
+    let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
 
     // Only collect errors (successful data is output immediately)
     let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
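The two kinds of shared state differ deliberately: flat counters use `Arc<AtomicUsize>` so concurrent tasks can increment without locking, while the per-case map holds structs and therefore sits behind tokio's async `Mutex`. A self-contained sketch of that split, assuming tokio:

```rust
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use tokio::sync::Mutex;

#[tokio::main]
async fn main() {
    let counter = Arc::new(AtomicUsize::new(0));
    let by_case = Arc::new(Mutex::new(HashMap::<String, usize>::new()));

    let mut handles = Vec::new();
    for i in 0..4 {
        let counter = Arc::clone(&counter);
        let by_case = Arc::clone(&by_case);
        handles.push(tokio::spawn(async move {
            // Lock-free increment for the flat counter.
            counter.fetch_add(1, Ordering::Relaxed);
            // Structured per-key state goes behind the async mutex.
            *by_case.lock().await.entry(format!("case-{i}")).or_insert(0) += 1;
        }));
    }
    for h in handles {
        h.await.unwrap();
    }
    assert_eq!(counter.load(Ordering::Relaxed), 4);
}
```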
@@ -565,6 +627,9 @@ async fn download_data_streaming(
         .for_each(|download_result| {
             let successful_downloads = Arc::clone(&successful_downloads);
             let failed_downloads = Arc::clone(&failed_downloads);
+            let filtered_maf_records = Arc::clone(&filtered_maf_records);
+            let filtered_cnv_records = Arc::clone(&filtered_cnv_records);
+            let filtered_records = Arc::clone(&filtered_records);
             let errors = Arc::clone(&errors);
 
             async move {
@@ -581,7 +646,12 @@ async fn download_data_streaming(
                     gain_threshold,
                     loss_threshold,
                     seg_length,
-                ) {
+                    &filtered_records,
+                    &filtered_maf_records,
+                    &filtered_cnv_records,
+                )
+                .await
+                {
                     Ok(parsed_data) => {
                         // SUCCESS: Output immediately as JSONL
                         let success_output = SuccessfulFileOutput {
@@ -597,6 +667,8 @@ async fn download_data_streaming(
                         // Force flush to ensure Node.js sees it immediately
                         use std::io::Write;
                         let _ = std::io::stdout().flush();
+                        // Optional: Add small delay to separate lines
+                        sleep(Duration::from_millis(10));
                     }
 
                     successful_downloads.fetch_add(1, Ordering::Relaxed);
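Worth noting: `std::thread::sleep` inside an `async move` block parks the executor thread itself, so the 10 ms pause also stalls any other downloads scheduled on that thread. If a delay is wanted here at all, the non-blocking equivalent is `tokio::time::sleep`; a minimal sketch:

```rust
use std::time::Duration;

#[tokio::main]
async fn main() {
    // Suspends only this task; the executor thread keeps driving
    // other futures while the timer runs.
    tokio::time::sleep(Duration::from_millis(10)).await;
    println!("done");
}
```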
@@ -643,6 +715,8 @@ async fn download_data_streaming(
     // Output final summary as the last line
     let success_count = successful_downloads.load(Ordering::Relaxed);
     let failed_count = failed_downloads.load(Ordering::Relaxed);
+    let filtered_maf_count = filtered_maf_records.load(Ordering::Relaxed);
+    let filtered_cnv_count = filtered_cnv_records.load(Ordering::Relaxed);
 
     let summary = FinalSummary {
         output_type: "summary".to_string(),
@@ -650,6 +724,10 @@ async fn download_data_streaming(
         successful_files: success_count,
         failed_files: failed_count,
         errors: errors.lock().await.clone(),
+        filtered_records: filtered_maf_count + filtered_cnv_count,
+        filtered_maf_records: filtered_maf_count,
+        filtered_cnv_records: filtered_cnv_count,
+        filtered_records_by_case: filtered_records.lock().await.clone(),
     };
 
     // Output final summary - Node.js will know processing is complete when it sees this
@@ -715,11 +793,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     // Set default cnv_options
     let (gain_threshold, loss_threshold, seg_length) = match input_js.cnv_options {
         Some(options) => (options.gain_threshold, options.loss_threshold, options.seg_length),
-        None => (0.3, -0.4,
+        None => (0.3, -0.4, 0), // Default values
     };
 
     // Download data - this will now handle errors gracefully
-    // download_data(case_files, HOST, min_total_depth, min_alt_allele_count, &consequences).await;
     download_data_streaming(
         case_files,
         HOST,