@sjcrh/proteinpaint-rust 2.129.6 → 2.130.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -36,13 +36,18 @@ const out = await run_rust('indel', input_data)
 
 ## Test
 
-From the `proteinpaint/server` directory,
+To run the tests written in Node.js, from the `proteinpaint` directory run:
 
 ```bash
-npx test
-npx tsc
+npm run test:unit --workspace="rust"
 ```
 
+To run the tests written in native Rust, from the `proteinpaint/rust` directory run:
+```bash
+cargo test
+```
+
+
 ## Build
 
 ```bash
package/package.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "version": "2.129.6",
+  "version": "2.130.0",
   "name": "@sjcrh/proteinpaint-rust",
   "type": "module",
   "description": "Rust-based utilities for proteinpaint",
package/src/gdcGRIN2.rs CHANGED
@@ -19,12 +19,13 @@
 use flate2::read::GzDecoder;
 use futures::StreamExt;
 use memchr::memchr;
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};
 use serde_json;
 use std::collections::HashMap;
 use std::io::{self, Read};
 use std::sync::Arc;
 use std::sync::atomic::{AtomicUsize, Ordering};
+use std::thread::sleep;
 use std::time::Duration;
 use tokio::io::{AsyncReadExt, BufReader};
 use tokio::sync::Mutex;
@@ -78,6 +79,30 @@ struct SuccessfulFileOutput {
     data: Vec<Vec<String>>,
 }
 
+// struct for MAF filter details
+#[derive(Clone, Serialize, Default)]
+struct FilteredMafDetails {
+    invalid_consequences: usize,
+    t_alt_count: usize,
+    t_depth: usize,
+    invalid_rows: usize,
+}
+
+// struct for CNV filter details
+#[derive(Clone, Serialize, Default)]
+struct FilteredCnvDetails {
+    segment_mean: usize,
+    seg_length: usize,
+    invalid_rows: usize,
+}
+
+// struct for per-case filter details
+#[derive(Clone, Serialize)]
+struct FilteredCaseDetails {
+    maf: FilteredMafDetails,
+    cnv: FilteredCnvDetails,
+}
+
 // Final summary output (JSONL format)
 #[derive(serde::Serialize)]
 struct FinalSummary {
@@ -87,6 +112,10 @@ struct FinalSummary {
     successful_files: usize,
     failed_files: usize,
     errors: Vec<ErrorEntry>,
+    filtered_records: usize,
+    filtered_maf_records: usize,
+    filtered_cnv_records: usize,
+    filtered_records_by_case: HashMap<String, FilteredCaseDetails>,
 }
 
 // Define the top-level input structure
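With these four additions, the summary line that ends the JSONL stream now carries aggregate and per-case filter tallies. For illustration only, a summary line could look roughly like this (values invented; fields of `FinalSummary` not shown in this hunk are omitted):

```json
{"output_type": "summary", "successful_files": 42, "failed_files": 1, "errors": [], "filtered_records": 17, "filtered_maf_records": 12, "filtered_cnv_records": 5, "filtered_records_by_case": {"CASE-A1": {"maf": {"invalid_consequences": 6, "t_alt_count": 3, "t_depth": 2, "invalid_rows": 1}, "cnv": {"segment_mean": 4, "seg_length": 1, "invalid_rows": 0}}}}
```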
@@ -107,24 +136,8 @@ struct DataTypeConfig {
     output_columns: Vec<&'static str>,
 }
 
-// Function to check if CNV file has Segment_Mean column
-fn has_segment_mean_column(content: &str) -> bool {
-    for line in content.lines() {
-        // Check if this line contains Segment_Mean (likely the header)
-        if line.contains("Segment_Mean") {
-            return true;
-        }
-        // Stop checking after a few non-comment lines to avoid parsing entire file
-        if !line.trim().is_empty() {
-            break;
-        }
-    }
-    false
-}
-
 // Function to parse TSV content
-// Updated parse_content function with better consequence filtering
-fn parse_content(
+async fn parse_content(
     content: &str,
     case_id: &str,
     data_type: &str,
@@ -134,12 +147,10 @@ fn parse_content(
     gain_threshold: f32,
     loss_threshold: f32,
     seg_length: i32,
+    filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
+    filtered_maf_records: &AtomicUsize,
+    filtered_cnv_records: &AtomicUsize,
 ) -> Result<Vec<Vec<String>>, (String, String, String)> {
-    // Early filter for CNV files - only process files with Segment_Mean
-    if data_type == "cnv" && !has_segment_mean_column(content) {
-        return Ok(Vec::new()); // Return empty result, no error
-    }
-
     let config = match data_type {
         "cnv" => DataTypeConfig {
             header_marker: "Segment_Mean",
@@ -199,7 +210,11 @@ fn parse_content(
             gain_threshold,
             loss_threshold,
             seg_length,
-        )?;
+            filtered_records,
+            filtered_maf_records,
+            filtered_cnv_records,
+        )
+        .await?;
 
         if let Some(out_lst) = row {
             parsed_data.push(out_lst);
@@ -254,7 +269,7 @@ fn setup_columns(
 }
 
 // Process a single row of data
-fn process_row(
+async fn process_row(
     line: &str,
     case_id: &str,
     data_type: &str,
@@ -267,18 +282,41 @@ fn process_row(
     gain_threshold: f32,
     loss_threshold: f32,
     seg_length: i32,
+    filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
+    filtered_maf_records: &AtomicUsize,
+    filtered_cnv_records: &AtomicUsize,
 ) -> Result<Option<Vec<String>>, (String, String, String)> {
     let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
     let mut out_lst = vec![case_id.to_string()];
 
+    // Initialize or update case details
+    let mut filtered_map = filtered_records.lock().await;
+    filtered_map
+        .entry(case_id.to_string())
+        .or_insert_with(|| FilteredCaseDetails {
+            maf: FilteredMafDetails::default(),
+            cnv: FilteredCnvDetails::default(),
+        });
+
+    let case_details = filtered_map.get_mut(case_id).unwrap();
+
     // Check consequence filtering for MAF files
     if data_type == "maf" && !is_valid_consequence(&cont_lst, variant_classification_index, consequences) {
+        case_details.maf.invalid_consequences += 1;
+        filtered_maf_records.fetch_add(1, Ordering::Relaxed);
         return Ok(None);
     }
 
     // Extract relevant columns
     for &x in columns_indices {
         if x >= cont_lst.len() {
+            if data_type == "maf" {
+                case_details.maf.invalid_rows += 1;
+                filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            } else if data_type == "cnv" {
+                case_details.cnv.invalid_rows += 1;
+                filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+            }
            return Ok(None); // Invalid row
         }
 
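`process_row` now takes the shared tally structures and awaits tokio's async `Mutex` before updating them. As a minimal, self-contained sketch of that `Arc<Mutex<HashMap>>` counting pattern (illustrative names, not code from the package; assumes `tokio = { version = "1", features = ["full"] }`):

```rust
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::Mutex; // async mutex: lock() is awaited rather than blocking

#[tokio::main]
async fn main() {
    // Shared per-key counters, analogous to filtered_records_by_case above
    let counts: Arc<Mutex<HashMap<String, usize>>> = Arc::new(Mutex::new(HashMap::new()));

    let mut guard = counts.lock().await;
    // entry() inserts a zero the first time a key is seen, then increments it
    *guard.entry("CASE-A1".to_string()).or_insert(0) += 1;
    drop(guard); // release before any further .await to keep the critical section short

    println!("{:?}", counts.lock().await); // {"CASE-A1": 1}
}
```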
@@ -286,6 +324,8 @@ fn process_row(
         if data_type == "cnv" && header[x] == "Segment_Mean" {
             element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
             if element.is_empty() {
+                case_details.cnv.segment_mean += 1;
+                filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
                 return Ok(None);
             }
         }
@@ -295,10 +335,14 @@ fn process_row(
     // Additional MAF-specific processing
     if data_type == "maf" {
         if out_lst.len() < 6 {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             return Ok(None); // Not enough columns
         }
 
         let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -307,6 +351,8 @@ fn process_row(
         })?;
 
         let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -314,7 +360,14 @@ fn process_row(
             )
         })?;
 
-        if alle_depth < min_total_depth || alt_count < min_alt_allele_count {
+        if alle_depth < min_total_depth {
+            case_details.maf.t_depth += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None);
+        }
+        if alt_count < min_alt_allele_count {
+            case_details.maf.t_alt_count += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             return Ok(None);
         }
 
@@ -323,10 +376,12 @@ fn process_row(
         out_lst.push("mutation".to_string());
     }
 
-    // filter cnvs based on segment length. Default: 2000000
+    // filter cnvs based on segment length. Default: 0 (no filtering)
     if data_type == "cnv" {
         // calculate segment length (End_Position - Start_Position)
         let end_position = out_lst[3].parse::<i32>().map_err(|_| {
+            case_details.cnv.invalid_rows += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -335,6 +390,8 @@ fn process_row(
         })?;
 
         let start_position = out_lst[2].parse::<i32>().map_err(|_| {
+            case_details.cnv.invalid_rows += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -342,7 +399,9 @@ fn process_row(
             )
         })?;
         let cnv_length = end_position - start_position;
-        if cnv_length > seg_length {
+        if seg_length > 0 && cnv_length > seg_length {
+            case_details.cnv.seg_length += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
             return Ok(None);
         }
     }
@@ -517,7 +576,7 @@ async fn download_single_file(
     ))
 }
 
-/// NEW: Phase 1 streaming download function
+/// Streaming download function
 /// Outputs JSONL format: one JSON object per line
 /// Node.js will read this line-by-line but still wait for completion
 async fn download_data_streaming(
@@ -549,6 +608,9 @@ async fn download_data_streaming(
     // Counters for final summary
     let successful_downloads = Arc::new(AtomicUsize::new(0));
     let failed_downloads = Arc::new(AtomicUsize::new(0));
+    let filtered_maf_records = Arc::new(AtomicUsize::new(0));
+    let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
+    let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
 
     // Only collect errors (successful data is output immediately)
     let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
@@ -565,6 +627,9 @@ async fn download_data_streaming(
         .for_each(|download_result| {
             let successful_downloads = Arc::clone(&successful_downloads);
             let failed_downloads = Arc::clone(&failed_downloads);
+            let filtered_maf_records = Arc::clone(&filtered_maf_records);
+            let filtered_cnv_records = Arc::clone(&filtered_cnv_records);
+            let filtered_records = Arc::clone(&filtered_records);
             let errors = Arc::clone(&errors);
 
             async move {
@@ -581,7 +646,12 @@ async fn download_data_streaming(
                         gain_threshold,
                         loss_threshold,
                         seg_length,
-                    ) {
+                        &filtered_records,
+                        &filtered_maf_records,
+                        &filtered_cnv_records,
+                    )
+                    .await
+                    {
                         Ok(parsed_data) => {
                             // SUCCESS: Output immediately as JSONL
                             let success_output = SuccessfulFileOutput {
@@ -597,6 +667,8 @@ async fn download_data_streaming(
                             // Force flush to ensure Node.js sees it immediately
                             use std::io::Write;
                             let _ = std::io::stdout().flush();
+                            // Optional: Add small delay to separate lines
+                            sleep(Duration::from_millis(10));
                         }
 
                         successful_downloads.fetch_add(1, Ordering::Relaxed);
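One caveat on this hunk: the `sleep` called here is `std::thread::sleep` (imported at the top of the file), which blocks the tokio worker thread for the full 10 ms instead of yielding to the scheduler. If the pause is kept, the non-blocking equivalent would be the async timer sketched below; this is a suggestion, not what 2.130.0 ships:

```rust
// Non-blocking pause: yields the worker thread back to the tokio scheduler
tokio::time::sleep(Duration::from_millis(10)).await;
```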
@@ -643,6 +715,8 @@ async fn download_data_streaming(
     // Output final summary as the last line
     let success_count = successful_downloads.load(Ordering::Relaxed);
     let failed_count = failed_downloads.load(Ordering::Relaxed);
+    let filtered_maf_count = filtered_maf_records.load(Ordering::Relaxed);
+    let filtered_cnv_count = filtered_cnv_records.load(Ordering::Relaxed);
 
     let summary = FinalSummary {
         output_type: "summary".to_string(),
@@ -650,6 +724,10 @@ async fn download_data_streaming(
         successful_files: success_count,
         failed_files: failed_count,
         errors: errors.lock().await.clone(),
+        filtered_records: filtered_maf_count + filtered_cnv_count,
+        filtered_maf_records: filtered_maf_count,
+        filtered_cnv_records: filtered_cnv_count,
+        filtered_records_by_case: filtered_records.lock().await.clone(),
     };
 
     // Output final summary - Node.js will know processing is complete when it sees this
@@ -715,11 +793,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     // Set default cnv_options
     let (gain_threshold, loss_threshold, seg_length) = match input_js.cnv_options {
         Some(options) => (options.gain_threshold, options.loss_threshold, options.seg_length),
-        None => (0.3, -0.4, 2000000), // Default values
+        None => (0.3, -0.4, 0), // Default values
     };
 
     // Download data - this will now handle errors gracefully
-    // download_data(case_files, HOST, min_total_depth, min_alt_allele_count, &consequences).await;
     download_data_streaming(
         case_files,
         HOST,