npm - @sjcrh/proteinpaint-rust - Versions diffs - 2.129.6 → 2.132.0 - Mend

@sjcrh/proteinpaint-rust 2.129.6 → 2.132.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/README.md CHANGED

@@ -36,13 +36,18 @@ const out = await run_rust('indel', input_data)
 ## Test
-From the `proteinpaint/server` directory,
+For running the tests written in nodejs, from the `proteinpaint` directory run,
 ```bash
-npx test
-npx tsc
+npm run test:unit --workspace="rust"
 ```
+For running the tests written in native rust, from the `proteinpaint/rust` directory run.
+```bash
+cargo test
+```
 ## Build
 ```bash

package/package.json CHANGED

@@ -1,5 +1,5 @@
 {
-	"version": "2.129.6",
+	"version": "2.132.0",
 	"name": "@sjcrh/proteinpaint-rust",
 	"type": "module",
 	"description": "Rust-based utilities for proteinpaint",

package/src/gdcGRIN2.rs CHANGED

@@ -13,18 +13,19 @@
   Output mutations as JSON array.
   Example of usage:
-    echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000}}' | ./target/release/gdcGRIN2
+    echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":1000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000}}' | ./target/release/gdcGRIN2
 */
 use flate2::read::GzDecoder;
 use futures::StreamExt;
 use memchr::memchr;
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};
 use serde_json;
 use std::collections::HashMap;
 use std::io::{self, Read};
 use std::sync::Arc;
 use std::sync::atomic::{AtomicUsize, Ordering};
+use std::thread::sleep;
 use std::time::Duration;
 use tokio::io::{AsyncReadExt, BufReader};
 use tokio::sync::Mutex;
@@ -54,6 +55,8 @@ struct MafOptions {
     min_total_depth: i32,
     #[serde(rename = "minAltAlleleCount")]
     min_alt_allele_count: i32,
+    #[serde(rename = "hyperMutator")]
+    hyper_mutator: i32,
     consequences: Option<Vec<String>>, // Optional list of consequences to filter MAF files
 }
@@ -78,6 +81,31 @@ struct SuccessfulFileOutput {
     data: Vec<Vec<String>>,
 }
+// struct for MAF filter details
+#[derive(Clone, Serialize, Default)]
+struct FilteredMafDetails {
+    matched_consequences: HashMap<String, usize>,
+    rejected_consequences: HashMap<String, usize>,
+    t_alt_count: usize,
+    t_depth: usize,
+    invalid_rows: usize,
+}
+// struct for CNV filter details
+#[derive(Clone, Serialize, Default)]
+struct FilteredCnvDetails {
+    segment_mean: usize,
+    seg_length: usize,
+    invalid_rows: usize,
+}
+// struct for per-case filter details
+#[derive(Clone, Serialize)]
+struct FilteredCaseDetails {
+    maf: FilteredMafDetails,
+    cnv: FilteredCnvDetails,
+}
 // Final summary output (JSONL format)
 #[derive(serde::Serialize)]
 struct FinalSummary {
@@ -87,6 +115,13 @@ struct FinalSummary {
     successful_files: usize,
     failed_files: usize,
     errors: Vec<ErrorEntry>,
+    filtered_records: usize,
+    filtered_maf_records: usize,
+    filtered_cnv_records: usize,
+    included_maf_records: usize,
+    included_cnv_records: usize,
+    filtered_records_by_case: HashMap<String, FilteredCaseDetails>,
+    hyper_mutator_records: Vec<String>,
 }
 // Define the top-level input structure
@@ -107,39 +142,25 @@ struct DataTypeConfig {
     output_columns: Vec<&'static str>,
 }
-// Function to check if CNV file has Segment_Mean column
-fn has_segment_mean_column(content: &str) -> bool {
-    for line in content.lines() {
-        // Check if this line contains Segment_Mean (likely the header)
-        if line.contains("Segment_Mean") {
-            return true;
-        }
-        // Stop checking after a few non-comment lines to avoid parsing entire file
-        if !line.trim().is_empty() {
-            break;
-        }
-    }
-    false
-}
 // Function to parse TSV content
-// Updated parse_content function with better consequence filtering
-fn parse_content(
+async fn parse_content(
     content: &str,
     case_id: &str,
     data_type: &str,
     min_total_depth: i32,
     min_alt_allele_count: i32,
+    hyper_mutator: i32,
     consequences: &Option<Vec<String>>,
     gain_threshold: f32,
     loss_threshold: f32,
     seg_length: i32,
+    filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
+    filtered_maf_records: &AtomicUsize,
+    filtered_cnv_records: &AtomicUsize,
+    included_maf_records: &AtomicUsize,
+    included_cnv_records: &AtomicUsize,
+    hyper_mutator_records: &Arc<Mutex<Vec<String>>>,
 ) -> Result<Vec<Vec<String>>, (String, String, String)> {
-    // Early filter for CNV files - only process files with Segment_Mean
-    if data_type == "cnv" && !has_segment_mean_column(content) {
-        return Ok(Vec::new()); // Return empty result, no error
-    }
     let config = match data_type {
         "cnv" => DataTypeConfig {
             header_marker: "Segment_Mean",
@@ -158,6 +179,18 @@ fn parse_content(
         }
     };
+    // check hyperMutator for MAF files
+    if data_type == "maf" && hyper_mutator > 0 {
+        let line_count = content.lines().count();
+        if line_count as i32 > hyper_mutator {
+            let mut hyper_records = hyper_mutator_records.lock().await;
+            if !hyper_records.contains(&case_id.to_string()) {
+                hyper_records.push(case_id.to_string());
+            }
+            return Ok(Vec::new());
+        }
+    };
     let lines = content.lines();
     let mut parsed_data = Vec::new();
     let mut columns_indices: Vec<usize> = Vec::new();
@@ -199,7 +232,13 @@ fn parse_content(
             gain_threshold,
             loss_threshold,
             seg_length,
-        )?;
+            filtered_records,
+            filtered_maf_records,
+            filtered_cnv_records,
+            included_maf_records,
+            included_cnv_records,
+        )
+        .await?;
         if let Some(out_lst) = row {
             parsed_data.push(out_lst);
@@ -240,7 +279,7 @@ fn setup_columns(
     }
     if data_type == "maf" {
-        *variant_classification_index = header.iter().position(|x| x == "Variant_Classification");
+        *variant_classification_index = header.iter().position(|x| x == "One_Consequence");
         if variant_classification_index.is_none() {
             return Err((
                 case_id.to_string(),
@@ -254,7 +293,7 @@ fn setup_columns(
 }
 // Process a single row of data
-fn process_row(
+async fn process_row(
     line: &str,
     case_id: &str,
     data_type: &str,
@@ -267,18 +306,88 @@ fn process_row(
     gain_threshold: f32,
     loss_threshold: f32,
     seg_length: i32,
+    filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
+    filtered_maf_records: &AtomicUsize,
+    filtered_cnv_records: &AtomicUsize,
+    included_maf_records: &AtomicUsize,
+    included_cnv_records: &AtomicUsize,
 ) -> Result<Option<Vec<String>>, (String, String, String)> {
     let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
     let mut out_lst = vec![case_id.to_string()];
-    // Check consequence filtering for MAF files
-    if data_type == "maf" && !is_valid_consequence(&cont_lst, variant_classification_index, consequences) {
-        return Ok(None);
+    // Initialize or update case details
+    let mut filtered_map = filtered_records.lock().await;
+    filtered_map
+        .entry(case_id.to_string())
+        .or_insert_with(|| FilteredCaseDetails {
+            maf: FilteredMafDetails::default(),
+            cnv: FilteredCnvDetails::default(),
+        });
+    let case_details = filtered_map.get_mut(case_id).unwrap();
+    // Handle consequence filtering and counting for MAF files
+    if data_type == "maf" {
+        if let Some(var_class_idx) = variant_classification_index {
+            if var_class_idx < cont_lst.len() {
+                let variant_classification = &cont_lst[var_class_idx];
+                if let Some(consequence_filter) = consequences {
+                    if !consequence_filter.is_empty() {
+                        if consequence_filter.contains(variant_classification) {
+                            // Matched consequence
+                            *case_details
+                                .maf
+                                .matched_consequences
+                                .entry(variant_classification.to_string())
+                                .or_insert(0) += 1;
+                        } else {
+                            // Unmatched consequence
+                            *case_details
+                                .maf
+                                .rejected_consequences
+                                .entry(variant_classification.to_string())
+                                .or_insert(0) += 1;
+                            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+                            return Ok(None);
+                        }
+                    } else {
+                        // Empty filter, count as matched
+                        *case_details
+                            .maf
+                            .matched_consequences
+                            .entry(variant_classification.to_string())
+                            .or_insert(0) += 1;
+                    }
+                } else {
+                    // No filter, count as matched
+                    *case_details
+                        .maf
+                        .matched_consequences
+                        .entry(variant_classification.to_string())
+                        .or_insert(0) += 1;
+                }
+            } else {
+                case_details.maf.invalid_rows += 1;
+                filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+                return Ok(None);
+            }
+        } else {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None);
+        }
     }
     // Extract relevant columns
     for &x in columns_indices {
         if x >= cont_lst.len() {
+            if data_type == "maf" {
+                case_details.maf.invalid_rows += 1;
+                filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            } else if data_type == "cnv" {
+                case_details.cnv.invalid_rows += 1;
+                filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
+            }
             return Ok(None); // Invalid row
         }
@@ -286,6 +395,8 @@ fn process_row(
         if data_type == "cnv" && header[x] == "Segment_Mean" {
             element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
             if element.is_empty() {
+                case_details.cnv.segment_mean += 1;
+                filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
                 return Ok(None);
             }
         }
@@ -295,10 +406,14 @@ fn process_row(
     // Additional MAF-specific processing
     if data_type == "maf" {
         if out_lst.len() < 6 {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             return Ok(None); // Not enough columns
         }
         let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -307,6 +422,8 @@ fn process_row(
         })?;
         let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
+            case_details.maf.invalid_rows += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -314,19 +431,31 @@ fn process_row(
             )
         })?;
-        if alle_depth < min_total_depth || alt_count < min_alt_allele_count {
+        if alle_depth < min_total_depth {
+            case_details.maf.t_depth += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
+            return Ok(None);
+        }
+        if alt_count < min_alt_allele_count {
+            case_details.maf.t_alt_count += 1;
+            filtered_maf_records.fetch_add(1, Ordering::Relaxed);
             return Ok(None);
         }
         // Keep case_id, chr, start, end, and add "mutation"
         out_lst = out_lst[0..4].to_vec();
         out_lst.push("mutation".to_string());
+        // Update counters for included MAF records
+        included_maf_records.fetch_add(1, Ordering::Relaxed);
     }
-    // filter cnvs based on segment length. Default: 2000000
+    // filter cnvs based on segment length. Default: 0 (no filtering)
     if data_type == "cnv" {
         // calculate segment length (End_Position - Start_Position)
         let end_position = out_lst[3].parse::<i32>().map_err(|_| {
+            case_details.cnv.invalid_rows += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -335,6 +464,8 @@ fn process_row(
         })?;
         let start_position = out_lst[2].parse::<i32>().map_err(|_| {
+            case_details.cnv.invalid_rows += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
             (
                 case_id.to_string(),
                 data_type.to_string(),
@@ -342,36 +473,17 @@ fn process_row(
             )
         })?;
         let cnv_length = end_position - start_position;
-        if cnv_length > seg_length {
+        if seg_length > 0 && cnv_length > seg_length {
+            case_details.cnv.seg_length += 1;
+            filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
             return Ok(None);
         }
+        included_cnv_records.fetch_add(1, Ordering::Relaxed);
     }
     Ok(Some(out_lst))
 }
-// Check if the row meets consequence filtering criteria
-fn is_valid_consequence(
-    cont_lst: &[String],
-    variant_classification_index: Option<usize>,
-    consequences: &Option<Vec<String>>,
-) -> bool {
-    if let Some(consequence_filter) = consequences {
-        if !consequence_filter.is_empty() {
-            if let Some(var_class_idx) = variant_classification_index {
-                if var_class_idx < cont_lst.len() {
-                    let variant_classification = &cont_lst[var_class_idx];
-                    if let Some(normalized_consequence) = normalize_consequence(variant_classification) {
-                        return consequence_filter.contains(&normalized_consequence);
-                    }
-                }
-                return false; // Invalid row or unknown consequence
-            }
-        }
-    }
-    true // No filtering or empty filter
-}
 // Process Segment_Mean for CNV files
 fn process_segment_mean(
     element: &str,
@@ -398,23 +510,6 @@ fn process_segment_mean(
 }
 /// Updated helper function to normalize MAF consequence types to frontend format
-/// Returns None for unknown consequence types (which will be filtered out)
-fn normalize_consequence(maf_consequence: &str) -> Option<String> {
-    match maf_consequence.to_lowercase().as_str() {
-        // Only map the consequence types we actually support
-        "missense_mutation" => Some("missense".to_string()),
-        "nonsense_mutation" | "stop_gained" | "stop_lost" => Some("nonsense".to_string()),
-        "frame_shift_del" | "frame_shift_ins" | "frameshift_variant" => Some("frameshift".to_string()),
-        "silent" | "synonymous_variant" => Some("silent".to_string()),
-        "in_frame_del" => Some("deletion".to_string()),
-        "in_frame_ins" => Some("insertion".to_string()),
-        "splice_site" | "splice_acceptor_variant" | "splice_donor_variant" => Some("splice_site".to_string()),
-        "tandem_duplication" | "duplication" => Some("duplication".to_string()),
-        "inversion" => Some("inversion".to_string()),
-        // Return None for all unknown consequence types - they will be filtered out
-        _ => None,
-    }
-}
 /// Downloads a single file with minimal retry logic for transient failures
 async fn download_single_file(
     case_id: String,
@@ -517,7 +612,7 @@ async fn download_single_file(
     ))
 }
-/// NEW: Phase 1 streaming download function
+/// Streaming download function
 /// Outputs JSONL format: one JSON object per line
 /// Node.js will read this line-by-line but still wait for completion
 async fn download_data_streaming(
@@ -525,6 +620,7 @@ async fn download_data_streaming(
     host: &str,
     min_total_depth: i32,
     min_alt_allele_count: i32,
+    hyper_mutator: i32,
     consequences: &Option<Vec<String>>,
     gain_threshold: f32,
     loss_threshold: f32,
@@ -549,6 +645,12 @@ async fn download_data_streaming(
     // Counters for final summary
     let successful_downloads = Arc::new(AtomicUsize::new(0));
     let failed_downloads = Arc::new(AtomicUsize::new(0));
+    let filtered_maf_records = Arc::new(AtomicUsize::new(0));
+    let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
+    let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
+    let hyper_mutator_records = Arc::new(Mutex::new(Vec::<String>::new()));
+    let included_maf_records = Arc::new(AtomicUsize::new(0));
+    let included_cnv_records = Arc::new(AtomicUsize::new(0));
     // Only collect errors (successful data is output immediately)
     let errors = Arc::new(Mutex::new(Vec::<ErrorEntry>::new()));
@@ -565,6 +667,12 @@ async fn download_data_streaming(
         .for_each(|download_result| {
             let successful_downloads = Arc::clone(&successful_downloads);
             let failed_downloads = Arc::clone(&failed_downloads);
+            let filtered_maf_records = Arc::clone(&filtered_maf_records);
+            let filtered_cnv_records = Arc::clone(&filtered_cnv_records);
+            let filtered_records = Arc::clone(&filtered_records);
+            let included_maf_records = Arc::clone(&included_maf_records);
+            let included_cnv_records = Arc::clone(&included_cnv_records);
+            let hyper_mutator_records = Arc::clone(&hyper_mutator_records);
             let errors = Arc::clone(&errors);
             async move {
@@ -577,11 +685,20 @@ async fn download_data_streaming(
                             &data_type,
                             min_total_depth,
                             min_alt_allele_count,
+                            hyper_mutator,
                             &consequences,
                             gain_threshold,
                             loss_threshold,
                             seg_length,
-                        ) {
+                            &filtered_records,
+                            &filtered_maf_records,
+                            &filtered_cnv_records,
+                            &included_maf_records,
+                            &included_cnv_records,
+                            &hyper_mutator_records,
+                        )
+                        .await
+                        {
                             Ok(parsed_data) => {
                                 // SUCCESS: Output immediately as JSONL
                                 let success_output = SuccessfulFileOutput {
@@ -597,6 +714,8 @@ async fn download_data_streaming(
                                     // Force flush to ensure Node.js sees it immediately
                                     use std::io::Write;
                                     let _ = std::io::stdout().flush();
+                                    // Optional: Add small delay to separate lines
+                                    sleep(Duration::from_millis(10));
                                 }
                                 successful_downloads.fetch_add(1, Ordering::Relaxed);
@@ -643,6 +762,10 @@ async fn download_data_streaming(
     // Output final summary as the last line
     let success_count = successful_downloads.load(Ordering::Relaxed);
     let failed_count = failed_downloads.load(Ordering::Relaxed);
+    let filtered_maf_count = filtered_maf_records.load(Ordering::Relaxed);
+    let filtered_cnv_count = filtered_cnv_records.load(Ordering::Relaxed);
+    let included_maf_count = included_maf_records.load(Ordering::Relaxed);
+    let included_cnv_count = included_cnv_records.load(Ordering::Relaxed);
     let summary = FinalSummary {
         output_type: "summary".to_string(),
@@ -650,6 +773,13 @@ async fn download_data_streaming(
         successful_files: success_count,
         failed_files: failed_count,
         errors: errors.lock().await.clone(),
+        filtered_records: filtered_maf_count + filtered_cnv_count,
+        filtered_maf_records: filtered_maf_count,
+        filtered_cnv_records: filtered_cnv_count,
+        filtered_records_by_case: filtered_records.lock().await.clone(),
+        included_maf_records: included_maf_count,
+        included_cnv_records: included_cnv_count,
+        hyper_mutator_records: hyper_mutator_records.lock().await.clone(),
     };
     // Output final summary - Node.js will know processing is complete when it sees this
@@ -703,28 +833,29 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let case_files = input_js.case_files;
     // Set default maf_options
-    let (min_total_depth, min_alt_allele_count, consequences) = match input_js.maf_options {
+    let (min_total_depth, min_alt_allele_count, hyper_mutator, consequences) = match input_js.maf_options {
         Some(options) => (
             options.min_total_depth,
             options.min_alt_allele_count,
+            options.hyper_mutator,
             options.consequences.clone(),
         ),
-        None => (10, 2, None), // Default values
+        None => (10, 2, 8000, None), // Default values
     };
     // Set default cnv_options
     let (gain_threshold, loss_threshold, seg_length) = match input_js.cnv_options {
         Some(options) => (options.gain_threshold, options.loss_threshold, options.seg_length),
-        None => (0.3, -0.4, 2000000), // Default values
+        None => (0.3, -0.4, 0), // Default values
     };
     // Download data - this will now handle errors gracefully
-    // download_data(case_files, HOST, min_total_depth, min_alt_allele_count, &consequences).await;
     download_data_streaming(
         case_files,
         HOST,
         min_total_depth,
         min_alt_allele_count,
+        hyper_mutator,
         &consequences,
         gain_threshold,
         loss_threshold,