npm - @sjcrh/proteinpaint-rust - Versions diffs - 2.38.1 → 2.39.0 - Mend

@sjcrh/proteinpaint-rust 2.38.1 → 2.39.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/Cargo.toml +4 -4
package/package.json +1 -1
package/src/gdcmaf.rs +26 -28
package/src/{gene_variance.rs → topGeneByExpressionVariance.rs} +50 -76

package/Cargo.toml CHANGED Viewed

@@ -11,6 +11,7 @@ autobins = false
 [dependencies]
 kodama = "0.3"
 rayon = "1.7.0"
+bgzip = "0.3.1"
 petgraph = "0.6.3"
 ndarray = "0.15.6"
 nalgebra = {version = "0.32.2", features = ["serde-serialize"]}
@@ -26,7 +27,7 @@ serde = {version = "^1.0.147", features = ["derive"]}
 serde_json="^1.0.88"
 num = "^0.4.1"
 csv = "^1.2.2"
-r_mathlib="^0.2.0" # Uncomment this line to activate DE expression app for high sample sizes
+r_mathlib="^0.2.0"
 tokio = { version="1", features = ["full"] }
 reqwest = "0.11"
 flate2 = "1"
@@ -73,14 +74,13 @@ name="gdcmaf"
 path="src/gdcmaf.rs"
 [[bin]]
-name="gene_variance"
-path="src/gene_variance.rs"
+name="topGeneByExpressionVariance"
+path="src/topGeneByExpressionVariance.rs"
 #[[bin]]
 #name="wilcoxon"
 #path="src/wilcoxon.rs"
-# Uncomment the lines below to use DE app for higher sample sizes
 [[bin]]
 name="DEanalysis"
 path="src/DEanalysis.rs"

package/package.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "version": "2.38.1",
+  "version": "2.39.0",
   "name": "@sjcrh/proteinpaint-rust",
   "description": "Rust-based utilities for proteinpaint",
   "main": "index.js",

package/src/gdcmaf.rs CHANGED Viewed

@@ -7,7 +7,7 @@
 	Output gzip compressed maf file to stdout.
 	Example of usage:
-		echo '{"host": "https://api.gdc.cancer.gov/data/", "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
+		echo '{"host": "https://api.gdc.cancer.gov/data/","columns": ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome", "Start_Position"], "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
 */
 use flate2::read::GzDecoder;
@@ -20,7 +20,7 @@ use std::io::{self,Read,Write};
-fn select_maf_col(d:String) -> Vec<u8> {
+fn select_maf_col(d:String,columns:&Vec<String>) -> Vec<u8> {
     let mut maf_str: String = String::new();
     let mut header_indices: Vec<usize> = Vec::new();
     let lines = d.trim_end().split("\n");
@@ -29,9 +29,12 @@ fn select_maf_col(d:String) -> Vec<u8> {
             continue
         } else if line.contains("Hugo_Symbol") {
             let header: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
-            for col in MAF_COL {
-                let col_index: usize = header.iter().position(|x| x == col).unwrap();
-                header_indices.push(col_index);
+            for col in columns {
+                if let Some(index) = header.iter().position(|x| x == col) {
+                    header_indices.push(index);
+                } else {
+                    panic!("{} was not found!",col);
+                }
             }
         } else {
             let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
@@ -47,27 +50,6 @@ fn select_maf_col(d:String) -> Vec<u8> {
 }
-// GDC MAF columns (96)
-const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
-                            "Start_Position", "End_Position", "Strand", "Variant_Classification",
-                            "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
-                            "dbSNP_RS", "dbSNP_Val_Status", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode",
-                            "Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2", "Tumor_Validation_Allele1",
-                            "Tumor_Validation_Allele2", "Match_Norm_Validation_Allele1", "Match_Norm_Validation_Allele2",
-                            "Verification_Status", "Validation_Status", "Mutation_Status", "Sequencing_Phase",
-                            "Sequence_Source", "Validation_Method", "Score", "BAM_File", "Sequencer",
-                            "Tumor_Sample_UUID", "Matched_Norm_Sample_UUID", "HGVSc", "HGVSp", "HGVSp_Short",
-                            "Transcript_ID", "Exon_Number", "t_depth", "t_ref_count", "t_alt_count", "n_depth",
-                            "n_ref_count", "n_alt_count", "all_effects", "Allele", "Gene", "Feature", "Feature_type",
-                            "One_Consequence", "Consequence", "cDNA_position", "CDS_position", "Protein_position",
-                            "Amino_acids", "Codons", "Existing_variation", "DISTANCE", "TRANSCRIPT_STRAND", "SYMBOL",
-                            "SYMBOL_SOURCE", "HGNC_ID", "BIOTYPE", "CANONICAL", "CCDS", "ENSP", "SWISSPROT", "TREMBL",
-                            "UNIPARC", "RefSeq", "SIFT", "PolyPhen", "EXON", "INTRON", "DOMAINS", "CLIN_SIG", "SOMATIC",
-                            "PUBMED", "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "IMPACT", "PICK",
-                            "VARIANT_CLASS", "TSL", "HGVS_OFFSET", "PHENO", "GENE_PHENO", "CONTEXT", "tumor_bam_uuid",
-                            "normal_bam_uuid", "case_id", "GDC_FILTER", "COSMIC"];
 #[tokio::main]
 async fn main() -> Result<(),Box<dyn std::error::Error>> {
     // Accepting the piped input json from nodejs and assign to the variable
@@ -83,6 +65,22 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
         url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
     };
+    // read columns as array from input json and convert data type from Vec<Value> to Vec<String>
+    let maf_col:Vec<String>;
+    if let Some(maf_col_value) = file_id_lst_js.get("columns") {
+        //convert Vec<Value> to Vec<String>
+        if let Some(maf_col_array) = maf_col_value.as_array() {
+            maf_col = maf_col_array
+                .iter()
+                .map(|v| v.to_string().replace("\"",""))
+                .collect::<Vec<String>>();
+        } else {
+            panic!("Columns is not an array");
+        }
+    } else {
+        panic!("Columns was not selected");
+    };
     //downloading maf files parallelly and merge them into single maf file
     let download_futures = futures::stream::iter(
         url.into_iter().map(|url|{
@@ -110,13 +108,13 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
     // output
     let mut encoder = GzEncoder::new(io::stdout(), Compression::default());
-    let _ = encoder.write_all(&MAF_COL.join("\t").as_bytes().to_vec()).expect("Failed to write header");
+    let _ = encoder.write_all(&maf_col.join("\t").as_bytes().to_vec()).expect("Failed to write header");
     let _ = encoder.write_all(b"\n").expect("Failed to write newline");
     download_futures.buffer_unordered(20).for_each(|item| {
         if item.starts_with("Failed") {
             eprintln!("{}",item);
         } else {
-            let maf_bit = select_maf_col(item);
+            let maf_bit = select_maf_col(item,&maf_col);
             let _ = encoder.write_all(&maf_bit).expect("Failed to write file");
         };
         async {}

package/src/{gene_variance.rs → topGeneByExpressionVariance.rs} RENAMED Viewed

@@ -11,6 +11,7 @@ Various JSON parameters:
  Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"num_genes":100, "param":"var"}' && time echo $json | target/release/gene_variance
 */
 #![allow(non_snake_case)]
+use bgzip::BGZFReader;
 use json;
 use nalgebra::base::dimension::Dyn;
 use nalgebra::base::Matrix;
@@ -23,8 +24,9 @@ use statrs::statistics::Median;
 use statrs::statistics::OrderStatistics;
 use statrs::statistics::Statistics;
 use std::cmp::Ordering;
+use std::fs;
 use std::io;
-use std::path::Path;
+use std::io::Read;
 use std::str::FromStr;
 use std::time::Instant;
@@ -34,100 +36,78 @@ fn input_data(
 ) -> (
     Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
     Vec<String>,
-    Vec<String>,
 ) {
     // Build the CSV reader and iterate over each record.
-    let path = Path::new(filename);
-    let mut rdr = csv::Reader::from_path(path).unwrap();
+    let mut reader = BGZFReader::new(fs::File::open(filename).unwrap()).unwrap();
     let mut num_lines: usize = 0;
-    let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
-    let mut gene_names: Vec<String> = Vec::with_capacity(65000);
-    let mut gene_symbols: Vec<String> = Vec::with_capacity(65000);
-    let mut num_columns: usize = 0;
+    let mut gene_symbols: Vec<String> = Vec::with_capacity(500);
-    // Check headers for samples
-    let header_line = rdr.headers().unwrap();
-    let mut headers: Vec<&str> = Vec::with_capacity(1500);
-    for field in header_line.iter() {
-        headers = field.split('\t').collect::<Vec<&str>>();
-    }
-    //println!("headers:{:?}", headers);
-    let mut sample_indexes_original: Vec<usize> = Vec::with_capacity(sample_list.len());
-    let gene_name_index = headers.iter().position(|r| r == &"geneID");
-    let gene_symbol_index = headers.iter().position(|r| r == &"geneSymbol");
-    //let mut case_samples_not_found: Vec<&str> = Vec::with_capacity(sample_list.len());
-    //let mut control_samples_not_found: Vec<&str> = Vec::with_capacity(control_list.len());
+    let mut buffer = String::new();
+    reader.read_to_string(&mut buffer).unwrap();
-    for item in sample_list {
-        //println!("item:{}", item);
-        let index = headers.iter().position(|r| r == item);
-        match index {
-            Some(n) => sample_indexes_original.push(n),
-            None => {
-                //panic!("Case sample not found:{}", item);
-                //case_samples_not_found.push(item);
+    let lines = buffer.split("\n");
+    let mut first = true;
+    let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
+    let mut column_numbers: Vec<usize> = Vec::with_capacity(300);
+    for line in lines {
+        if first == true {
+            first = false;
+            let columns: Vec<&str> = line.split("\t").collect();
+            // Finding column numbers corresponding to each sample given in the input list
+            for item in sample_list {
+                if let Some(index) = columns.iter().position(|num| num == item) {
+                    column_numbers.push(index)
+                } else {
+                    panic!("Sample {} not found:", item)
+                }
             }
-        }
-    }
-    //println!("case_indexes_original:{:?}", case_indexes_original);
-    let mut samples_indexes: Vec<usize> = Vec::with_capacity(sample_list.len());
-    for result in rdr.records() {
-        // The iterator yields Result<StringRecord, Error>, so we check the
-        // error here.
-        let record = result.unwrap();
-        //println!("record:{:?}", record);
-        let mut index = 0;
-        for field in record[0].split('\t').collect::<Vec<&str>>() {
-            if index == gene_name_index.unwrap() {
-                gene_names.push(field.to_string());
-            } else if index == gene_symbol_index.unwrap() {
-                gene_symbols.push(field.to_string());
-            } else if sample_indexes_original.contains(&index) {
-                let num = FromStr::from_str(field);
-                match num {
-                    Ok(n) => {
-                        //println!("n:{}", n);
-                        input_vector.push(n);
-                        if num_lines == 0 {
-                            samples_indexes.push(num_columns);
-                            num_columns += 1;
+        } else {
+            let line2: Vec<&str> = line.split("\t").collect();
+            if line2.len() == 1 {
+                break; // end of file
+            } else {
+                num_lines += 1;
+                //println!("line2:{:?}", line2);
+                gene_symbols.push(line2[3].to_string());
+                for i in &column_numbers {
+                    let field = line2[*i];
+                    let num = FromStr::from_str(field);
+                    match num {
+                        Ok(n) => {
+                            //println!("n:{}", n);
+                            input_vector.push(n);
+                        }
+                        Err(_n) => {
+                            panic!(
+                                "Number {} in line {} and column {} is not a decimal number",
+                                field,
+                                num_lines + 1,
+                                i + 1
+                            );
                         }
-                    }
-                    Err(_n) => {
-                        panic!(
-                            "Number {} in line {} and column {} is not a decimal number",
-                            field,
-                            num_lines + 1,
-                            index + 1
-                        );
                     }
                 }
             }
-            index += 1;
         }
-        num_lines += 1;
     }
     //println!("case_indexes:{:?}", case_indexes);
     //println!("control_indexes:{:?}", control_indexes);
-    let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
+    let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
     //println!("dm:{:?}", dm);
-    (dm, gene_names, gene_symbols)
+    (dm, gene_symbols)
 }
 #[allow(dead_code)]
 #[derive(Debug, Serialize, Deserialize)]
 struct GeneInfo {
-    gene_name: String,
     gene_symbol: String,
     param: f64,
 }
 fn calculate_variance(
     input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
-    gene_names: Vec<String>,
     gene_symbols: Vec<String>,
     mut min_sample_size: f64,
     filter_extreme_values: bool,
@@ -196,13 +176,11 @@ fn calculate_variance(
             {
                 gene_infos.push(GeneInfo {
                     param: gene_counts.variance(),
-                    gene_name: gene_names[row].clone(),
                     gene_symbol: gene_symbols[row].clone(),
                 });
             } else if filter_extreme_values == false {
                 gene_infos.push(GeneInfo {
                     param: gene_counts.variance(),
-                    gene_name: gene_names[row].clone(),
                     gene_symbol: gene_symbols[row].clone(),
                 });
             }
@@ -216,13 +194,11 @@ fn calculate_variance(
             {
                 gene_infos.push(GeneInfo {
                     param: gene_counts_data.interquartile_range(),
-                    gene_name: gene_names[row].clone(),
                     gene_symbol: gene_symbols[row].clone(),
                 });
             } else if filter_extreme_values == false {
                 gene_infos.push(GeneInfo {
                     param: gene_counts_data.interquartile_range(),
-                    gene_name: gene_names[row].clone(),
                     gene_symbol: gene_symbols[row].clone(),
                 });
             }
@@ -321,11 +297,9 @@ fn main() {
                     }
                     let samples_list: Vec<&str> = samples_string.split(",").collect();
-                    let (input_matrix, gene_names, gene_symbols) =
-                        input_data(&file_name, &samples_list);
+                    let (input_matrix, gene_symbols) = input_data(&file_name, &samples_list);
                     let gene_infos = calculate_variance(
                         input_matrix,
-                        gene_names,
                         gene_symbols,
                         samples_list.len() as f64,
                         filter_extreme_values,
@@ -343,7 +317,7 @@ fn main() {
                         }
                     }
                     output_string += &"]".to_string();
-                    println!("{}", output_string);
+                    println!("output_json:{}", output_string);
                     println!("Time for calculating variances:{:?}", now.elapsed());
                 }
                 Err(error) => println!("Incorrect json: {}", error),