@sjcrh/proteinpaint-rust 2.38.0 → 2.39.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -11,6 +11,7 @@ autobins = false
11
11
  [dependencies]
12
12
  kodama = "0.3"
13
13
  rayon = "1.7.0"
14
+ bgzip = "0.3.1"
14
15
  petgraph = "0.6.3"
15
16
  ndarray = "0.15.6"
16
17
  nalgebra = {version = "0.32.2", features = ["serde-serialize"]}
@@ -26,7 +27,7 @@ serde = {version = "^1.0.147", features = ["derive"]}
26
27
  serde_json="^1.0.88"
27
28
  num = "^0.4.1"
28
29
  csv = "^1.2.2"
29
- r_mathlib="^0.2.0" # Uncomment this line to activate DE expression app for high sample sizes
30
+ r_mathlib="^0.2.0"
30
31
  tokio = { version="1", features = ["full"] }
31
32
  reqwest = "0.11"
32
33
  flate2 = "1"
@@ -73,14 +74,13 @@ name="gdcmaf"
73
74
  path="src/gdcmaf.rs"
74
75
 
75
76
  [[bin]]
76
- name="gene_variance"
77
- path="src/gene_variance.rs"
77
+ name="topGeneByExpressionVariance"
78
+ path="src/topGeneByExpressionVariance.rs"
78
79
 
79
80
  #[[bin]]
80
81
  #name="wilcoxon"
81
82
  #path="src/wilcoxon.rs"
82
83
 
83
- # Uncomment the lines below to use DE app for higher sample sizes
84
84
  [[bin]]
85
85
  name="DEanalysis"
86
86
  path="src/DEanalysis.rs"
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.38.0",
2
+ "version": "2.39.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
package/src/gdcmaf.rs CHANGED
@@ -1,5 +1,5 @@
1
1
  /*
2
- This script download cohort maf files from GDC, combine them into a single file, and output the sorted file based on chromsome and Start_Position.
2
+ This script downloads cohort maf files from GDC and concatenates them into a single file that includes user-specified columns.
3
3
 
4
4
  Input JSON:
5
5
  host: GDC host
@@ -7,7 +7,7 @@
7
7
  Output gzip compressed maf file to stdout.
8
8
 
9
9
  Example of usage:
10
- echo '{"host": "https://api.gdc.cancer.gov/data/", "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
10
+ echo '{"host": "https://api.gdc.cancer.gov/data/","columns": ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome", "Start_Position"], "fileIdLst": ["8b31d6d1-56f7-4aa8-b026-c64bafd531e7", "b429fcc1-2b59-4b4c-a472-fb27758f6249"]}'|./target/release/gdcmaf
11
11
  */
12
12
 
13
13
  use flate2::read::GzDecoder;
@@ -17,81 +17,38 @@ use serde_json::Value;
17
17
  use std::path::Path;
18
18
  use futures::StreamExt;
19
19
  use std::io::{self,Read,Write};
20
- use std::sync::mpsc;
21
20
 
22
21
 
23
22
 
24
- fn gen_vec(d:String) -> (Vec<String>,Vec<Vec<u8>>) {
25
- let mut maf_bit: Vec<Vec<u8>> = Vec::new();
26
- let mut lst_chrom_pos: Vec<String> = Vec::new();
23
+ fn select_maf_col(d:String,columns:&Vec<String>) -> Vec<u8> {
24
+ let mut maf_str: String = String::new();
27
25
  let mut header_indices: Vec<usize> = Vec::new();
28
- let mut chrom_index: usize = 9999;
29
- let mut pos_index: usize = 9999;
30
26
  let lines = d.trim_end().split("\n");
31
27
  for line in lines {
32
28
  if line.starts_with("#") {
33
29
  continue
34
30
  } else if line.contains("Hugo_Symbol") {
35
31
  let header: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
36
- for col in MAF_COL {
37
- let col_index: usize = header.iter().position(|x| x == col).unwrap();
38
- header_indices.push(col_index);
39
- if col == "Chromosome" {
40
- chrom_index = col_index;
41
- } else if col == "Start_Position" {
42
- pos_index = col_index;
32
+ for col in columns {
33
+ if let Some(index) = header.iter().position(|x| x == col) {
34
+ header_indices.push(index);
35
+ } else {
36
+ panic!("{} was not found!",col);
43
37
  }
44
38
  }
45
39
  } else {
46
40
  let maf_cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
47
41
  let mut maf_out_lst: Vec<String> = Vec::new();
48
- let mut chrom = String::new();
49
- let mut pos = String::new();
50
- for (i,x) in header_indices.iter().enumerate() {
42
+ for x in header_indices.iter() {
51
43
  maf_out_lst.push(maf_cont_lst[*x].to_string());
52
- if chrom_index != 9999 && i == chrom_index {
53
- chrom = maf_cont_lst[*x].to_string();
54
- } else if pos_index != 9999 && i == pos_index {
55
- pos = maf_cont_lst[*x].to_string();
56
- }
57
44
  };
58
45
  maf_out_lst.push("\n".to_string());
59
- maf_bit.push(maf_out_lst.join("\t").as_bytes().to_vec());
60
- lst_chrom_pos.push(chrom+"\t"+&pos);
46
+ maf_str.push_str(maf_out_lst.join("\t").as_str());
61
47
  }
62
48
  };
63
- (lst_chrom_pos,maf_bit)
64
- }
65
-
66
- fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize>{
67
- let mut indices = (0..lst.len()).collect::<Vec<usize>>();
68
- indices.sort_by(|a,b| {
69
- lst[*a].split('\t').collect::<Vec<&str>>()[0].cmp(lst[*b].split('\t').collect::<Vec<&str>>()[0])
70
- .then(lst[*a].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap().cmp(&lst[*b].split('\t').collect::<Vec<&str>>()[1].parse::<u32>().unwrap()))
71
- });
72
- indices
49
+ maf_str.as_bytes().to_vec()
73
50
  }
74
51
 
75
- // GDC MAF columns (96)
76
- const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
77
- "Start_Position", "End_Position", "Strand", "Variant_Classification",
78
- "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
79
- "dbSNP_RS", "dbSNP_Val_Status", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode",
80
- "Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2", "Tumor_Validation_Allele1",
81
- "Tumor_Validation_Allele2", "Match_Norm_Validation_Allele1", "Match_Norm_Validation_Allele2",
82
- "Verification_Status", "Validation_Status", "Mutation_Status", "Sequencing_Phase",
83
- "Sequence_Source", "Validation_Method", "Score", "BAM_File", "Sequencer",
84
- "Tumor_Sample_UUID", "Matched_Norm_Sample_UUID", "HGVSc", "HGVSp", "HGVSp_Short",
85
- "Transcript_ID", "Exon_Number", "t_depth", "t_ref_count", "t_alt_count", "n_depth",
86
- "n_ref_count", "n_alt_count", "all_effects", "Allele", "Gene", "Feature", "Feature_type",
87
- "One_Consequence", "Consequence", "cDNA_position", "CDS_position", "Protein_position",
88
- "Amino_acids", "Codons", "Existing_variation", "DISTANCE", "TRANSCRIPT_STRAND", "SYMBOL",
89
- "SYMBOL_SOURCE", "HGNC_ID", "BIOTYPE", "CANONICAL", "CCDS", "ENSP", "SWISSPROT", "TREMBL",
90
- "UNIPARC", "RefSeq", "SIFT", "PolyPhen", "EXON", "INTRON", "DOMAINS", "CLIN_SIG", "SOMATIC",
91
- "PUBMED", "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "IMPACT", "PICK",
92
- "VARIANT_CLASS", "TSL", "HGVS_OFFSET", "PHENO", "GENE_PHENO", "CONTEXT", "tumor_bam_uuid",
93
- "normal_bam_uuid", "case_id", "GDC_FILTER", "COSMIC"];
94
-
95
52
 
96
53
  #[tokio::main]
97
54
  async fn main() -> Result<(),Box<dyn std::error::Error>> {
@@ -108,47 +65,59 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
108
65
  url.push(Path::new(&host).join(&v.as_str().unwrap()).display().to_string());
109
66
  };
110
67
 
68
+ // read columns as array from input json and convert data type from Vec<Value> to Vec<String>
69
+ let maf_col:Vec<String>;
70
+ if let Some(maf_col_value) = file_id_lst_js.get("columns") {
71
+ //convert Vec<Value> to Vec<String>
72
+ if let Some(maf_col_array) = maf_col_value.as_array() {
73
+ maf_col = maf_col_array
74
+ .iter()
75
+ .map(|v| v.to_string().replace("\"",""))
76
+ .collect::<Vec<String>>();
77
+ } else {
78
+ panic!("Columns is not an array");
79
+ }
80
+ } else {
81
+ panic!("Columns was not selected");
82
+ };
83
+
111
84
  //downloading maf files in parallel and merging them into a single maf file
112
- let (tx, rx) = mpsc::channel();
113
- let fetches = futures::stream::iter(
85
+ let download_futures = futures::stream::iter(
114
86
  url.into_iter().map(|url|{
115
- let txt = tx.clone();
116
87
  async move {
117
- if let Ok(resp) = reqwest::get(&url).await {
88
+ let result = reqwest::get(&url).await;
89
+ if let Ok(resp) = result {
118
90
  let content = resp.bytes().await.unwrap();
119
91
  let mut decoder = GzDecoder::new(&content[..]);
120
92
  let mut decompressed_content = Vec::new();
121
- if let Ok(_) = decoder.read_to_end(&mut decompressed_content) {
122
- let text = String::from_utf8_lossy(&decompressed_content);
123
- let (lst_chrom_pos,maf_bit) = gen_vec(text.to_string());
124
- txt.send((lst_chrom_pos,maf_bit)).unwrap();
93
+ let read_content = decoder.read_to_end(&mut decompressed_content);
94
+ if let Ok(_) = read_content {
95
+ let text = String::from_utf8_lossy(&decompressed_content).to_string();
96
+ text
97
+ } else {
98
+ let error_msg = "Failed to read content downloaded from: ".to_string() + &url;
99
+ error_msg
125
100
  }
101
+ } else {
102
+ let error_msg = "Failed to download: ".to_string() + &url;
103
+ error_msg
126
104
  }
127
105
  }
128
106
  })
129
- ).buffer_unordered(20).collect::<Vec<()>>();
130
- fetches.await;
131
- drop(tx);
132
-
133
- // write downloaded maf (GZIP format) into a Vector
134
- // lst_chrom_pos: a vector including chromsome&position info for sorting maf
135
- // idx_sorted: indices after sorting basedon chromsome&position
136
- let mut maf_bit: Vec<Vec<u8>> = Vec::new();
137
- let mut lst_chrom_pos: Vec<String> = Vec::new();
138
- for (chr_pos_lst,maf_bit_lst) in rx {
139
- maf_bit.extend_from_slice(&maf_bit_lst);
140
- lst_chrom_pos.extend_from_slice(&chr_pos_lst);
141
- };
142
- let idx_sorted = get_sorted_indices(&lst_chrom_pos);
107
+ );
143
108
 
144
109
  // output
145
- // maf_out_bit: A vector of GZIPPED maf
146
- // compress_header: output header
147
110
  let mut encoder = GzEncoder::new(io::stdout(), Compression::default());
148
- let _ = encoder.write_all(&MAF_COL.join("\t").as_bytes().to_vec()).expect("Failed to write header");
111
+ let _ = encoder.write_all(&maf_col.join("\t").as_bytes().to_vec()).expect("Failed to write header");
149
112
  let _ = encoder.write_all(b"\n").expect("Failed to write newline");
150
- for i in idx_sorted.iter() {
151
- let _ = encoder.write_all(&maf_bit[*i]).expect("Failed to write file");
152
- };
113
+ download_futures.buffer_unordered(20).for_each(|item| {
114
+ if item.starts_with("Failed") {
115
+ eprintln!("{}",item);
116
+ } else {
117
+ let maf_bit = select_maf_col(item,&maf_col);
118
+ let _ = encoder.write_all(&maf_bit).expect("Failed to write file");
119
+ };
120
+ async {}
121
+ }).await;
153
122
  Ok(())
154
123
  }
@@ -11,6 +11,7 @@ Various JSON parameters:
11
11
  Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"num_genes":100, "param":"var"}' && time echo $json | target/release/gene_variance
12
12
  */
13
13
  #![allow(non_snake_case)]
14
+ use bgzip::BGZFReader;
14
15
  use json;
15
16
  use nalgebra::base::dimension::Dyn;
16
17
  use nalgebra::base::Matrix;
@@ -23,8 +24,9 @@ use statrs::statistics::Median;
23
24
  use statrs::statistics::OrderStatistics;
24
25
  use statrs::statistics::Statistics;
25
26
  use std::cmp::Ordering;
27
+ use std::fs;
26
28
  use std::io;
27
- use std::path::Path;
29
+ use std::io::Read;
28
30
  use std::str::FromStr;
29
31
  use std::time::Instant;
30
32
 
@@ -34,100 +36,78 @@ fn input_data(
34
36
  ) -> (
35
37
  Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
36
38
  Vec<String>,
37
- Vec<String>,
38
39
  ) {
39
40
  // Build the CSV reader and iterate over each record.
40
- let path = Path::new(filename);
41
- let mut rdr = csv::Reader::from_path(path).unwrap();
41
+ let mut reader = BGZFReader::new(fs::File::open(filename).unwrap()).unwrap();
42
42
  let mut num_lines: usize = 0;
43
- let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
44
- let mut gene_names: Vec<String> = Vec::with_capacity(65000);
45
- let mut gene_symbols: Vec<String> = Vec::with_capacity(65000);
46
- let mut num_columns: usize = 0;
43
+ let mut gene_symbols: Vec<String> = Vec::with_capacity(500);
47
44
 
48
- // Check headers for samples
49
- let header_line = rdr.headers().unwrap();
50
- let mut headers: Vec<&str> = Vec::with_capacity(1500);
51
- for field in header_line.iter() {
52
- headers = field.split('\t').collect::<Vec<&str>>();
53
- }
54
- //println!("headers:{:?}", headers);
55
- let mut sample_indexes_original: Vec<usize> = Vec::with_capacity(sample_list.len());
56
- let gene_name_index = headers.iter().position(|r| r == &"geneID");
57
- let gene_symbol_index = headers.iter().position(|r| r == &"geneSymbol");
58
- //let mut case_samples_not_found: Vec<&str> = Vec::with_capacity(sample_list.len());
59
- //let mut control_samples_not_found: Vec<&str> = Vec::with_capacity(control_list.len());
45
+ let mut buffer = String::new();
46
+ reader.read_to_string(&mut buffer).unwrap();
60
47
 
61
- for item in sample_list {
62
- //println!("item:{}", item);
63
- let index = headers.iter().position(|r| r == item);
64
- match index {
65
- Some(n) => sample_indexes_original.push(n),
66
- None => {
67
- //panic!("Case sample not found:{}", item);
68
- //case_samples_not_found.push(item);
48
+ let lines = buffer.split("\n");
49
+ let mut first = true;
50
+ let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
51
+ let mut column_numbers: Vec<usize> = Vec::with_capacity(300);
52
+ for line in lines {
53
+ if first == true {
54
+ first = false;
55
+ let columns: Vec<&str> = line.split("\t").collect();
56
+ // Finding column numbers corresponding to each sample given in the input list
57
+ for item in sample_list {
58
+ if let Some(index) = columns.iter().position(|num| num == item) {
59
+ column_numbers.push(index)
60
+ } else {
61
+ panic!("Sample {} not found:", item)
62
+ }
69
63
  }
70
- }
71
- }
72
-
73
- //println!("case_indexes_original:{:?}", case_indexes_original);
74
-
75
- let mut samples_indexes: Vec<usize> = Vec::with_capacity(sample_list.len());
76
- for result in rdr.records() {
77
- // The iterator yields Result<StringRecord, Error>, so we check the
78
- // error here.
79
- let record = result.unwrap();
80
- //println!("record:{:?}", record);
81
- let mut index = 0;
82
- for field in record[0].split('\t').collect::<Vec<&str>>() {
83
- if index == gene_name_index.unwrap() {
84
- gene_names.push(field.to_string());
85
- } else if index == gene_symbol_index.unwrap() {
86
- gene_symbols.push(field.to_string());
87
- } else if sample_indexes_original.contains(&index) {
88
- let num = FromStr::from_str(field);
89
- match num {
90
- Ok(n) => {
91
- //println!("n:{}", n);
92
- input_vector.push(n);
93
- if num_lines == 0 {
94
- samples_indexes.push(num_columns);
95
- num_columns += 1;
64
+ } else {
65
+ let line2: Vec<&str> = line.split("\t").collect();
66
+ if line2.len() == 1 {
67
+ break; // end of file
68
+ } else {
69
+ num_lines += 1;
70
+ //println!("line2:{:?}", line2);
71
+ gene_symbols.push(line2[3].to_string());
72
+ for i in &column_numbers {
73
+ let field = line2[*i];
74
+ let num = FromStr::from_str(field);
75
+ match num {
76
+ Ok(n) => {
77
+ //println!("n:{}", n);
78
+ input_vector.push(n);
79
+ }
80
+ Err(_n) => {
81
+ panic!(
82
+ "Number {} in line {} and column {} is not a decimal number",
83
+ field,
84
+ num_lines + 1,
85
+ i + 1
86
+ );
96
87
  }
97
- }
98
- Err(_n) => {
99
- panic!(
100
- "Number {} in line {} and column {} is not a decimal number",
101
- field,
102
- num_lines + 1,
103
- index + 1
104
- );
105
88
  }
106
89
  }
107
90
  }
108
- index += 1;
109
91
  }
110
- num_lines += 1;
111
92
  }
93
+
112
94
  //println!("case_indexes:{:?}", case_indexes);
113
95
  //println!("control_indexes:{:?}", control_indexes);
114
96
 
115
- let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
97
+ let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
116
98
  //println!("dm:{:?}", dm);
117
- (dm, gene_names, gene_symbols)
99
+ (dm, gene_symbols)
118
100
  }
119
101
 
120
102
  #[allow(dead_code)]
121
103
  #[derive(Debug, Serialize, Deserialize)]
122
104
  struct GeneInfo {
123
- gene_name: String,
124
105
  gene_symbol: String,
125
106
  param: f64,
126
107
  }
127
108
 
128
109
  fn calculate_variance(
129
110
  input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
130
- gene_names: Vec<String>,
131
111
  gene_symbols: Vec<String>,
132
112
  mut min_sample_size: f64,
133
113
  filter_extreme_values: bool,
@@ -196,13 +176,11 @@ fn calculate_variance(
196
176
  {
197
177
  gene_infos.push(GeneInfo {
198
178
  param: gene_counts.variance(),
199
- gene_name: gene_names[row].clone(),
200
179
  gene_symbol: gene_symbols[row].clone(),
201
180
  });
202
181
  } else if filter_extreme_values == false {
203
182
  gene_infos.push(GeneInfo {
204
183
  param: gene_counts.variance(),
205
- gene_name: gene_names[row].clone(),
206
184
  gene_symbol: gene_symbols[row].clone(),
207
185
  });
208
186
  }
@@ -216,13 +194,11 @@ fn calculate_variance(
216
194
  {
217
195
  gene_infos.push(GeneInfo {
218
196
  param: gene_counts_data.interquartile_range(),
219
- gene_name: gene_names[row].clone(),
220
197
  gene_symbol: gene_symbols[row].clone(),
221
198
  });
222
199
  } else if filter_extreme_values == false {
223
200
  gene_infos.push(GeneInfo {
224
201
  param: gene_counts_data.interquartile_range(),
225
- gene_name: gene_names[row].clone(),
226
202
  gene_symbol: gene_symbols[row].clone(),
227
203
  });
228
204
  }
@@ -321,11 +297,9 @@ fn main() {
321
297
  }
322
298
 
323
299
  let samples_list: Vec<&str> = samples_string.split(",").collect();
324
- let (input_matrix, gene_names, gene_symbols) =
325
- input_data(&file_name, &samples_list);
300
+ let (input_matrix, gene_symbols) = input_data(&file_name, &samples_list);
326
301
  let gene_infos = calculate_variance(
327
302
  input_matrix,
328
- gene_names,
329
303
  gene_symbols,
330
304
  samples_list.len() as f64,
331
305
  filter_extreme_values,
@@ -343,7 +317,7 @@ fn main() {
343
317
  }
344
318
  }
345
319
  output_string += &"]".to_string();
346
- println!("{}", output_string);
320
+ println!("output_json:{}", output_string);
347
321
  println!("Time for calculating variances:{:?}", now.elapsed());
348
322
  }
349
323
  Err(error) => println!("Incorrect json: {}", error),