npm - @sjcrh/proteinpaint-rust - Versions diffs - 2.59.0 → 2.61.1 - Mend

@sjcrh/proteinpaint-rust 2.59.0 → 2.61.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/Cargo.toml CHANGED Viewed

@@ -86,3 +86,7 @@ path="src/DEanalysis.rs"
 [[bin]]
 name="genesetORA"
 path="src/genesetORA.rs"
+[[bin]]
+name="computeTopTerms"
+path="src/computeTopTerms.rs"

package/package.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-	"version": "2.59.0",
+	"version": "2.61.1",
 	"name": "@sjcrh/proteinpaint-rust",
 	"description": "Rust-based utilities for proteinpaint",
 	"main": "index.js",
@@ -38,5 +38,5 @@
 	"devDependencies": {
 		"tape": "^5.2.2"
 	},
-	"pp_release_tag": "v2.59.0"
+	"pp_release_tag": "v2.61.1"
 }

package/src/DEanalysis.rs CHANGED Viewed

@@ -1,4 +1,4 @@
-// cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
+// cd .. && cargo build --release && json='{"min_count":10,"min_total_count":15,"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
 // cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
 #![allow(non_snake_case)]
 use json;
@@ -368,6 +368,22 @@ fn main() {
             match input_json {
                 Ok(json_string) => {
                     let now = Instant::now();
+                    let min_count_option = json_string["min_count"].as_f64().to_owned();
+                    let min_total_count_option = json_string["min_total_count"].as_f64().to_owned();
+                    let min_count;
+                    match min_count_option {
+                        Some(x) => min_count = x,
+                        None => {
+                            panic!("min_count is missing a value")
+                        }
+                    }
+                    let min_total_count;
+                    match min_total_count_option {
+                        Some(x) => min_total_count = x,
+                        None => {
+                            panic!("min_total_count is missing a value")
+                        }
+                    }
                     let case_string = &json_string["case"].to_owned().as_str().unwrap().to_string();
                     let control_string = &json_string["control"]
                         .to_owned()
@@ -388,6 +404,8 @@ fn main() {
                     let filtering_time = Instant::now();
                     let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
                         filter_by_expr(
+                            min_count,
+                            min_total_count,
                             &input_matrix,
                             case_indexes.len(),
                             control_indexes.len(),
@@ -671,6 +689,7 @@ fn adjust_p_values_bonferroni(original_p_values: Vec<PValueIndexes>) -> Vec<Adju
     adjusted_p_values
 }
+// Original TMM normalization source code in edgeR: https://rdrr.io/bioc/edgeR/src/R/calcNormFactors.R
 fn tmm_normalization(
     input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
     lib_sizes: &Vec<f64>,
@@ -947,7 +966,10 @@ fn calc_quantile(mut input: Vec<f64>, p: f64) -> f64 {
     qs_final
 }
+// Original filterByExpr source code in edgeR: https://rdrr.io/bioc/edgeR/src/R/filterByExpr.R
 fn filter_by_expr(
+    min_count: f64,
+    min_total_count: f64,
     raw_data: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
     num_diseased: usize,
     num_control: usize,
@@ -960,9 +982,8 @@ fn filter_by_expr(
     Vec<String>,
 ) {
     // Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>
-    #[allow(non_upper_case_globals)]
-    const MIN_COUNT: f64 = 10.0; // Value of constant from R implementation
-    const MIN_TOTAL_COUNT: f64 = 15.0; // Value of constant from R implementation
+    //const min_count: f64 = 10.0; // Value of constant from R implementation
+    //const min_total_count: f64 = 15.0; // Value of constant from R implementation
     const LARGE_N: f64 = 10.0; // Value of constant from R implementation
     const MIN_PROP: f64 = 0.7; // Value of constant from R implementation
@@ -989,7 +1010,7 @@ fn filter_by_expr(
     //println!("lib_sizes:{:?}", lib_sizes);
     //println!("min_sample_size:{}", min_sample_size);
     let median_lib_size = Data::new(lib_sizes.clone()).median();
-    let cpm_cutoff = (MIN_COUNT / median_lib_size) * 1000000.0;
+    let cpm_cutoff = (min_count / median_lib_size) * 1000000.0;
     //println!("cpm_cutoff:{}", cpm_cutoff);
     let cpm_matrix = cpm(&raw_data);
     const TOL: f64 = 1e-14; // Value of constant from R implementation
@@ -1018,7 +1039,7 @@ fn filter_by_expr(
         //}
         let mut keep_total_bool = false;
-        if row_sums[(row, 0)] as f64 >= MIN_TOTAL_COUNT - TOL {
+        if row_sums[(row, 0)] as f64 >= min_total_count - TOL {
             keep_total_bool = true;
             //keep_total.push(keep_total_bool);
             //positive_total += 1;

package/src/computeTopTerms.rs ADDED Viewed

@@ -0,0 +1,152 @@
+/*
+ This script selects the most variable metabolites by calculating the variance or the interquartile range (IQR) of each metabolite across the given samples.
+Various JSON parameters:
+   samples: Enter the sample ID(s) separated by comma
+   input_file: Path to input file (txt file instead of *.gz file)
+   num_metabolites: The number of top metabolites to report in the output, given as a JSON string such as "20" (optional; 20 by default).
+   param: var/iqr. This parameter decides whether to rank metabolites by variance or by interquartile range. There is an article which states that it's better to use the interquartile range than the variance for selecting genes for clustering: https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
+ Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","num_metabolites":"20","param":"var"}' && time echo $json | target/release/computeTopTerms
+*/
+#![allow(non_snake_case)]
+use serde_json::{self,Value};
+use serde::{Serialize,Deserialize};
+use std::io::{self, BufReader, BufRead};
+use std::fs::File;
+use nalgebra::base::dimension::Dyn;
+use nalgebra::base::Matrix;
+use nalgebra::base::VecStorage;
+use nalgebra::DMatrix;
+use std::str::FromStr;
+use std::cmp::Ordering;
+use statrs::statistics::Data;
+use statrs::statistics::OrderStatistics;
+use statrs::statistics::Statistics;
+/// Reads the tab-separated metabolite intensity file and extracts the columns of the
+/// requested samples.
+///
+/// The header line is the one whose first field is `#Metabolites`; the position of each
+/// entry of `sample_list` in that line selects the column to read. Every other line is a
+/// data row: its first field is the metabolite name and the selected columns are parsed
+/// as `f64`.
+///
+/// Returns a matrix with one row per data line and one column per entry of `sample_list`
+/// (in `sample_list` order), together with the metabolite names in row order.
+///
+/// # Panics
+/// Panics if the file cannot be opened or read, a sample is missing from the header, a
+/// data row appears before the header, a data row has too few columns, or a selected
+/// value cannot be parsed as a number.
+fn input_data(
+	filename: &String,
+	sample_list: &Vec<&str>,
+) -> (
+	Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
+	Vec<String>,
+) {
+	let file = File::open(filename).expect("Reading metabolite intensity file error!");
+	let reader = BufReader::new(file);
+	let mut metabolites: Vec<String> = Vec::with_capacity(500);
+	let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
+	let mut column_numbers: Vec<usize> = Vec::with_capacity(sample_list.len());
+	let mut header_seen = false;
+	for (line_index, line) in reader.lines().enumerate() {
+		// 1-based position in the file, so error messages point at the real line
+		let line_number = line_index + 1;
+		let line_str = line.expect("line reading error");
+		// `split` always yields at least one field, so `columns[0]` cannot fail
+		let columns: Vec<&str> = line_str.split('\t').collect();
+		if columns[0] == "#Metabolites" {
+			// Finding column numbers corresponding to each sample given in the input list.
+			// Clearing first keeps one column per sample even if the header line is repeated.
+			column_numbers.clear();
+			for sam in sample_list {
+				if let Some(index) = columns.iter().position(|s| s == sam) {
+					column_numbers.push(index)
+				} else {
+					panic!("Sample {} not found:", sam);
+				}
+			}
+			header_seen = true;
+		} else {
+			// Without the header no columns are selected, which would otherwise surface
+			// later as an opaque size mismatch when the matrix is built.
+			if !header_seen {
+				panic!(
+					"Line {} is a data row but no #Metabolites header line has been read yet",
+					line_number
+				);
+			}
+			metabolites.push(columns[0].to_string());
+			for &column in &column_numbers {
+				let intensity = columns.get(column).unwrap_or_else(|| {
+					panic!(
+						"Line {} has only {} columns but column {} is needed for a requested sample",
+						line_number,
+						columns.len(),
+						column + 1
+					)
+				});
+				match f64::from_str(intensity) {
+					Ok(n) => {
+						input_vector.push(n);
+					}
+					Err(_) => {
+						panic!(
+							"Number {} in line {} and column {} is not a decimal number",
+							intensity,
+							line_number,
+							column + 1
+						)
+					}
+				}
+			}
+		}
+	}
+	// Values were pushed row by row, one value per requested sample
+	let dm = DMatrix::from_row_slice(metabolites.len(), sample_list.len(), &input_vector);
+	(dm, metabolites)
+}
+#[derive(Debug, Serialize, Deserialize)]
+struct MetaboliteInfo { // One scored metabolite; main() serializes each entry to JSON for the output
+	metabolite: String, // Metabolite name
+	param: f64, // Score of the metabolite: its variance or its interquartile range
+}
+/// Scores every metabolite (one matrix row each) and returns the scores sorted in
+/// ascending order.
+///
+/// When `param` is `"var"` a row is scored by its variance; for any other value it is
+/// scored by its interquartile range. `metabolites[row]` supplies the name for matrix
+/// row `row`.
+fn calculate_variance(
+	input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
+    metabolites: Vec<String>,
+    param: String,
+) -> Vec<MetaboliteInfo> {
+	let use_variance = param == "var";
+	let mut scored: Vec<MetaboliteInfo> = (0..input_matrix.nrows())
+		.map(|row| {
+			// Copy the row out of the matrix so the statistics helpers can consume it
+			let values: Vec<f64> = (0..input_matrix.ncols())
+				.map(|col| input_matrix[(row, col)])
+				.collect();
+			let score = if use_variance {
+				values.variance()
+			} else {
+				Data::new(values).interquartile_range()
+			};
+			MetaboliteInfo {
+				metabolite: metabolites[row].clone(),
+				param: score,
+			}
+		})
+		.collect();
+	// Ascending by score; incomparable pairs (NaN) are treated as equal
+	scored.sort_by(|left, right| left.param.partial_cmp(&right.param).unwrap_or(Ordering::Equal));
+	scored
+}
+/// Reads one line of JSON from stdin, scores the metabolites of the requested samples and
+/// prints the highest-scoring entries (highest first) as `output_json:[...]`.
+///
+/// Recognized JSON keys: `samples` (comma-separated sample IDs), `input_file`, `param`
+/// (`"var"` or `"iqr"`) and the optional `num_metabolites` (JSON string or number,
+/// 20 when absent). If fewer metabolites exist than requested, all of them are printed.
+///
+/// # Panics
+/// Panics if stdin cannot be read, the input is not valid JSON, a required key is missing,
+/// `param` is neither `"var"` nor `"iqr"`, or `num_metabolites` is not a non-negative integer.
+fn main() {
+	let mut input = String::new();
+	io::stdin().read_line(&mut input).expect("Piping error");
+	let input_json = serde_json::from_str::<Value>(&input).expect("Error reading input and serializing to JSON");
+	// JSON strings are taken verbatim, so escape sequences (e.g. backslashes in a path) are
+	// decoded; any other JSON value falls back to its serialized text.
+	let text_of = |value: &Value| -> String {
+		match value.as_str() {
+			Some(text) => text.to_string(),
+			None => value.to_string(),
+		}
+	};
+	let sample_string = text_of(input_json.get("samples").expect("samples is missed from input JSON"));
+	let file_name = text_of(input_json.get("input_file").expect("input_file is missed from input JSON"));
+	let param = text_of(input_json.get("param").expect("param is missed from input JSON"));
+	if param != "var" && param != "iqr" {
+		panic!("Unknown method:{}", param); // Check if any unknown method has been provided
+	}
+	let num_metabolites = match input_json.get("num_metabolites") {
+		Some(value) => match value.as_u64() {
+			// Saturate instead of truncating; the count is capped by the number of rows below
+			Some(number) => number.min(usize::MAX as u64) as usize,
+			None => value
+				.as_str()
+				.expect("Invalid value type for 'num_metabolites'")
+				.parse::<usize>()
+				.expect("Invalid number of metabolites"),
+		},
+		None => 20,
+	};
+	let samples_list: Vec<&str> = sample_string.split(',').collect();
+	let (input_matrix, metabolites) = input_data(&file_name, &samples_list);
+	let metabolite_infos = calculate_variance(input_matrix, metabolites, param);
+	// The scores are sorted ascending, so walk them from the end; `take` stops early when
+	// fewer metabolites exist than requested instead of underflowing the index.
+	let top_entries: Vec<String> = metabolite_infos
+		.iter()
+		.rev()
+		.take(num_metabolites)
+		.map(|info| serde_json::to_string(info).unwrap())
+		.collect();
+	println!("output_json:[{}]", top_entries.join(","));
+}

package/src/genesetORA.rs CHANGED Viewed

@@ -112,12 +112,12 @@ fn main() -> Result<()> {
                             + &genesetgroup
                             + "'"),
                     );
+                    let mut iter = 0;
                     match stmt_result {
                         Ok(mut stmt) => {
                             #[allow(non_snake_case)]
                             let GO_iter =
                                 stmt.query_map([], |row| Ok(GO_pathway { GO_id: row.get(0)? }))?;
-                            let mut iter = 0;
                             #[allow(non_snake_case)]
                             for GO_term in GO_iter {
                                 iter += 1;
@@ -178,14 +178,15 @@ fn main() -> Result<()> {
                                     }
                                 }
                             }
-                            println!("Number of pathway entries:{}", iter);
                         }
                         Err(_) => panic!("sqlite database file not found"),
                     }
-                    println!(
-                        "pathway_p_values:{}",
-                        adjust_p_values(pathway_p_values, num_items_output)
-                    );
+                    let output_string = "{\"num_pathways\":".to_string()
+                        + &iter.to_string()
+                        + &",\"pathways\":"
+                        + &adjust_p_values(pathway_p_values, num_items_output)
+                        + &"}";
+                    println!("pathway_p_values:{}", output_string);
                     println!(
                         "Time for calculating gene overrepresentation:{:?}",
                         run_time.elapsed()
@@ -199,7 +200,10 @@ fn main() -> Result<()> {
     Ok(())
 }
-fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>, num_items_output: usize) -> String {
+fn adjust_p_values(
+    mut original_p_values: Vec<pathway_p_value>,
+    mut num_items_output: usize,
+) -> String {
     // Sorting p-values in ascending order
     original_p_values.as_mut_slice().sort_by(|a, b| {
         (a.p_value_original)
@@ -243,6 +247,10 @@ fn adjust_p_values(mut original_p_values: Vec<pathway_p_value>, num_items_output
             .unwrap_or(Ordering::Equal)
     });
+    if num_items_output > adjusted_p_values.len() {
+        num_items_output = adjusted_p_values.len()
+    }
     let mut output_string = "[".to_string();
     for i in 0..num_items_output {
         let j = adjusted_p_values.len() - i - 1;