npm - @sjcrh/proteinpaint-rust - Versions diffs - 2.39.0 → 2.44.0 - Mend

@sjcrh/proteinpaint-rust 2.39.0 → 2.44.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/Cargo.toml CHANGED Viewed

@@ -19,14 +19,13 @@ plotters = "0.3.4"
 colorgrad = "0.6.2"
 statrs = "^0.16.0"
 fishers_exact="^1.0.1"
-bio = "^0.39"
+bio = "1.5.0"
 bigtools = "^0.1.11"
 libmath = "^0.2.1"
 json = "^0.12.4"
 serde = {version = "^1.0.147", features = ["derive"]}
 serde_json="^1.0.88"
 num = "^0.4.1"
-csv = "^1.2.2"
 r_mathlib="^0.2.0"
 tokio = { version="1", features = ["full"] }
 reqwest = "0.11"

package/package.json CHANGED Viewed

@@ -1,37 +1,37 @@
 {
-  "version": "2.39.0",
-  "name": "@sjcrh/proteinpaint-rust",
-  "description": "Rust-based utilities for proteinpaint",
-  "main": "index.js",
-  "bin": {
-    "proteinpaint-rust": "index.js"
-  },
-  "scripts": {
-    "dev": "cargo build --release",
-    "build": "cargo build --release",
-    "postinstall": "if [ ! -d ./test ] & [ ! -d ./target/release ]; then cargo build --release; fi",
-    "test": "tape **/test/*.spec.js",
-    "test:unit": "tape **/test/*.unit.spec.js",
-    "test:integration": "echo 'TODO: rust integration tests'"
-  },
-  "author": "Robin Paul",
-  "license": "SEE LICENSE IN ./LICENSE",
-  "repository": {
-    "type": "git",
-    "url": "https://github.com/stjude/proteinpaint.git",
-    "directory": "rust"
-  },
-  "files": [
-    "index.js",
-    "Cargo.toml",
-    "src",
-    "LICENSE/*"
-  ],
-  "bugs": {
-    "url": "https://github.com/stjude/proteinpaint"
-  },
-  "homepage": "https://github.com/stjude/proteinpaint#readme",
-  "devDependencies": {
-    "tape": "^5.2.2"
-  }
+	"version": "2.44.0",
+	"name": "@sjcrh/proteinpaint-rust",
+	"description": "Rust-based utilities for proteinpaint",
+	"main": "index.js",
+	"bin": {
+		"proteinpaint-rust": "index.js"
+	},
+	"scripts": {
+		"dev": "cargo clean && cargo build --release",
+		"build": "cargo clean && cargo build --release",
+		"postinstall": "if [ ! -d ./test ] & [ ! -d ./target/release ]; then cargo clean && cargo build --release; fi",
+		"test": "tape **/test/*.spec.js",
+		"test:unit": "tape **/test/*.unit.spec.js",
+		"test:integration": "echo 'TODO: rust integration tests'"
+	},
+	"author": "Robin Paul",
+	"license": "SEE LICENSE IN ./LICENSE",
+	"repository": {
+		"type": "git",
+		"url": "https://github.com/stjude/proteinpaint.git",
+		"directory": "rust"
+	},
+	"files": [
+		"index.js",
+		"Cargo.toml",
+		"src",
+		"LICENSE/*"
+	],
+	"bugs": {
+		"url": "https://github.com/stjude/proteinpaint"
+	},
+	"homepage": "https://github.com/stjude/proteinpaint#readme",
+	"devDependencies": {
+		"tape": "^5.2.2"
+	}
 }

package/src/DEanalysis.rs CHANGED Viewed

@@ -1,5 +1,5 @@
 // cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
-// cd .. && cargo build --release && cat ~/sjpp/test.txt | target/release/DEanalysis
+// cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
 #![allow(non_snake_case)]
 use json;
 use nalgebra::base::dimension::Const;
@@ -15,13 +15,46 @@ use statrs::statistics::Data;
 use statrs::statistics::Distribution;
 use statrs::statistics::Median;
 use std::cmp::Ordering;
-use std::path::Path;
+use std::fs::File;
+use std::io::Read;
 use std::str::FromStr;
+use std::sync::{Arc, Mutex}; // Multithreading library
+use std::thread;
 use std::time::Instant;
 //use std::cmp::Ordering;
 //use std::env;
 use std::io;
 //mod stats_functions; // Importing Wilcoxon function from stats_functions.rs
+const PAR_CUTOFF: usize = 100000; // Cutoff for triggering multithreading processing of data
+//const PAR_CUTOFF: usize = 1000000000000000;
+#[allow(non_upper_case_globals)]
+const max_threads: usize = 6; // Max number of threads in case the parallel processing of reads is invoked
+/// Looks up `y` in the ascending-sorted vector `input` using binary search.
+///
+/// Returns the index of a matching element as an `i64`, or `-1` when `y` is
+/// not present (callers test the result against `-1`). An empty `input`
+/// yields `-1` instead of underflowing while computing the last index.
+///
+/// `input` must already be sorted in ascending order; the result is
+/// meaningless otherwise.
+fn binary_search(input: &Vec<usize>, y: usize) -> i64 {
+    // Guard first: `len() - 1` on an empty vector would underflow `usize`.
+    if input.is_empty() {
+        return -1;
+    }
+    let mut l: usize = 0;
+    let mut r: usize = input.len() - 1;
+    while l <= r {
+        let m = l + (r - l) / 2;
+        match y.cmp(&input[m]) {
+            Ordering::Equal => return m as i64,
+            // Target is larger, so discard the left half.
+            Ordering::Greater => l = m + 1,
+            // Target is smaller, so discard the right half.
+            Ordering::Less => {
+                if m == 0 {
+                    // `m - 1` would underflow; nothing is left to search.
+                    break;
+                }
+                r = m - 1;
+            }
+        }
+    }
+    -1
+}
 fn input_data(
     filename: &String,
@@ -34,9 +67,9 @@ fn input_data(
     Vec<String>,
     Vec<String>,
 ) {
-    // Build the CSV reader and iterate over each record.
-    let path = Path::new(filename);
-    let mut rdr = csv::Reader::from_path(path).unwrap();
+    let input_time = Instant::now();
+    //let mut rdr = csv::Reader::from_path(path).unwrap();
+    let mut file = File::open(filename).unwrap();
     let mut num_lines: usize = 0;
     let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
     let mut gene_names: Vec<String> = Vec::with_capacity(65000);
@@ -44,11 +77,12 @@ fn input_data(
     let mut num_columns: usize = 0;
     // Check headers for samples
-    let header_line = rdr.headers().unwrap();
-    let mut headers: Vec<&str> = Vec::with_capacity(1500);
-    for field in header_line.iter() {
-        headers = field.split('\t').collect::<Vec<&str>>();
-    }
+    let mut buffer = String::new();
+    file.read_to_string(&mut buffer).unwrap();
+    // Check headers for samples
+    let lines: Vec<&str> = buffer.split('\n').collect::<Vec<&str>>();
+    let total_lines = lines.len();
+    let headers: Vec<&str> = lines[0].split('\t').collect::<Vec<&str>>();
     //println!("headers:{:?}", headers);
     let mut case_indexes_original: Vec<usize> = Vec::with_capacity(case_list.len());
     let mut control_indexes_original: Vec<usize> = Vec::with_capacity(control_list.len());
@@ -68,6 +102,7 @@ fn input_data(
             }
         }
     }
+    let num_cases = case_list.len();
     for item in control_list {
         //println!("item:{}", item);
@@ -80,70 +115,223 @@ fn input_data(
             }
         }
     }
+    let num_controls = control_list.len();
     //println!("case_indexes_original:{:?}", case_indexes_original);
     //println!("control_indexes_original:{:?}", control_indexes_original);
+    case_indexes_original.sort();
+    case_indexes_original.dedup();
+    control_indexes_original.sort();
+    control_indexes_original.dedup();
     let mut case_indexes: Vec<usize> = Vec::with_capacity(case_list.len());
     let mut control_indexes: Vec<usize> = Vec::with_capacity(control_list.len());
-    for result in rdr.records() {
-        // The iterator yields Result<StringRecord, Error>, so we check the
-        // error here.
-        let record = result.unwrap();
-        //println!("record:{:?}", record);
-        let mut index = 0;
-        for field in record[0].split('\t').collect::<Vec<&str>>() {
-            if index == gene_name_index.unwrap() {
-                gene_names.push(field.to_string());
-            } else if index == gene_symbol_index.unwrap() {
-                gene_symbols.push(field.to_string());
-            } else if case_indexes_original.contains(&index) {
-                let num = FromStr::from_str(field);
-                match num {
-                    Ok(n) => {
-                        //println!("n:{}", n);
-                        input_vector.push(n);
-                        if num_lines == 0 {
-                            case_indexes.push(num_columns);
-                            num_columns += 1;
+    if lines.len() * (case_indexes_original.len() + control_indexes_original.len()) < PAR_CUTOFF {
+        // Small input (number of lines x number of selected sample columns is below PAR_CUTOFF): parse on a single thread
+        let lines_slice = &lines[..];
+        for line_iter in 1..lines_slice.len() - 1 {
+            // Subtracting 1 from total length of lines_slice because the last one will be empty
+            let line = lines_slice[line_iter];
+            let mut index = 0;
+            for field in line.split('\t').collect::<Vec<&str>>() {
+                if index == gene_name_index.unwrap() {
+                    gene_names.push(field.to_string());
+                } else if index == gene_symbol_index.unwrap() {
+                    gene_symbols.push(field.to_string());
+                } else if binary_search(&case_indexes_original, index) != -1 {
+                    let num = FromStr::from_str(field);
+                    match num {
+                        Ok(n) => {
+                            //println!("n:{}", n);
+                            input_vector.push(n);
+                            if num_lines == 0 {
+                                case_indexes.push(num_columns);
+                                num_columns += 1;
+                            }
+                        }
+                        Err(_n) => {
+                            panic!(
+                                "Number {} in line {} and column {} is not a decimal number",
+                                field,
+                                num_lines + 1,
+                                index + 1
+                            );
                         }
                     }
-                    Err(_n) => {
-                        panic!(
-                            "Number {} in line {} and column {} is not a decimal number",
-                            field,
-                            num_lines + 1,
-                            index + 1
-                        );
+                } else if binary_search(&control_indexes_original, index) != -1 {
+                    let num = FromStr::from_str(field);
+                    match num {
+                        Ok(n) => {
+                            //println!("n:{}", n);
+                            input_vector.push(n);
+                            if num_lines == 0 {
+                                control_indexes.push(num_columns);
+                                num_columns += 1;
+                            }
+                        }
+                        Err(_n) => {
+                            panic!(
+                                "Number {} in line {} and column {} is not a decimal number",
+                                field,
+                                num_lines + 1,
+                                index + 1
+                            );
+                        }
                     }
                 }
-            } else if control_indexes_original.contains(&index) {
-                let num = FromStr::from_str(field);
-                match num {
-                    Ok(n) => {
-                        //println!("n:{}", n);
-                        input_vector.push(n);
-                        if num_lines == 0 {
-                            control_indexes.push(num_columns);
-                            num_columns += 1;
+                index += 1;
+            }
+            num_lines += 1;
+        }
+    } else {
+        // Multithreaded implementation for parsing data in parallel starts from here
+        // In Rust a value normally has a single owner; wrapping it in `Arc` (an atomically reference-counted pointer type, not a keyword) lets multiple threads share ownership of the same data.
+        let case_indexes_original = Arc::new(case_indexes_original);
+        let control_indexes_original = Arc::new(control_indexes_original);
+        let buffer = Arc::new(buffer);
+        let case_indexes_temp = Arc::new(Mutex::new(Vec::<usize>::with_capacity(case_list.len())));
+        let control_indexes_temp =
+            Arc::new(Mutex::new(Vec::<usize>::with_capacity(control_list.len())));
+        let num_lines_temp = Arc::new(Mutex::<usize>::new(0));
+        let num_columns_temp = Arc::new(Mutex::<usize>::new(0));
+        let genes_names_temp = Arc::new(Mutex::new(Vec::<String>::new()));
+        let genes_symbols_temp = Arc::new(Mutex::new(Vec::<String>::new()));
+        let input_vector_temp = Arc::new(Mutex::new(Vec::<f64>::new()));
+        let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
+        println!("Number of threads used:{}", max_threads);
+        for thread_num in 0..max_threads {
+            let case_indexes_original = Arc::clone(&case_indexes_original);
+            let control_indexes_original = Arc::clone(&control_indexes_original);
+            let case_indexes_temp = Arc::clone(&case_indexes_temp);
+            let control_indexes_temp = Arc::clone(&control_indexes_temp);
+            let input_vector_temp = Arc::clone(&input_vector_temp);
+            let genes_names_temp = Arc::clone(&genes_names_temp);
+            let genes_symbols_temp = Arc::clone(&genes_symbols_temp);
+            let num_lines_temp = Arc::clone(&num_lines_temp);
+            let num_columns_temp = Arc::clone(&num_columns_temp);
+            let buffer = Arc::clone(&buffer);
+            let handle = thread::spawn(move || {
+                let mut case_indexes_thread: Vec<usize> = Vec::with_capacity(num_cases);
+                let mut control_indexes_thread: Vec<usize> = Vec::with_capacity(num_controls);
+                let mut genes_names_thread: Vec<String> = Vec::with_capacity(65000);
+                let mut genes_symbols_thread: Vec<String> = Vec::with_capacity(65000);
+                let mut input_vector_thread: Vec<f64> = Vec::with_capacity(65000);
+                let mut num_columns_thread: usize = 0;
+                let mut num_lines_thread: usize = 0;
+                let lines: Vec<&str> = buffer.split('\n').collect();
+                //println!("case_indexes_original:{:?}", case_indexes_original);
+                //println!("control_indexes:{:?}", control_indexes);
+                for line_iter in 1..total_lines - 1 {
+                    let remainder: usize = line_iter % max_threads; // Calculate remainder of line number divided by max_threads to decide which thread parses this line
+                    if remainder == thread_num {
+                        //println!("buffer:{}", buffer);
+                        // Thread analyzing a particular line must have the same remainder as the thread_num, this avoids multiple threads from parsing the same line
+                        let line = lines[line_iter];
+                        let mut index = 0;
+                        for field in line.split('\t').collect::<Vec<&str>>() {
+                            if index == gene_name_index.unwrap() {
+                                genes_names_thread.push(field.to_string());
+                            } else if index == gene_symbol_index.unwrap() {
+                                genes_symbols_thread.push(field.to_string());
+                            } else if binary_search(&case_indexes_original, index) != -1 {
+                                let num = FromStr::from_str(field);
+                                match num {
+                                    Ok(n) => {
+                                        //println!("n:{}", n);
+                                        input_vector_thread.push(n);
+                                        if line_iter == 1 {
+                                            case_indexes_thread.push(num_columns_thread);
+                                            num_columns_thread += 1;
+                                        }
+                                    }
+                                    Err(_n) => {
+                                        panic!(
+                                        "Number {} in line {} and column {} is not a decimal number",
+                                        field,
+                                        num_lines_thread + 1,
+                                        index + 1
+                                    );
+                                    }
+                                }
+                            } else if binary_search(&control_indexes_original, index) != -1 {
+                                let num = FromStr::from_str(field);
+                                match num {
+                                    Ok(n) => {
+                                        //println!("n:{}", n);
+                                        input_vector_thread.push(n);
+                                        if line_iter == 1 {
+                                            control_indexes_thread.push(num_columns_thread);
+                                            num_columns_thread += 1;
+                                        }
+                                    }
+                                    Err(_n) => {
+                                        panic!(
+                                        "Number {} in line {} and column {} is not a decimal number",
+                                        field,
+                                        num_lines_thread + 1,
+                                        index + 1
+                                    );
+                                    }
+                                }
+                            }
+                            index += 1;
                         }
-                    }
-                    Err(_n) => {
-                        panic!(
-                            "Number {} in line {} and column {} is not a decimal number",
-                            field,
-                            num_lines + 1,
-                            index + 1
-                        );
+                        num_lines_thread += 1;
                     }
                 }
-            }
-            index += 1;
+                input_vector_temp
+                    .lock()
+                    .unwrap()
+                    .append(&mut input_vector_thread);
+                case_indexes_temp
+                    .lock()
+                    .unwrap()
+                    .append(&mut case_indexes_thread);
+                control_indexes_temp
+                    .lock()
+                    .unwrap()
+                    .append(&mut control_indexes_thread);
+                genes_names_temp
+                    .lock()
+                    .unwrap()
+                    .append(&mut genes_names_thread);
+                genes_symbols_temp
+                    .lock()
+                    .unwrap()
+                    .append(&mut genes_symbols_thread);
+                *num_lines_temp.lock().unwrap() += num_lines_thread;
+                if num_columns_thread > 0 {
+                    *num_columns_temp.lock().unwrap() += num_columns_thread;
+                }
+                drop(input_vector_temp);
+                drop(case_indexes_temp);
+                drop(control_indexes_temp);
+                drop(genes_names_temp);
+                drop(genes_symbols_temp);
+                drop(num_lines_temp);
+                drop(num_columns_temp);
+            });
+            handles.push(handle); // The handle (which contains the thread) is stored in the handles vector
+        }
+        for handle in handles {
+            // Wait for all threads to finish before proceeding further
+            handle.join().unwrap();
         }
-        num_lines += 1;
+        // Combining data from all different threads
+        input_vector.append(&mut *input_vector_temp.lock().unwrap());
+        case_indexes.append(&mut *case_indexes_temp.lock().unwrap());
+        control_indexes.append(&mut *control_indexes_temp.lock().unwrap());
+        gene_names.append(&mut *genes_names_temp.lock().unwrap());
+        gene_symbols.append(&mut *genes_symbols_temp.lock().unwrap());
+        num_lines += *num_lines_temp.lock().unwrap();
+        num_columns += *num_columns_temp.lock().unwrap();
     }
     //println!("case_indexes:{:?}", case_indexes);
     //println!("control_indexes:{:?}", control_indexes);
+    //println!("num_lines:{}", num_lines);
+    //println!("num_columns:{}", num_columns);
+    //println!("num_lines * num_columns:{}", num_lines * num_columns);
+    //println!("input_vector:{:?}", input_vector.len());
+    println!("Time for inputting data:{:?}", input_time.elapsed());
     let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
     //println!("dm:{:?}", dm);
     (dm, case_indexes, control_indexes, gene_names, gene_symbols)
@@ -198,6 +386,7 @@ fn main() {
                     let control_list: Vec<&str> = control_string.split(",").collect();
                     let (input_matrix, case_indexes, control_indexes, gene_names, gene_symbols) =
                         input_data(file_name, &case_list, &control_list);
+                    let filtering_time = Instant::now();
                     let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
                         filter_by_expr(
                             &input_matrix,
@@ -206,10 +395,21 @@ fn main() {
                             gene_names,
                             gene_symbols,
                         );
+                    println!("filtering time:{:?}", filtering_time.elapsed());
                     //println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
                     //println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
+                    let cpm_normalization_time = Instant::now();
                     let mut normalized_matrix = cpm(&filtered_matrix);
+                    println!(
+                        "cpm normalization time:{:?}",
+                        cpm_normalization_time.elapsed()
+                    );
+                    let tmm_normalization_time = Instant::now();
                     let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
+                    println!(
+                        "tmm normalization time:{:?}",
+                        tmm_normalization_time.elapsed()
+                    );
                     //println!("norm_factors:{:?}", norm_factors);
                     for col in 0..normalized_matrix.ncols() {
@@ -232,58 +432,154 @@ fn main() {
                     //println!("case_indexes:{:?}", case_indexes);
                     //println!("control_indexes:{:?}", control_indexes);
-                    for i in 0..normalized_matrix.nrows() {
-                        let row = normalized_matrix.row(i);
-                        //println!("row:{:?}", row);
-                        let mut treated = Vec::<f64>::new();
-                        let mut control = Vec::<f64>::new();
-                        //println!("conditions:{:?}", conditions);
-                        for j in 0..(case_indexes.len() + control_indexes.len()) {
-                            //println!("row[(0, j)]:{}", row[(0, j)]);
-                            if case_indexes.contains(&j) {
-                                treated.push(row[(0, j)]);
-                                //println!("{},{}", input_data_vec.0[i][j], "Diseased");
-                            } else if control_indexes.contains(&j) {
-                                // + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
-                                control.push(row[(0, j)]);
-                                //println!("{},{}", input_data_vec.0[i][j], "Control");
-                            } else {
-                                panic!("Column {} could not be classified into case/control", j);
+                    let num_normalized_rows = normalized_matrix.nrows();
+                    if normalized_matrix.nrows() * normalized_matrix.ncols() < PAR_CUTOFF {
+                        for i in 0..normalized_matrix.nrows() {
+                            let row = normalized_matrix.row(i);
+                            //println!("row:{:?}", row);
+                            let mut treated = Vec::<f64>::new();
+                            let mut control = Vec::<f64>::new();
+                            //println!("conditions:{:?}", conditions);
+                            for j in 0..(case_indexes.len() + control_indexes.len()) {
+                                //println!("row[(0, j)]:{}", row[(0, j)]);
+                                if case_indexes.contains(&j) {
+                                    treated.push(row[(0, j)]);
+                                    //println!("{},{}", input_data_vec.0[i][j], "Diseased");
+                                } else if control_indexes.contains(&j) {
+                                    // + 1 was added because in the input file the first column of the first row is blank as the first column consists of gene names
+                                    control.push(row[(0, j)]);
+                                    //println!("{},{}", input_data_vec.0[i][j], "Control");
+                                } else {
+                                    panic!(
+                                        "Column {} could not be classified into case/control",
+                                        j
+                                    );
+                                }
                             }
-                        }
-                        //println!("treated{:?}", treated);
-                        //println!("control{:?}", control);
-                        let p_value = wilcoxon_rank_sum_test(
-                            treated.clone(),
-                            control.clone(),
-                            THRESHOLD,
-                            't',
-                            true,
-                        ); // Setting continuity correction to true in case of normal approximation
-                        let treated_mean = Data::new(treated).mean();
-                        let control_mean = Data::new(control).mean();
-                        if (treated_mean.unwrap() / control_mean.unwrap())
-                            .log2()
-                            .is_nan()
-                            == false
-                            && (treated_mean.unwrap() / control_mean.unwrap())
+                            //println!("treated{:?}", treated);
+                            //println!("control{:?}", control);
+                            let p_value = wilcoxon_rank_sum_test(
+                                treated.clone(),
+                                control.clone(),
+                                THRESHOLD,
+                                't',
+                                true,
+                            ); // Setting continuity correction to true in case of normal approximation
+                            let treated_mean = Data::new(treated).mean();
+                            let control_mean = Data::new(control).mean();
+                            if (treated_mean.unwrap() / control_mean.unwrap())
                                 .log2()
-                                .is_infinite()
+                                .is_nan()
                                 == false
-                        {
-                            p_values.push(PValueIndexes {
-                                index: i,
-                                gene_name: filtered_genes[i].to_owned(),
-                                gene_symbol: filtered_gene_symbols[i].to_owned(),
-                                fold_change: (treated_mean.unwrap() / control_mean.unwrap()).log2(),
-                                p_value: p_value,
+                                && (treated_mean.unwrap() / control_mean.unwrap())
+                                    .log2()
+                                    .is_infinite()
+                                    == false
+                            {
+                                p_values.push(PValueIndexes {
+                                    index: i,
+                                    gene_name: filtered_genes[i].to_owned(),
+                                    gene_symbol: filtered_gene_symbols[i].to_owned(),
+                                    fold_change: (treated_mean.unwrap() / control_mean.unwrap())
+                                        .log2(),
+                                    p_value: p_value,
+                                });
+                            }
+                        }
+                    } else {
+                        // Multithreaded implementation of calculating wilcoxon p-values
+                        let normalized_matrix_temp = Arc::new(normalized_matrix);
+                        let filtered_genes_temp = Arc::new(filtered_genes);
+                        let filtered_gene_symbols_temp = Arc::new(filtered_gene_symbols);
+                        let case_indexes_temp = Arc::new(case_indexes);
+                        let control_indexes_temp = Arc::new(control_indexes);
+                        let p_values_temp = Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
+                        let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
+                        for thread_num in 0..max_threads {
+                            let normalized_matrix_temp = Arc::clone(&normalized_matrix_temp);
+                            let case_indexes_temp = Arc::clone(&case_indexes_temp);
+                            let control_indexes_temp = Arc::clone(&control_indexes_temp);
+                            let p_values_temp = Arc::clone(&p_values_temp);
+                            let filtered_genes_temp = Arc::clone(&filtered_genes_temp);
+                            let filtered_gene_symbols_temp =
+                                Arc::clone(&filtered_gene_symbols_temp);
+                            let handle = thread::spawn(move || {
+                                let mut p_values_thread: Vec<PValueIndexes> = Vec::with_capacity(
+                                    normalized_matrix_temp.nrows() / max_threads,
+                                );
+                                for i in 0..normalized_matrix_temp.nrows() {
+                                    let remainder: usize = i % max_threads; // Calculate remainder of iteration number divided by max_threads to decide which thread parses the row
+                                    if remainder == thread_num {
+                                        let row = normalized_matrix_temp.row(i);
+                                        //println!("row:{:?}", row);
+                                        let mut treated = Vec::<f64>::new();
+                                        let mut control = Vec::<f64>::new();
+                                        //println!("conditions:{:?}", conditions);
+                                        for j in 0..(case_indexes_temp.len()
+                                            + control_indexes_temp.len())
+                                        {
+                                            //println!("row[(0, j)]:{}", row[(0, j)]);
+                                            if case_indexes_temp.contains(&j) {
+                                                treated.push(row[(0, j)]);
+                                                //println!("{},{}", input_data_vec.0[i][j], "Diseased");
+                                            } else if control_indexes_temp.contains(&j) {
+                                                // + 1 was added because in the input file the first column of the first row is blank as the first column consists of gene names
+                                                control.push(row[(0, j)]);
+                                                //println!("{},{}", input_data_vec.0[i][j], "Control");
+                                            } else {
+                                                panic!(
+                                        "Column {} could not be classified into case/control",
+                                        j
+                                    );
+                                            }
+                                        }
+                                        //println!("treated{:?}", treated);
+                                        //println!("control{:?}", control);
+                                        let p_value = wilcoxon_rank_sum_test(
+                                            treated.clone(),
+                                            control.clone(),
+                                            THRESHOLD,
+                                            't',
+                                            true,
+                                        ); // Setting continuity correction to true in case of normal approximation
+                                        let treated_mean = Data::new(treated).mean();
+                                        let control_mean = Data::new(control).mean();
+                                        if (treated_mean.unwrap() / control_mean.unwrap())
+                                            .log2()
+                                            .is_nan()
+                                            == false
+                                            && (treated_mean.unwrap() / control_mean.unwrap())
+                                                .log2()
+                                                .is_infinite()
+                                                == false
+                                        {
+                                            p_values_thread.push(PValueIndexes {
+                                                index: i,
+                                                gene_name: filtered_genes_temp[i].to_owned(),
+                                                gene_symbol: filtered_gene_symbols_temp[i]
+                                                    .to_owned(),
+                                                fold_change: (treated_mean.unwrap()
+                                                    / control_mean.unwrap())
+                                                .log2(),
+                                                p_value: p_value,
+                                            });
+                                        }
+                                    }
+                                }
+                                p_values_temp.lock().unwrap().append(&mut p_values_thread);
                             });
+                            handles.push(handle);
                         }
+                        for handle in handles {
+                            // Wait for all threads to finish before proceeding further
+                            handle.join().unwrap();
+                        }
+                        p_values.append(&mut *p_values_temp.lock().unwrap());
                     }
                     //println!("p_values:{:?}", p_values);
                     println!(
                         "Time for running {} wilcoxon tests:{:?}",
-                        normalized_matrix.nrows(),
+                        num_normalized_rows,
                         now2.elapsed()
                     );
                     let adjusted_p_values = adjust_p_values(p_values);
@@ -408,18 +704,62 @@ fn tmm_normalization(
         }
     }
     //println!("ref_column:{}", ref_column);
-    let ref_data = input_matrix.column(ref_column);
-    let ref_lib_size = lib_sizes[ref_column];
+    let num_cols = input_matrix.ncols();
     let mut f: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
-    for col in 0..input_matrix.ncols() {
-        let obs_data = input_matrix.column(col);
-        let obs_lib_size = lib_sizes[col];
-        f.push(calc_factor_tmm(
-            obs_data,
-            &ref_data,
-            ref_lib_size,
-            obs_lib_size,
-        ));
+    if input_matrix.nrows() * input_matrix.ncols() < PAR_CUTOFF {
+        let ref_data = input_matrix.column(ref_column);
+        let ref_lib_size = lib_sizes[ref_column];
+        for col in 0..input_matrix.ncols() {
+            let obs_data = input_matrix.column(col);
+            let obs_lib_size = lib_sizes[col];
+            f.push(calc_factor_tmm(
+                obs_data,
+                &ref_data,
+                ref_lib_size,
+                obs_lib_size,
+            ));
+        }
+    } else {
+        // Multithreaded implementation of TMM normalization
+        let f_temp = Arc::new(Mutex::new(Vec::<f_index>::new()));
+        let lib_sizes_temp = Arc::new(lib_sizes.to_owned());
+        let input_matrix_temp = Arc::new(input_matrix);
+        let mut handles = vec![]; // Vector to store the thread handles, which are joined below to wait for all threads to finish
+        for thread_num in 0..max_threads {
+            let f_temp = Arc::clone(&f_temp);
+            let lib_sizes_temp = Arc::clone(&lib_sizes_temp);
+            let input_matrix_temp = Arc::clone(&input_matrix_temp);
+            let handle = thread::spawn(move || {
+                let mut f_thread: Vec<f_index> =
+                    Vec::with_capacity(input_matrix_temp.ncols() / max_threads);
+                let ref_data = input_matrix_temp.column(ref_column);
+                let ref_lib_size = lib_sizes_temp[ref_column];
+                for col in 0..input_matrix_temp.ncols() {
+                    let remainder: usize = col % max_threads; // Calculate remainder of column number divided by max_threads to decide which thread parses this column
+                    if remainder == thread_num {
+                        let obs_data = input_matrix_temp.column(col);
+                        let obs_lib_size = lib_sizes_temp[col];
+                        f_thread.push(f_index {
+                            f: calc_factor_tmm(obs_data, &ref_data, ref_lib_size, obs_lib_size),
+                            ind: col,
+                        })
+                    }
+                }
+                f_temp.lock().unwrap().append(&mut f_thread);
+            });
+            handles.push(handle);
+        }
+        for handle in handles {
+            // Wait for all threads to finish before proceeding further
+            handle.join().unwrap();
+        }
+        let mut f_orig: Vec<f_index> = Vec::with_capacity(num_cols);
+        f_orig.append(&mut *f_temp.lock().unwrap());
+        // Need to sort the vector because it will not be ordered according to ind because of multithreading
+        f_orig
+            .as_mut_slice()
+            .sort_by(|a, b| (a.ind).partial_cmp(&b.ind).unwrap_or(Ordering::Equal));
+        f = f_orig.into_iter().map(|x| x.f).collect::<Vec<f64>>();
     }
     const NATURAL_E: f64 = 2.718281828459;
     let log_f: Vec<f64> = f.clone().into_iter().map(|x| x.log(NATURAL_E)).collect();
@@ -427,6 +767,11 @@ fn tmm_normalization(
     let final_f: Vec<f64> = f.into_iter().map(|x| x / exp_mean_log_f).collect();
     final_f
 }
+#[allow(non_camel_case_types)]
+struct f_index {
+    f: f64,
+    ind: usize,
+}
 fn calc_factor_tmm(
     obs_data: Matrix<f64, Dyn, Const<1>, ViewStorage<'_, f64, Dyn, Const<1>, Const<1>, Dyn>>,

package/src/indel.rs CHANGED Viewed

@@ -1,7 +1,7 @@
 // Syntax: cd .. && cargo build --release
 // Test case below:
-//Debug syntax: cd .. && cargo build --release && time cat ~/sjpp/test.txt | ~/proteinpaint/rust/target/release/indel
+//Debug syntax: cd .. && cargo build --release && time cat ~/sjpp/test.txt | ~/sjpp/proteinpaint/rust/target/release/indel
 // Strictness:
 //   0: No postprocessing, pure indel typing results
@@ -848,7 +848,7 @@ fn main() {
                                     let remainder: usize = iter % max_threads; // Calculate remainder of read number divided by max_threads to decide which thread parses this read
                                                                                //println!("iter:{}", iter);
                                     if remainder == thread_num {
-                                        // Thread analyzing a particular read must have the same remainder as the thread_num, this avoids multiple reads from parsing the same read. Also checking if the read length > 0
+                                        // Thread analyzing a particular read must have the same remainder as the thread_num; this prevents multiple threads from parsing the same read. Also checking if the read length > 0
                                         //println!(
                                         //    "start_positions_list:{}",
@@ -1356,7 +1356,12 @@ fn main() {
                     //let mut output_string = "[".to_string();
                     //output_string += &all_alleles.to_string();
                     output_string.pop();
-                    output_string += &"]".to_string();
+                    if output_string.len() == 0 {
+                        // Pass empty JSON "[]" when no reads are passed back to nodejs
+                        output_string = "[]".to_string();
+                    } else {
+                        output_string += &"]".to_string();
+                    }
                     println!("Final_output:{:?}", output_string);
                 }
                 Err(error) => println!("Incorrect json: {}", error),