@sjcrh/proteinpaint-rust 2.40.6 → 2.44.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -19,14 +19,13 @@ plotters = "0.3.4"
19
19
  colorgrad = "0.6.2"
20
20
  statrs = "^0.16.0"
21
21
  fishers_exact="^1.0.1"
22
- bio = "^0.39"
22
+ bio = "1.5.0"
23
23
  bigtools = "^0.1.11"
24
24
  libmath = "^0.2.1"
25
25
  json = "^0.12.4"
26
26
  serde = {version = "^1.0.147", features = ["derive"]}
27
27
  serde_json="^1.0.88"
28
28
  num = "^0.4.1"
29
- csv = "^1.2.2"
30
29
  r_mathlib="^0.2.0"
31
30
  tokio = { version="1", features = ["full"] }
32
31
  reqwest = "0.11"
package/package.json CHANGED
@@ -1,37 +1,37 @@
1
1
  {
2
- "version": "2.40.6",
3
- "name": "@sjcrh/proteinpaint-rust",
4
- "description": "Rust-based utilities for proteinpaint",
5
- "main": "index.js",
6
- "bin": {
7
- "proteinpaint-rust": "index.js"
8
- },
9
- "scripts": {
10
- "dev": "cargo build --release",
11
- "build": "cargo build --release",
12
- "postinstall": "if [ ! -d ./test ] & [ ! -d ./target/release ]; then cargo build --release; fi",
13
- "test": "tape **/test/*.spec.js",
14
- "test:unit": "tape **/test/*.unit.spec.js",
15
- "test:integration": "echo 'TODO: rust integration tests'"
16
- },
17
- "author": "Robin Paul",
18
- "license": "SEE LICENSE IN ./LICENSE",
19
- "repository": {
20
- "type": "git",
21
- "url": "https://github.com/stjude/proteinpaint.git",
22
- "directory": "rust"
23
- },
24
- "files": [
25
- "index.js",
26
- "Cargo.toml",
27
- "src",
28
- "LICENSE/*"
29
- ],
30
- "bugs": {
31
- "url": "https://github.com/stjude/proteinpaint"
32
- },
33
- "homepage": "https://github.com/stjude/proteinpaint#readme",
34
- "devDependencies": {
35
- "tape": "^5.2.2"
36
- }
2
+ "version": "2.44.0",
3
+ "name": "@sjcrh/proteinpaint-rust",
4
+ "description": "Rust-based utilities for proteinpaint",
5
+ "main": "index.js",
6
+ "bin": {
7
+ "proteinpaint-rust": "index.js"
8
+ },
9
+ "scripts": {
10
+ "dev": "cargo clean && cargo build --release",
11
+ "build": "cargo clean && cargo build --release",
12
+ "postinstall": "if [ ! -d ./test ] & [ ! -d ./target/release ]; then cargo clean && cargo build --release; fi",
13
+ "test": "tape **/test/*.spec.js",
14
+ "test:unit": "tape **/test/*.unit.spec.js",
15
+ "test:integration": "echo 'TODO: rust integration tests'"
16
+ },
17
+ "author": "Robin Paul",
18
+ "license": "SEE LICENSE IN ./LICENSE",
19
+ "repository": {
20
+ "type": "git",
21
+ "url": "https://github.com/stjude/proteinpaint.git",
22
+ "directory": "rust"
23
+ },
24
+ "files": [
25
+ "index.js",
26
+ "Cargo.toml",
27
+ "src",
28
+ "LICENSE/*"
29
+ ],
30
+ "bugs": {
31
+ "url": "https://github.com/stjude/proteinpaint"
32
+ },
33
+ "homepage": "https://github.com/stjude/proteinpaint#readme",
34
+ "devDependencies": {
35
+ "tape": "^5.2.2"
36
+ }
37
37
  }
package/src/DEanalysis.rs CHANGED
@@ -1,5 +1,5 @@
1
1
  // cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
2
- // cd .. && cargo build --release && cat ~/sjpp/test.txt | target/release/DEanalysis
2
+ // cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
3
3
  #![allow(non_snake_case)]
4
4
  use json;
5
5
  use nalgebra::base::dimension::Const;
@@ -15,13 +15,46 @@ use statrs::statistics::Data;
15
15
  use statrs::statistics::Distribution;
16
16
  use statrs::statistics::Median;
17
17
  use std::cmp::Ordering;
18
- use std::path::Path;
18
+ use std::fs::File;
19
+ use std::io::Read;
19
20
  use std::str::FromStr;
21
+ use std::sync::{Arc, Mutex}; // Multithreading library
22
+ use std::thread;
20
23
  use std::time::Instant;
21
24
  //use std::cmp::Ordering;
22
25
  //use std::env;
23
26
  use std::io;
24
27
  //mod stats_functions; // Importing Wilcoxon function from stats_functions.rs
28
+ const PAR_CUTOFF: usize = 100000; // Cutoff for triggering multithreading processing of data
29
+
30
+ //const PAR_CUTOFF: usize = 1000000000000000;
31
+ #[allow(non_upper_case_globals)]
32
+ const max_threads: usize = 6; // Max number of threads in case the parallel processing of reads is invoked
33
+
34
/// Looks up `y` in `input`, which must be sorted in ascending order.
///
/// Returns the position of a matching element as an `i64`, or `-1` when `y`
/// is not present (callers compare the result against `-1`). An empty
/// `input` yields `-1` instead of underflowing on `len() - 1`.
fn binary_search(input: &Vec<usize>, y: usize) -> i64 {
    // Guard: the closed-interval search below starts at `len() - 1`, which
    // would underflow (panic in debug, wrap in release) for an empty vector.
    if input.is_empty() {
        return -1;
    }
    let mut low: usize = 0;
    let mut high: usize = input.len() - 1;
    while low <= high {
        // Midpoint written as `low + (high - low) / 2` so the sum cannot overflow.
        let mid = low + (high - low) / 2;
        match y.cmp(&input[mid]) {
            Ordering::Equal => return mid as i64,
            // Target is larger: discard the left half.
            Ordering::Greater => low = mid + 1,
            // Target is smaller: discard the right half.
            Ordering::Less => {
                if mid == 0 {
                    // `mid - 1` would underflow; nothing left to search.
                    break;
                }
                high = mid - 1;
            }
        }
    }
    -1
}
25
58
 
26
59
  fn input_data(
27
60
  filename: &String,
@@ -34,9 +67,9 @@ fn input_data(
34
67
  Vec<String>,
35
68
  Vec<String>,
36
69
  ) {
37
- // Build the CSV reader and iterate over each record.
38
- let path = Path::new(filename);
39
- let mut rdr = csv::Reader::from_path(path).unwrap();
70
+ let input_time = Instant::now();
71
+ //let mut rdr = csv::Reader::from_path(path).unwrap();
72
+ let mut file = File::open(filename).unwrap();
40
73
  let mut num_lines: usize = 0;
41
74
  let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
42
75
  let mut gene_names: Vec<String> = Vec::with_capacity(65000);
@@ -44,11 +77,12 @@ fn input_data(
44
77
  let mut num_columns: usize = 0;
45
78
 
46
79
  // Check headers for samples
47
- let header_line = rdr.headers().unwrap();
48
- let mut headers: Vec<&str> = Vec::with_capacity(1500);
49
- for field in header_line.iter() {
50
- headers = field.split('\t').collect::<Vec<&str>>();
51
- }
80
+ let mut buffer = String::new();
81
+ file.read_to_string(&mut buffer).unwrap();
82
+ // Check headers for samples
83
+ let lines: Vec<&str> = buffer.split('\n').collect::<Vec<&str>>();
84
+ let total_lines = lines.len();
85
+ let headers: Vec<&str> = lines[0].split('\t').collect::<Vec<&str>>();
52
86
  //println!("headers:{:?}", headers);
53
87
  let mut case_indexes_original: Vec<usize> = Vec::with_capacity(case_list.len());
54
88
  let mut control_indexes_original: Vec<usize> = Vec::with_capacity(control_list.len());
@@ -68,6 +102,7 @@ fn input_data(
68
102
  }
69
103
  }
70
104
  }
105
+ let num_cases = case_list.len();
71
106
 
72
107
  for item in control_list {
73
108
  //println!("item:{}", item);
@@ -80,70 +115,223 @@ fn input_data(
80
115
  }
81
116
  }
82
117
  }
118
+ let num_controls = control_list.len();
83
119
  //println!("case_indexes_original:{:?}", case_indexes_original);
84
120
  //println!("control_indexes_original:{:?}", control_indexes_original);
85
-
121
+ case_indexes_original.sort();
122
+ case_indexes_original.dedup();
123
+ control_indexes_original.sort();
124
+ control_indexes_original.dedup();
86
125
  let mut case_indexes: Vec<usize> = Vec::with_capacity(case_list.len());
87
126
  let mut control_indexes: Vec<usize> = Vec::with_capacity(control_list.len());
88
- for result in rdr.records() {
89
- // The iterator yields Result<StringRecord, Error>, so we check the
90
- // error here.
91
- let record = result.unwrap();
92
- //println!("record:{:?}", record);
93
- let mut index = 0;
94
- for field in record[0].split('\t').collect::<Vec<&str>>() {
95
- if index == gene_name_index.unwrap() {
96
- gene_names.push(field.to_string());
97
- } else if index == gene_symbol_index.unwrap() {
98
- gene_symbols.push(field.to_string());
99
- } else if case_indexes_original.contains(&index) {
100
- let num = FromStr::from_str(field);
101
- match num {
102
- Ok(n) => {
103
- //println!("n:{}", n);
104
- input_vector.push(n);
105
- if num_lines == 0 {
106
- case_indexes.push(num_columns);
107
- num_columns += 1;
127
+ if lines.len() * (case_indexes_original.len() + control_indexes_original.len()) < PAR_CUTOFF {
128
+ // If number of lines is below this number
129
+ let lines_slice = &lines[..];
130
+ for line_iter in 1..lines_slice.len() - 1 {
131
+ // Subtracting 1 from total length of lines_slice because the last one will be empty
132
+ let line = lines_slice[line_iter];
133
+ let mut index = 0;
134
+ for field in line.split('\t').collect::<Vec<&str>>() {
135
+ if index == gene_name_index.unwrap() {
136
+ gene_names.push(field.to_string());
137
+ } else if index == gene_symbol_index.unwrap() {
138
+ gene_symbols.push(field.to_string());
139
+ } else if binary_search(&case_indexes_original, index) != -1 {
140
+ let num = FromStr::from_str(field);
141
+ match num {
142
+ Ok(n) => {
143
+ //println!("n:{}", n);
144
+ input_vector.push(n);
145
+ if num_lines == 0 {
146
+ case_indexes.push(num_columns);
147
+ num_columns += 1;
148
+ }
149
+ }
150
+ Err(_n) => {
151
+ panic!(
152
+ "Number {} in line {} and column {} is not a decimal number",
153
+ field,
154
+ num_lines + 1,
155
+ index + 1
156
+ );
108
157
  }
109
158
  }
110
- Err(_n) => {
111
- panic!(
112
- "Number {} in line {} and column {} is not a decimal number",
113
- field,
114
- num_lines + 1,
115
- index + 1
116
- );
159
+ } else if binary_search(&control_indexes_original, index) != -1 {
160
+ let num = FromStr::from_str(field);
161
+ match num {
162
+ Ok(n) => {
163
+ //println!("n:{}", n);
164
+ input_vector.push(n);
165
+ if num_lines == 0 {
166
+ control_indexes.push(num_columns);
167
+ num_columns += 1;
168
+ }
169
+ }
170
+ Err(_n) => {
171
+ panic!(
172
+ "Number {} in line {} and column {} is not a decimal number",
173
+ field,
174
+ num_lines + 1,
175
+ index + 1
176
+ );
177
+ }
117
178
  }
118
179
  }
119
- } else if control_indexes_original.contains(&index) {
120
- let num = FromStr::from_str(field);
121
- match num {
122
- Ok(n) => {
123
- //println!("n:{}", n);
124
- input_vector.push(n);
125
- if num_lines == 0 {
126
- control_indexes.push(num_columns);
127
- num_columns += 1;
180
+ index += 1;
181
+ }
182
+ num_lines += 1;
183
+ }
184
+ } else {
185
+ // Multithreaded implementation for parsing data in parallel starts from here
186
+ // In Rust a value normally has a single owner; wrapping data in `Arc` lets multiple threads share ownership of the same data.
187
+ let case_indexes_original = Arc::new(case_indexes_original);
188
+ let control_indexes_original = Arc::new(control_indexes_original);
189
+ let buffer = Arc::new(buffer);
190
+ let case_indexes_temp = Arc::new(Mutex::new(Vec::<usize>::with_capacity(case_list.len())));
191
+ let control_indexes_temp =
192
+ Arc::new(Mutex::new(Vec::<usize>::with_capacity(control_list.len())));
193
+ let num_lines_temp = Arc::new(Mutex::<usize>::new(0));
194
+ let num_columns_temp = Arc::new(Mutex::<usize>::new(0));
195
+ let genes_names_temp = Arc::new(Mutex::new(Vec::<String>::new()));
196
+ let genes_symbols_temp = Arc::new(Mutex::new(Vec::<String>::new()));
197
+ let input_vector_temp = Arc::new(Mutex::new(Vec::<f64>::new()));
198
+ let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
199
+ println!("Number of threads used:{}", max_threads);
200
+ for thread_num in 0..max_threads {
201
+ let case_indexes_original = Arc::clone(&case_indexes_original);
202
+ let control_indexes_original = Arc::clone(&control_indexes_original);
203
+ let case_indexes_temp = Arc::clone(&case_indexes_temp);
204
+ let control_indexes_temp = Arc::clone(&control_indexes_temp);
205
+ let input_vector_temp = Arc::clone(&input_vector_temp);
206
+ let genes_names_temp = Arc::clone(&genes_names_temp);
207
+ let genes_symbols_temp = Arc::clone(&genes_symbols_temp);
208
+ let num_lines_temp = Arc::clone(&num_lines_temp);
209
+ let num_columns_temp = Arc::clone(&num_columns_temp);
210
+ let buffer = Arc::clone(&buffer);
211
+ let handle = thread::spawn(move || {
212
+ let mut case_indexes_thread: Vec<usize> = Vec::with_capacity(num_cases);
213
+ let mut control_indexes_thread: Vec<usize> = Vec::with_capacity(num_controls);
214
+ let mut genes_names_thread: Vec<String> = Vec::with_capacity(65000);
215
+ let mut genes_symbols_thread: Vec<String> = Vec::with_capacity(65000);
216
+ let mut input_vector_thread: Vec<f64> = Vec::with_capacity(65000);
217
+ let mut num_columns_thread: usize = 0;
218
+ let mut num_lines_thread: usize = 0;
219
+ let lines: Vec<&str> = buffer.split('\n').collect();
220
+ //println!("case_indexes_original:{:?}", case_indexes_original);
221
+ //println!("control_indexes:{:?}", control_indexes);
222
+ for line_iter in 1..total_lines - 1 {
223
+ let remainder: usize = line_iter % max_threads; // Calculate remainder of line number divided by max_threads to decide which thread parses this line
224
+ if remainder == thread_num {
225
+ //println!("buffer:{}", buffer);
226
+ // Thread analyzing a particular line must have the same remainder as the thread_num, this avoids multiple threads from parsing the same line
227
+ let line = lines[line_iter];
228
+ let mut index = 0;
229
+ for field in line.split('\t').collect::<Vec<&str>>() {
230
+ if index == gene_name_index.unwrap() {
231
+ genes_names_thread.push(field.to_string());
232
+ } else if index == gene_symbol_index.unwrap() {
233
+ genes_symbols_thread.push(field.to_string());
234
+ } else if binary_search(&case_indexes_original, index) != -1 {
235
+ let num = FromStr::from_str(field);
236
+ match num {
237
+ Ok(n) => {
238
+ //println!("n:{}", n);
239
+ input_vector_thread.push(n);
240
+ if line_iter == 1 {
241
+ case_indexes_thread.push(num_columns_thread);
242
+ num_columns_thread += 1;
243
+ }
244
+ }
245
+ Err(_n) => {
246
+ panic!(
247
+ "Number {} in line {} and column {} is not a decimal number",
248
+ field,
249
+ num_lines_thread + 1,
250
+ index + 1
251
+ );
252
+ }
253
+ }
254
+ } else if binary_search(&control_indexes_original, index) != -1 {
255
+ let num = FromStr::from_str(field);
256
+ match num {
257
+ Ok(n) => {
258
+ //println!("n:{}", n);
259
+ input_vector_thread.push(n);
260
+ if line_iter == 1 {
261
+ control_indexes_thread.push(num_columns_thread);
262
+ num_columns_thread += 1;
263
+ }
264
+ }
265
+ Err(_n) => {
266
+ panic!(
267
+ "Number {} in line {} and column {} is not a decimal number",
268
+ field,
269
+ num_lines_thread + 1,
270
+ index + 1
271
+ );
272
+ }
273
+ }
274
+ }
275
+ index += 1;
128
276
  }
129
- }
130
- Err(_n) => {
131
- panic!(
132
- "Number {} in line {} and column {} is not a decimal number",
133
- field,
134
- num_lines + 1,
135
- index + 1
136
- );
277
+ num_lines_thread += 1;
137
278
  }
138
279
  }
139
- }
140
- index += 1;
280
+ input_vector_temp
281
+ .lock()
282
+ .unwrap()
283
+ .append(&mut input_vector_thread);
284
+ case_indexes_temp
285
+ .lock()
286
+ .unwrap()
287
+ .append(&mut case_indexes_thread);
288
+ control_indexes_temp
289
+ .lock()
290
+ .unwrap()
291
+ .append(&mut control_indexes_thread);
292
+ genes_names_temp
293
+ .lock()
294
+ .unwrap()
295
+ .append(&mut genes_names_thread);
296
+ genes_symbols_temp
297
+ .lock()
298
+ .unwrap()
299
+ .append(&mut genes_symbols_thread);
300
+ *num_lines_temp.lock().unwrap() += num_lines_thread;
301
+ if num_columns_thread > 0 {
302
+ *num_columns_temp.lock().unwrap() += num_columns_thread;
303
+ }
304
+ drop(input_vector_temp);
305
+ drop(case_indexes_temp);
306
+ drop(control_indexes_temp);
307
+ drop(genes_names_temp);
308
+ drop(genes_symbols_temp);
309
+ drop(num_lines_temp);
310
+ drop(num_columns_temp);
311
+ });
312
+ handles.push(handle); // The handle (which contains the thread) is stored in the handles vector
313
+ }
314
+ for handle in handles {
315
+ // Wait for all threads to finish before proceeding further
316
+ handle.join().unwrap();
141
317
  }
142
- num_lines += 1;
318
+ // Combining data from all different threads
319
+ input_vector.append(&mut *input_vector_temp.lock().unwrap());
320
+ case_indexes.append(&mut *case_indexes_temp.lock().unwrap());
321
+ control_indexes.append(&mut *control_indexes_temp.lock().unwrap());
322
+ gene_names.append(&mut *genes_names_temp.lock().unwrap());
323
+ gene_symbols.append(&mut *genes_symbols_temp.lock().unwrap());
324
+
325
+ num_lines += *num_lines_temp.lock().unwrap();
326
+ num_columns += *num_columns_temp.lock().unwrap();
143
327
  }
144
328
  //println!("case_indexes:{:?}", case_indexes);
145
329
  //println!("control_indexes:{:?}", control_indexes);
146
-
330
+ //println!("num_lines:{}", num_lines);
331
+ //println!("num_columns:{}", num_columns);
332
+ //println!("num_lines * num_columns:{}", num_lines * num_columns);
333
+ //println!("input_vector:{:?}", input_vector.len());
334
+ println!("Time for inputting data:{:?}", input_time.elapsed());
147
335
  let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
148
336
  //println!("dm:{:?}", dm);
149
337
  (dm, case_indexes, control_indexes, gene_names, gene_symbols)
@@ -198,6 +386,7 @@ fn main() {
198
386
  let control_list: Vec<&str> = control_string.split(",").collect();
199
387
  let (input_matrix, case_indexes, control_indexes, gene_names, gene_symbols) =
200
388
  input_data(file_name, &case_list, &control_list);
389
+ let filtering_time = Instant::now();
201
390
  let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
202
391
  filter_by_expr(
203
392
  &input_matrix,
@@ -206,10 +395,21 @@ fn main() {
206
395
  gene_names,
207
396
  gene_symbols,
208
397
  );
398
+ println!("filtering time:{:?}", filtering_time.elapsed());
209
399
  //println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
210
400
  //println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
401
+ let cpm_normalization_time = Instant::now();
211
402
  let mut normalized_matrix = cpm(&filtered_matrix);
403
+ println!(
404
+ "cpm normalization time:{:?}",
405
+ cpm_normalization_time.elapsed()
406
+ );
407
+ let tmm_normalization_time = Instant::now();
212
408
  let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
409
+ println!(
410
+ "tmm normalization time:{:?}",
411
+ tmm_normalization_time.elapsed()
412
+ );
213
413
  //println!("norm_factors:{:?}", norm_factors);
214
414
 
215
415
  for col in 0..normalized_matrix.ncols() {
@@ -232,58 +432,154 @@ fn main() {
232
432
 
233
433
  //println!("case_indexes:{:?}", case_indexes);
234
434
  //println!("control_indexes:{:?}", control_indexes);
235
- for i in 0..normalized_matrix.nrows() {
236
- let row = normalized_matrix.row(i);
237
- //println!("row:{:?}", row);
238
- let mut treated = Vec::<f64>::new();
239
- let mut control = Vec::<f64>::new();
240
- //println!("conditions:{:?}", conditions);
241
- for j in 0..(case_indexes.len() + control_indexes.len()) {
242
- //println!("row[(0, j)]:{}", row[(0, j)]);
243
- if case_indexes.contains(&j) {
244
- treated.push(row[(0, j)]);
245
- //println!("{},{}", input_data_vec.0[i][j], "Diseased");
246
- } else if control_indexes.contains(&j) {
247
- // + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
248
- control.push(row[(0, j)]);
249
- //println!("{},{}", input_data_vec.0[i][j], "Control");
250
- } else {
251
- panic!("Column {} could not be classified into case/control", j);
435
+ let num_normalized_rows = normalized_matrix.nrows();
436
+ if normalized_matrix.nrows() * normalized_matrix.ncols() < PAR_CUTOFF {
437
+ for i in 0..normalized_matrix.nrows() {
438
+ let row = normalized_matrix.row(i);
439
+ //println!("row:{:?}", row);
440
+ let mut treated = Vec::<f64>::new();
441
+ let mut control = Vec::<f64>::new();
442
+ //println!("conditions:{:?}", conditions);
443
+ for j in 0..(case_indexes.len() + control_indexes.len()) {
444
+ //println!("row[(0, j)]:{}", row[(0, j)]);
445
+ if case_indexes.contains(&j) {
446
+ treated.push(row[(0, j)]);
447
+ //println!("{},{}", input_data_vec.0[i][j], "Diseased");
448
+ } else if control_indexes.contains(&j) {
449
+ // + 1 was added because in the input file the first column of the first row is blank as the first column consists of gene names
450
+ control.push(row[(0, j)]);
451
+ //println!("{},{}", input_data_vec.0[i][j], "Control");
452
+ } else {
453
+ panic!(
454
+ "Column {} could not be classified into case/control",
455
+ j
456
+ );
457
+ }
252
458
  }
253
- }
254
- //println!("treated{:?}", treated);
255
- //println!("control{:?}", control);
256
- let p_value = wilcoxon_rank_sum_test(
257
- treated.clone(),
258
- control.clone(),
259
- THRESHOLD,
260
- 't',
261
- true,
262
- ); // Setting continuity correction to true in case of normal approximation
263
- let treated_mean = Data::new(treated).mean();
264
- let control_mean = Data::new(control).mean();
265
- if (treated_mean.unwrap() / control_mean.unwrap())
266
- .log2()
267
- .is_nan()
268
- == false
269
- && (treated_mean.unwrap() / control_mean.unwrap())
459
+ //println!("treated{:?}", treated);
460
+ //println!("control{:?}", control);
461
+ let p_value = wilcoxon_rank_sum_test(
462
+ treated.clone(),
463
+ control.clone(),
464
+ THRESHOLD,
465
+ 't',
466
+ true,
467
+ ); // Setting continuity correction to true in case of normal approximation
468
+ let treated_mean = Data::new(treated).mean();
469
+ let control_mean = Data::new(control).mean();
470
+ if (treated_mean.unwrap() / control_mean.unwrap())
270
471
  .log2()
271
- .is_infinite()
472
+ .is_nan()
272
473
  == false
273
- {
274
- p_values.push(PValueIndexes {
275
- index: i,
276
- gene_name: filtered_genes[i].to_owned(),
277
- gene_symbol: filtered_gene_symbols[i].to_owned(),
278
- fold_change: (treated_mean.unwrap() / control_mean.unwrap()).log2(),
279
- p_value: p_value,
474
+ && (treated_mean.unwrap() / control_mean.unwrap())
475
+ .log2()
476
+ .is_infinite()
477
+ == false
478
+ {
479
+ p_values.push(PValueIndexes {
480
+ index: i,
481
+ gene_name: filtered_genes[i].to_owned(),
482
+ gene_symbol: filtered_gene_symbols[i].to_owned(),
483
+ fold_change: (treated_mean.unwrap() / control_mean.unwrap())
484
+ .log2(),
485
+ p_value: p_value,
486
+ });
487
+ }
488
+ }
489
+ } else {
490
+ // Multithreaded implementation of calculating wilcoxon p-values
491
+ let normalized_matrix_temp = Arc::new(normalized_matrix);
492
+ let filtered_genes_temp = Arc::new(filtered_genes);
493
+ let filtered_gene_symbols_temp = Arc::new(filtered_gene_symbols);
494
+ let case_indexes_temp = Arc::new(case_indexes);
495
+ let control_indexes_temp = Arc::new(control_indexes);
496
+ let p_values_temp = Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
497
+ let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
498
+ for thread_num in 0..max_threads {
499
+ let normalized_matrix_temp = Arc::clone(&normalized_matrix_temp);
500
+ let case_indexes_temp = Arc::clone(&case_indexes_temp);
501
+ let control_indexes_temp = Arc::clone(&control_indexes_temp);
502
+ let p_values_temp = Arc::clone(&p_values_temp);
503
+ let filtered_genes_temp = Arc::clone(&filtered_genes_temp);
504
+ let filtered_gene_symbols_temp =
505
+ Arc::clone(&filtered_gene_symbols_temp);
506
+ let handle = thread::spawn(move || {
507
+ let mut p_values_thread: Vec<PValueIndexes> = Vec::with_capacity(
508
+ normalized_matrix_temp.nrows() / max_threads,
509
+ );
510
+ for i in 0..normalized_matrix_temp.nrows() {
511
+ let remainder: usize = i % max_threads; // Calculate remainder of iteration number divided by max_threads to decide which thread parses the row
512
+ if remainder == thread_num {
513
+ let row = normalized_matrix_temp.row(i);
514
+ //println!("row:{:?}", row);
515
+ let mut treated = Vec::<f64>::new();
516
+ let mut control = Vec::<f64>::new();
517
+ //println!("conditions:{:?}", conditions);
518
+ for j in 0..(case_indexes_temp.len()
519
+ + control_indexes_temp.len())
520
+ {
521
+ //println!("row[(0, j)]:{}", row[(0, j)]);
522
+ if case_indexes_temp.contains(&j) {
523
+ treated.push(row[(0, j)]);
524
+ //println!("{},{}", input_data_vec.0[i][j], "Diseased");
525
+ } else if control_indexes_temp.contains(&j) {
526
+ // + 1 was added because in the input file the first column of the first row is blank as the first column consists of gene names
527
+ control.push(row[(0, j)]);
528
+ //println!("{},{}", input_data_vec.0[i][j], "Control");
529
+ } else {
530
+ panic!(
531
+ "Column {} could not be classified into case/control",
532
+ j
533
+ );
534
+ }
535
+ }
536
+ //println!("treated{:?}", treated);
537
+ //println!("control{:?}", control);
538
+ let p_value = wilcoxon_rank_sum_test(
539
+ treated.clone(),
540
+ control.clone(),
541
+ THRESHOLD,
542
+ 't',
543
+ true,
544
+ ); // Setting continuity correction to true in case of normal approximation
545
+ let treated_mean = Data::new(treated).mean();
546
+ let control_mean = Data::new(control).mean();
547
+ if (treated_mean.unwrap() / control_mean.unwrap())
548
+ .log2()
549
+ .is_nan()
550
+ == false
551
+ && (treated_mean.unwrap() / control_mean.unwrap())
552
+ .log2()
553
+ .is_infinite()
554
+ == false
555
+ {
556
+ p_values_thread.push(PValueIndexes {
557
+ index: i,
558
+ gene_name: filtered_genes_temp[i].to_owned(),
559
+ gene_symbol: filtered_gene_symbols_temp[i]
560
+ .to_owned(),
561
+ fold_change: (treated_mean.unwrap()
562
+ / control_mean.unwrap())
563
+ .log2(),
564
+ p_value: p_value,
565
+ });
566
+ }
567
+ }
568
+ }
569
+ p_values_temp.lock().unwrap().append(&mut p_values_thread);
280
570
  });
571
+ handles.push(handle);
281
572
  }
573
+ for handle in handles {
574
+ // Wait for all threads to finish before proceeding further
575
+ handle.join().unwrap();
576
+ }
577
+ p_values.append(&mut *p_values_temp.lock().unwrap());
282
578
  }
283
579
  //println!("p_values:{:?}", p_values);
284
580
  println!(
285
581
  "Time for running {} wilcoxon tests:{:?}",
286
- normalized_matrix.nrows(),
582
+ num_normalized_rows,
287
583
  now2.elapsed()
288
584
  );
289
585
  let adjusted_p_values = adjust_p_values(p_values);
@@ -408,18 +704,62 @@ fn tmm_normalization(
408
704
  }
409
705
  }
410
706
  //println!("ref_column:{}", ref_column);
411
- let ref_data = input_matrix.column(ref_column);
412
- let ref_lib_size = lib_sizes[ref_column];
707
+ let num_cols = input_matrix.ncols();
413
708
  let mut f: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
414
- for col in 0..input_matrix.ncols() {
415
- let obs_data = input_matrix.column(col);
416
- let obs_lib_size = lib_sizes[col];
417
- f.push(calc_factor_tmm(
418
- obs_data,
419
- &ref_data,
420
- ref_lib_size,
421
- obs_lib_size,
422
- ));
709
+ if input_matrix.nrows() * input_matrix.ncols() < PAR_CUTOFF {
710
+ let ref_data = input_matrix.column(ref_column);
711
+ let ref_lib_size = lib_sizes[ref_column];
712
+ for col in 0..input_matrix.ncols() {
713
+ let obs_data = input_matrix.column(col);
714
+ let obs_lib_size = lib_sizes[col];
715
+ f.push(calc_factor_tmm(
716
+ obs_data,
717
+ &ref_data,
718
+ ref_lib_size,
719
+ obs_lib_size,
720
+ ));
721
+ }
722
+ } else {
723
+ // Multithreaded implementation of TMM normalization
724
+ let f_temp = Arc::new(Mutex::new(Vec::<f_index>::new()));
725
+ let lib_sizes_temp = Arc::new(lib_sizes.to_owned());
726
+ let input_matrix_temp = Arc::new(input_matrix);
727
+ let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
728
+ for thread_num in 0..max_threads {
729
+ let f_temp = Arc::clone(&f_temp);
730
+ let lib_sizes_temp = Arc::clone(&lib_sizes_temp);
731
+ let input_matrix_temp = Arc::clone(&input_matrix_temp);
732
+ let handle = thread::spawn(move || {
733
+ let mut f_thread: Vec<f_index> =
734
+ Vec::with_capacity(input_matrix_temp.ncols() / max_threads);
735
+ let ref_data = input_matrix_temp.column(ref_column);
736
+ let ref_lib_size = lib_sizes_temp[ref_column];
737
+ for col in 0..input_matrix_temp.ncols() {
738
+ let remainder: usize = col % max_threads; // Calculate remainder of column number divided by max_threads to decide which thread parses this column
739
+ if remainder == thread_num {
740
+ let obs_data = input_matrix_temp.column(col);
741
+ let obs_lib_size = lib_sizes_temp[col];
742
+ f_thread.push(f_index {
743
+ f: calc_factor_tmm(obs_data, &ref_data, ref_lib_size, obs_lib_size),
744
+ ind: col,
745
+ })
746
+ }
747
+ }
748
+ f_temp.lock().unwrap().append(&mut f_thread);
749
+ });
750
+ handles.push(handle);
751
+ }
752
+ for handle in handles {
753
+ // Wait for all threads to finish before proceeding further
754
+ handle.join().unwrap();
755
+ }
756
+ let mut f_orig: Vec<f_index> = Vec::with_capacity(num_cols);
757
+ f_orig.append(&mut *f_temp.lock().unwrap());
758
+ // Need to sort the vector because, with multithreading, its elements will not be ordered according to `ind`
759
+ f_orig
760
+ .as_mut_slice()
761
+ .sort_by(|a, b| (a.ind).partial_cmp(&b.ind).unwrap_or(Ordering::Equal));
762
+ f = f_orig.into_iter().map(|x| x.f).collect::<Vec<f64>>();
423
763
  }
424
764
  const NATURAL_E: f64 = 2.718281828459;
425
765
  let log_f: Vec<f64> = f.clone().into_iter().map(|x| x.log(NATURAL_E)).collect();
@@ -427,6 +767,11 @@ fn tmm_normalization(
427
767
  let final_f: Vec<f64> = f.into_iter().map(|x| x / exp_mean_log_f).collect();
428
768
  final_f
429
769
  }
770
/// A TMM factor tagged with the matrix column it was computed for, so that
/// results gathered from worker threads can be put back into column order.
// The lower-case type name is kept because other code in this file refers to
// `f_index`; the lint is silenced rather than renaming the type.
#[allow(non_camel_case_types)]
#[derive(Debug, Clone, Copy, PartialEq)]
struct f_index {
    /// Factor produced by `calc_factor_tmm` for this column.
    f: f64,
    /// Index of the column the factor belongs to; used as the sort key.
    ind: usize,
}
430
775
 
431
776
  fn calc_factor_tmm(
432
777
  obs_data: Matrix<f64, Dyn, Const<1>, ViewStorage<'_, f64, Dyn, Const<1>, Const<1>, Dyn>>,
package/src/indel.rs CHANGED
@@ -848,7 +848,7 @@ fn main() {
848
848
  let remainder: usize = iter % max_threads; // Calculate remainder of read number divided by max_threads to decide which thread parses this read
849
849
  //println!("iter:{}", iter);
850
850
  if remainder == thread_num {
851
- // Thread analyzing a particular read must have the same remainder as the thread_num, this avoids multiple reads from parsing the same read. Also checking if the read length > 0
851
+ // Thread analyzing a particular read must have the same remainder as the thread_num, this avoids multiple threads from parsing the same read. Also checking if the read length > 0
852
852
 
853
853
  //println!(
854
854
  // "start_positions_list:{}",