@sjcrh/proteinpaint-rust 2.40.6 → 2.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/DEanalysis.rs CHANGED
@@ -1,5 +1,5 @@
1
1
  // cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
2
- // cd .. && cargo build --release && cat ~/sjpp/test.txt | target/release/DEanalysis
2
+ // cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
3
3
  #![allow(non_snake_case)]
4
4
  use json;
5
5
  use nalgebra::base::dimension::Const;
@@ -8,20 +8,52 @@ use nalgebra::base::Matrix;
8
8
  use nalgebra::base::VecStorage;
9
9
  use nalgebra::DMatrix;
10
10
  use nalgebra::ViewStorage;
11
- use r_mathlib;
12
11
  use serde::{Deserialize, Serialize};
13
12
  use serde_json;
14
13
  use statrs::statistics::Data;
15
14
  use statrs::statistics::Distribution;
16
15
  use statrs::statistics::Median;
17
16
  use std::cmp::Ordering;
18
- use std::path::Path;
17
+ use std::fs::File;
18
+ use std::io::Read;
19
19
  use std::str::FromStr;
20
+ use std::sync::{Arc, Mutex}; // Multithreading library
21
+ use std::thread;
20
22
  use std::time::Instant;
21
23
  //use std::cmp::Ordering;
22
24
  //use std::env;
23
25
  use std::io;
24
- //mod stats_functions; // Importing Wilcoxon function from stats_functions.rs
26
+ mod stats_functions; // Importing Wilcoxon function from stats_functions.rs
27
+ const PAR_CUTOFF: usize = 100000; // Cutoff for triggering multithreading processing of data
28
+
29
+ //const PAR_CUTOFF: usize = 1000000000000000;
30
+ #[allow(non_upper_case_globals)]
31
+ const max_threads: usize = 6; // Max number of threads in case the parallel processing of reads is invoked
32
+
33
/// Looks up `y` in the ascending-sorted slice `input`.
///
/// Returns the position of a matching element as an `i64`, or `-1` when `y`
/// is not present. The `-1` sentinel is kept (rather than `Option`) because
/// existing callers test the result with `!= -1`.
///
/// `input` must be sorted in ascending order. If it contains duplicates of
/// `y`, the index of any one of the matches may be returned.
///
/// An empty `input` yields `-1`. The previous hand-rolled loop computed
/// `len() - 1` up front, which underflowed (and panicked) on an empty vector.
// The `&Vec<usize>` parameter type is retained so the signature seen by
// existing callers is unchanged.
#[allow(clippy::ptr_arg)]
fn binary_search(input: &Vec<usize>, y: usize) -> i64 {
    match input.as_slice().binary_search(&y) {
        // A Vec index always fits in i64 on supported targets.
        Ok(position) => position as i64,
        Err(_) => -1,
    }
}
25
57
 
26
58
  fn input_data(
27
59
  filename: &String,
@@ -34,9 +66,9 @@ fn input_data(
34
66
  Vec<String>,
35
67
  Vec<String>,
36
68
  ) {
37
- // Build the CSV reader and iterate over each record.
38
- let path = Path::new(filename);
39
- let mut rdr = csv::Reader::from_path(path).unwrap();
69
+ let input_time = Instant::now();
70
+ //let mut rdr = csv::Reader::from_path(path).unwrap();
71
+ let mut file = File::open(filename).unwrap();
40
72
  let mut num_lines: usize = 0;
41
73
  let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
42
74
  let mut gene_names: Vec<String> = Vec::with_capacity(65000);
@@ -44,11 +76,12 @@ fn input_data(
44
76
  let mut num_columns: usize = 0;
45
77
 
46
78
  // Check headers for samples
47
- let header_line = rdr.headers().unwrap();
48
- let mut headers: Vec<&str> = Vec::with_capacity(1500);
49
- for field in header_line.iter() {
50
- headers = field.split('\t').collect::<Vec<&str>>();
51
- }
79
+ let mut buffer = String::new();
80
+ file.read_to_string(&mut buffer).unwrap();
81
+ // Check headers for samples
82
+ let lines: Vec<&str> = buffer.split('\n').collect::<Vec<&str>>();
83
+ let total_lines = lines.len();
84
+ let headers: Vec<&str> = lines[0].split('\t').collect::<Vec<&str>>();
52
85
  //println!("headers:{:?}", headers);
53
86
  let mut case_indexes_original: Vec<usize> = Vec::with_capacity(case_list.len());
54
87
  let mut control_indexes_original: Vec<usize> = Vec::with_capacity(control_list.len());
@@ -68,6 +101,7 @@ fn input_data(
68
101
  }
69
102
  }
70
103
  }
104
+ let num_cases = case_list.len();
71
105
 
72
106
  for item in control_list {
73
107
  //println!("item:{}", item);
@@ -80,70 +114,223 @@ fn input_data(
80
114
  }
81
115
  }
82
116
  }
117
+ let num_controls = control_list.len();
83
118
  //println!("case_indexes_original:{:?}", case_indexes_original);
84
119
  //println!("control_indexes_original:{:?}", control_indexes_original);
85
-
120
+ case_indexes_original.sort();
121
+ case_indexes_original.dedup();
122
+ control_indexes_original.sort();
123
+ control_indexes_original.dedup();
86
124
  let mut case_indexes: Vec<usize> = Vec::with_capacity(case_list.len());
87
125
  let mut control_indexes: Vec<usize> = Vec::with_capacity(control_list.len());
88
- for result in rdr.records() {
89
- // The iterator yields Result<StringRecord, Error>, so we check the
90
- // error here.
91
- let record = result.unwrap();
92
- //println!("record:{:?}", record);
93
- let mut index = 0;
94
- for field in record[0].split('\t').collect::<Vec<&str>>() {
95
- if index == gene_name_index.unwrap() {
96
- gene_names.push(field.to_string());
97
- } else if index == gene_symbol_index.unwrap() {
98
- gene_symbols.push(field.to_string());
99
- } else if case_indexes_original.contains(&index) {
100
- let num = FromStr::from_str(field);
101
- match num {
102
- Ok(n) => {
103
- //println!("n:{}", n);
104
- input_vector.push(n);
105
- if num_lines == 0 {
106
- case_indexes.push(num_columns);
107
- num_columns += 1;
126
+ if lines.len() * (case_indexes_original.len() + control_indexes_original.len()) < PAR_CUTOFF {
127
+ // If number of lines is below this number
128
+ let lines_slice = &lines[..];
129
+ for line_iter in 1..lines_slice.len() - 1 {
130
+ // Subtracting 1 from total length of lines_slice because the last one will be empty
131
+ let line = lines_slice[line_iter];
132
+ let mut index = 0;
133
+ for field in line.split('\t').collect::<Vec<&str>>() {
134
+ if index == gene_name_index.unwrap() {
135
+ gene_names.push(field.to_string());
136
+ } else if index == gene_symbol_index.unwrap() {
137
+ gene_symbols.push(field.to_string());
138
+ } else if binary_search(&case_indexes_original, index) != -1 {
139
+ let num = FromStr::from_str(field);
140
+ match num {
141
+ Ok(n) => {
142
+ //println!("n:{}", n);
143
+ input_vector.push(n);
144
+ if num_lines == 0 {
145
+ case_indexes.push(num_columns);
146
+ num_columns += 1;
147
+ }
148
+ }
149
+ Err(_n) => {
150
+ panic!(
151
+ "Number {} in line {} and column {} is not a decimal number",
152
+ field,
153
+ num_lines + 1,
154
+ index + 1
155
+ );
108
156
  }
109
157
  }
110
- Err(_n) => {
111
- panic!(
112
- "Number {} in line {} and column {} is not a decimal number",
113
- field,
114
- num_lines + 1,
115
- index + 1
116
- );
158
+ } else if binary_search(&control_indexes_original, index) != -1 {
159
+ let num = FromStr::from_str(field);
160
+ match num {
161
+ Ok(n) => {
162
+ //println!("n:{}", n);
163
+ input_vector.push(n);
164
+ if num_lines == 0 {
165
+ control_indexes.push(num_columns);
166
+ num_columns += 1;
167
+ }
168
+ }
169
+ Err(_n) => {
170
+ panic!(
171
+ "Number {} in line {} and column {} is not a decimal number",
172
+ field,
173
+ num_lines + 1,
174
+ index + 1
175
+ );
176
+ }
117
177
  }
118
178
  }
119
- } else if control_indexes_original.contains(&index) {
120
- let num = FromStr::from_str(field);
121
- match num {
122
- Ok(n) => {
123
- //println!("n:{}", n);
124
- input_vector.push(n);
125
- if num_lines == 0 {
126
- control_indexes.push(num_columns);
127
- num_columns += 1;
179
+ index += 1;
180
+ }
181
+ num_lines += 1;
182
+ }
183
+ } else {
184
+ // Multithreaded implementation for parsing data in parallel starts from here
185
+ // In Rust a value normally has a single owner, but `Arc` (an atomically reference-counted smart pointer) lets multiple threads share ownership of the same data.
186
+ let case_indexes_original = Arc::new(case_indexes_original);
187
+ let control_indexes_original = Arc::new(control_indexes_original);
188
+ let buffer = Arc::new(buffer);
189
+ let case_indexes_temp = Arc::new(Mutex::new(Vec::<usize>::with_capacity(case_list.len())));
190
+ let control_indexes_temp =
191
+ Arc::new(Mutex::new(Vec::<usize>::with_capacity(control_list.len())));
192
+ let num_lines_temp = Arc::new(Mutex::<usize>::new(0));
193
+ let num_columns_temp = Arc::new(Mutex::<usize>::new(0));
194
+ let genes_names_temp = Arc::new(Mutex::new(Vec::<String>::new()));
195
+ let genes_symbols_temp = Arc::new(Mutex::new(Vec::<String>::new()));
196
+ let input_vector_temp = Arc::new(Mutex::new(Vec::<f64>::new()));
197
+ let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
198
+ println!("Number of threads used:{}", max_threads);
199
+ for thread_num in 0..max_threads {
200
+ let case_indexes_original = Arc::clone(&case_indexes_original);
201
+ let control_indexes_original = Arc::clone(&control_indexes_original);
202
+ let case_indexes_temp = Arc::clone(&case_indexes_temp);
203
+ let control_indexes_temp = Arc::clone(&control_indexes_temp);
204
+ let input_vector_temp = Arc::clone(&input_vector_temp);
205
+ let genes_names_temp = Arc::clone(&genes_names_temp);
206
+ let genes_symbols_temp = Arc::clone(&genes_symbols_temp);
207
+ let num_lines_temp = Arc::clone(&num_lines_temp);
208
+ let num_columns_temp = Arc::clone(&num_columns_temp);
209
+ let buffer = Arc::clone(&buffer);
210
+ let handle = thread::spawn(move || {
211
+ let mut case_indexes_thread: Vec<usize> = Vec::with_capacity(num_cases);
212
+ let mut control_indexes_thread: Vec<usize> = Vec::with_capacity(num_controls);
213
+ let mut genes_names_thread: Vec<String> = Vec::with_capacity(65000);
214
+ let mut genes_symbols_thread: Vec<String> = Vec::with_capacity(65000);
215
+ let mut input_vector_thread: Vec<f64> = Vec::with_capacity(65000);
216
+ let mut num_columns_thread: usize = 0;
217
+ let mut num_lines_thread: usize = 0;
218
+ let lines: Vec<&str> = buffer.split('\n').collect();
219
+ //println!("case_indexes_original:{:?}", case_indexes_original);
220
+ //println!("control_indexes:{:?}", control_indexes);
221
+ for line_iter in 1..total_lines - 1 {
222
+ let remainder: usize = line_iter % max_threads; // Calculate remainder of line number divided by max_threads to decide which thread parses this line
223
+ if remainder == thread_num {
224
+ //println!("buffer:{}", buffer);
225
+ // Thread analyzing a particular line must have the same remainder as the thread_num, this avoids multiple threads from parsing the same line
226
+ let line = lines[line_iter];
227
+ let mut index = 0;
228
+ for field in line.split('\t').collect::<Vec<&str>>() {
229
+ if index == gene_name_index.unwrap() {
230
+ genes_names_thread.push(field.to_string());
231
+ } else if index == gene_symbol_index.unwrap() {
232
+ genes_symbols_thread.push(field.to_string());
233
+ } else if binary_search(&case_indexes_original, index) != -1 {
234
+ let num = FromStr::from_str(field);
235
+ match num {
236
+ Ok(n) => {
237
+ //println!("n:{}", n);
238
+ input_vector_thread.push(n);
239
+ if line_iter == 1 {
240
+ case_indexes_thread.push(num_columns_thread);
241
+ num_columns_thread += 1;
242
+ }
243
+ }
244
+ Err(_n) => {
245
+ panic!(
246
+ "Number {} in line {} and column {} is not a decimal number",
247
+ field,
248
+ num_lines_thread + 1,
249
+ index + 1
250
+ );
251
+ }
252
+ }
253
+ } else if binary_search(&control_indexes_original, index) != -1 {
254
+ let num = FromStr::from_str(field);
255
+ match num {
256
+ Ok(n) => {
257
+ //println!("n:{}", n);
258
+ input_vector_thread.push(n);
259
+ if line_iter == 1 {
260
+ control_indexes_thread.push(num_columns_thread);
261
+ num_columns_thread += 1;
262
+ }
263
+ }
264
+ Err(_n) => {
265
+ panic!(
266
+ "Number {} in line {} and column {} is not a decimal number",
267
+ field,
268
+ num_lines_thread + 1,
269
+ index + 1
270
+ );
271
+ }
272
+ }
273
+ }
274
+ index += 1;
128
275
  }
129
- }
130
- Err(_n) => {
131
- panic!(
132
- "Number {} in line {} and column {} is not a decimal number",
133
- field,
134
- num_lines + 1,
135
- index + 1
136
- );
276
+ num_lines_thread += 1;
137
277
  }
138
278
  }
139
- }
140
- index += 1;
279
+ input_vector_temp
280
+ .lock()
281
+ .unwrap()
282
+ .append(&mut input_vector_thread);
283
+ case_indexes_temp
284
+ .lock()
285
+ .unwrap()
286
+ .append(&mut case_indexes_thread);
287
+ control_indexes_temp
288
+ .lock()
289
+ .unwrap()
290
+ .append(&mut control_indexes_thread);
291
+ genes_names_temp
292
+ .lock()
293
+ .unwrap()
294
+ .append(&mut genes_names_thread);
295
+ genes_symbols_temp
296
+ .lock()
297
+ .unwrap()
298
+ .append(&mut genes_symbols_thread);
299
+ *num_lines_temp.lock().unwrap() += num_lines_thread;
300
+ if num_columns_thread > 0 {
301
+ *num_columns_temp.lock().unwrap() += num_columns_thread;
302
+ }
303
+ drop(input_vector_temp);
304
+ drop(case_indexes_temp);
305
+ drop(control_indexes_temp);
306
+ drop(genes_names_temp);
307
+ drop(genes_symbols_temp);
308
+ drop(num_lines_temp);
309
+ drop(num_columns_temp);
310
+ });
311
+ handles.push(handle); // The handle (which contains the thread) is stored in the handles vector
312
+ }
313
+ for handle in handles {
314
+ // Wait for all threads to finish before proceeding further
315
+ handle.join().unwrap();
141
316
  }
142
- num_lines += 1;
317
+ // Combining data from all different threads
318
+ input_vector.append(&mut *input_vector_temp.lock().unwrap());
319
+ case_indexes.append(&mut *case_indexes_temp.lock().unwrap());
320
+ control_indexes.append(&mut *control_indexes_temp.lock().unwrap());
321
+ gene_names.append(&mut *genes_names_temp.lock().unwrap());
322
+ gene_symbols.append(&mut *genes_symbols_temp.lock().unwrap());
323
+
324
+ num_lines += *num_lines_temp.lock().unwrap();
325
+ num_columns += *num_columns_temp.lock().unwrap();
143
326
  }
144
327
  //println!("case_indexes:{:?}", case_indexes);
145
328
  //println!("control_indexes:{:?}", control_indexes);
146
-
329
+ //println!("num_lines:{}", num_lines);
330
+ //println!("num_columns:{}", num_columns);
331
+ //println!("num_lines * num_columns:{}", num_lines * num_columns);
332
+ //println!("input_vector:{:?}", input_vector.len());
333
+ println!("Time for inputting data:{:?}", input_time.elapsed());
147
334
  let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
148
335
  //println!("dm:{:?}", dm);
149
336
  (dm, case_indexes, control_indexes, gene_names, gene_symbols)
@@ -198,6 +385,7 @@ fn main() {
198
385
  let control_list: Vec<&str> = control_string.split(",").collect();
199
386
  let (input_matrix, case_indexes, control_indexes, gene_names, gene_symbols) =
200
387
  input_data(file_name, &case_list, &control_list);
388
+ let filtering_time = Instant::now();
201
389
  let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
202
390
  filter_by_expr(
203
391
  &input_matrix,
@@ -206,10 +394,21 @@ fn main() {
206
394
  gene_names,
207
395
  gene_symbols,
208
396
  );
397
+ println!("filtering time:{:?}", filtering_time.elapsed());
209
398
  //println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
210
399
  //println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
400
+ let cpm_normalization_time = Instant::now();
211
401
  let mut normalized_matrix = cpm(&filtered_matrix);
402
+ println!(
403
+ "cpm normalization time:{:?}",
404
+ cpm_normalization_time.elapsed()
405
+ );
406
+ let tmm_normalization_time = Instant::now();
212
407
  let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
408
+ println!(
409
+ "tmm normalization time:{:?}",
410
+ tmm_normalization_time.elapsed()
411
+ );
213
412
  //println!("norm_factors:{:?}", norm_factors);
214
413
 
215
414
  for col in 0..normalized_matrix.ncols() {
@@ -232,58 +431,154 @@ fn main() {
232
431
 
233
432
  //println!("case_indexes:{:?}", case_indexes);
234
433
  //println!("control_indexes:{:?}", control_indexes);
235
- for i in 0..normalized_matrix.nrows() {
236
- let row = normalized_matrix.row(i);
237
- //println!("row:{:?}", row);
238
- let mut treated = Vec::<f64>::new();
239
- let mut control = Vec::<f64>::new();
240
- //println!("conditions:{:?}", conditions);
241
- for j in 0..(case_indexes.len() + control_indexes.len()) {
242
- //println!("row[(0, j)]:{}", row[(0, j)]);
243
- if case_indexes.contains(&j) {
244
- treated.push(row[(0, j)]);
245
- //println!("{},{}", input_data_vec.0[i][j], "Diseased");
246
- } else if control_indexes.contains(&j) {
247
- // + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
248
- control.push(row[(0, j)]);
249
- //println!("{},{}", input_data_vec.0[i][j], "Control");
250
- } else {
251
- panic!("Column {} could not be classified into case/control", j);
434
+ let num_normalized_rows = normalized_matrix.nrows();
435
+ if normalized_matrix.nrows() * normalized_matrix.ncols() < PAR_CUTOFF {
436
+ for i in 0..normalized_matrix.nrows() {
437
+ let row = normalized_matrix.row(i);
438
+ //println!("row:{:?}", row);
439
+ let mut treated = Vec::<f64>::new();
440
+ let mut control = Vec::<f64>::new();
441
+ //println!("conditions:{:?}", conditions);
442
+ for j in 0..(case_indexes.len() + control_indexes.len()) {
443
+ //println!("row[(0, j)]:{}", row[(0, j)]);
444
+ if case_indexes.contains(&j) {
445
+ treated.push(row[(0, j)]);
446
+ //println!("{},{}", input_data_vec.0[i][j], "Diseased");
447
+ } else if control_indexes.contains(&j) {
448
+ // + 1 was added because in the input file the first column of the first row is blank as the first column consists of gene names
449
+ control.push(row[(0, j)]);
450
+ //println!("{},{}", input_data_vec.0[i][j], "Control");
451
+ } else {
452
+ panic!(
453
+ "Column {} could not be classified into case/control",
454
+ j
455
+ );
456
+ }
252
457
  }
253
- }
254
- //println!("treated{:?}", treated);
255
- //println!("control{:?}", control);
256
- let p_value = wilcoxon_rank_sum_test(
257
- treated.clone(),
258
- control.clone(),
259
- THRESHOLD,
260
- 't',
261
- true,
262
- ); // Setting continuity correction to true in case of normal approximation
263
- let treated_mean = Data::new(treated).mean();
264
- let control_mean = Data::new(control).mean();
265
- if (treated_mean.unwrap() / control_mean.unwrap())
266
- .log2()
267
- .is_nan()
268
- == false
269
- && (treated_mean.unwrap() / control_mean.unwrap())
458
+ //println!("treated{:?}", treated);
459
+ //println!("control{:?}", control);
460
+ let p_value = stats_functions::wilcoxon_rank_sum_test(
461
+ treated.clone(),
462
+ control.clone(),
463
+ THRESHOLD,
464
+ 't',
465
+ true,
466
+ ); // Setting continuity correction to true in case of normal approximation
467
+ let treated_mean = Data::new(treated).mean();
468
+ let control_mean = Data::new(control).mean();
469
+ if (treated_mean.unwrap() / control_mean.unwrap())
270
470
  .log2()
271
- .is_infinite()
471
+ .is_nan()
272
472
  == false
273
- {
274
- p_values.push(PValueIndexes {
275
- index: i,
276
- gene_name: filtered_genes[i].to_owned(),
277
- gene_symbol: filtered_gene_symbols[i].to_owned(),
278
- fold_change: (treated_mean.unwrap() / control_mean.unwrap()).log2(),
279
- p_value: p_value,
473
+ && (treated_mean.unwrap() / control_mean.unwrap())
474
+ .log2()
475
+ .is_infinite()
476
+ == false
477
+ {
478
+ p_values.push(PValueIndexes {
479
+ index: i,
480
+ gene_name: filtered_genes[i].to_owned(),
481
+ gene_symbol: filtered_gene_symbols[i].to_owned(),
482
+ fold_change: (treated_mean.unwrap() / control_mean.unwrap())
483
+ .log2(),
484
+ p_value: p_value,
485
+ });
486
+ }
487
+ }
488
+ } else {
489
+ // Multithreaded implementation of calculating wilcoxon p-values
490
+ let normalized_matrix_temp = Arc::new(normalized_matrix);
491
+ let filtered_genes_temp = Arc::new(filtered_genes);
492
+ let filtered_gene_symbols_temp = Arc::new(filtered_gene_symbols);
493
+ let case_indexes_temp = Arc::new(case_indexes);
494
+ let control_indexes_temp = Arc::new(control_indexes);
495
+ let p_values_temp = Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
496
+ let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
497
+ for thread_num in 0..max_threads {
498
+ let normalized_matrix_temp = Arc::clone(&normalized_matrix_temp);
499
+ let case_indexes_temp = Arc::clone(&case_indexes_temp);
500
+ let control_indexes_temp = Arc::clone(&control_indexes_temp);
501
+ let p_values_temp = Arc::clone(&p_values_temp);
502
+ let filtered_genes_temp = Arc::clone(&filtered_genes_temp);
503
+ let filtered_gene_symbols_temp =
504
+ Arc::clone(&filtered_gene_symbols_temp);
505
+ let handle = thread::spawn(move || {
506
+ let mut p_values_thread: Vec<PValueIndexes> = Vec::with_capacity(
507
+ normalized_matrix_temp.nrows() / max_threads,
508
+ );
509
+ for i in 0..normalized_matrix_temp.nrows() {
510
+ let remainder: usize = i % max_threads; // Calculate remainder of iteration number divided by max_threads to decide which thread parses the row
511
+ if remainder == thread_num {
512
+ let row = normalized_matrix_temp.row(i);
513
+ //println!("row:{:?}", row);
514
+ let mut treated = Vec::<f64>::new();
515
+ let mut control = Vec::<f64>::new();
516
+ //println!("conditions:{:?}", conditions);
517
+ for j in 0..(case_indexes_temp.len()
518
+ + control_indexes_temp.len())
519
+ {
520
+ //println!("row[(0, j)]:{}", row[(0, j)]);
521
+ if case_indexes_temp.contains(&j) {
522
+ treated.push(row[(0, j)]);
523
+ //println!("{},{}", input_data_vec.0[i][j], "Diseased");
524
+ } else if control_indexes_temp.contains(&j) {
525
+ // + 1 was added because in the input file the first column of the first row is blank as the first column consists of gene names
526
+ control.push(row[(0, j)]);
527
+ //println!("{},{}", input_data_vec.0[i][j], "Control");
528
+ } else {
529
+ panic!(
530
+ "Column {} could not be classified into case/control",
531
+ j
532
+ );
533
+ }
534
+ }
535
+ //println!("treated{:?}", treated);
536
+ //println!("control{:?}", control);
537
+ let p_value = stats_functions::wilcoxon_rank_sum_test(
538
+ treated.clone(),
539
+ control.clone(),
540
+ THRESHOLD,
541
+ 't',
542
+ true,
543
+ ); // Setting continuity correction to true in case of normal approximation
544
+ let treated_mean = Data::new(treated).mean();
545
+ let control_mean = Data::new(control).mean();
546
+ if (treated_mean.unwrap() / control_mean.unwrap())
547
+ .log2()
548
+ .is_nan()
549
+ == false
550
+ && (treated_mean.unwrap() / control_mean.unwrap())
551
+ .log2()
552
+ .is_infinite()
553
+ == false
554
+ {
555
+ p_values_thread.push(PValueIndexes {
556
+ index: i,
557
+ gene_name: filtered_genes_temp[i].to_owned(),
558
+ gene_symbol: filtered_gene_symbols_temp[i]
559
+ .to_owned(),
560
+ fold_change: (treated_mean.unwrap()
561
+ / control_mean.unwrap())
562
+ .log2(),
563
+ p_value: p_value,
564
+ });
565
+ }
566
+ }
567
+ }
568
+ p_values_temp.lock().unwrap().append(&mut p_values_thread);
280
569
  });
570
+ handles.push(handle);
571
+ }
572
+ for handle in handles {
573
+ // Wait for all threads to finish before proceeding further
574
+ handle.join().unwrap();
281
575
  }
576
+ p_values.append(&mut *p_values_temp.lock().unwrap());
282
577
  }
283
578
  //println!("p_values:{:?}", p_values);
284
579
  println!(
285
580
  "Time for running {} wilcoxon tests:{:?}",
286
- normalized_matrix.nrows(),
581
+ num_normalized_rows,
287
582
  now2.elapsed()
288
583
  );
289
584
  let adjusted_p_values = adjust_p_values(p_values);
@@ -408,18 +703,62 @@ fn tmm_normalization(
408
703
  }
409
704
  }
410
705
  //println!("ref_column:{}", ref_column);
411
- let ref_data = input_matrix.column(ref_column);
412
- let ref_lib_size = lib_sizes[ref_column];
706
+ let num_cols = input_matrix.ncols();
413
707
  let mut f: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
414
- for col in 0..input_matrix.ncols() {
415
- let obs_data = input_matrix.column(col);
416
- let obs_lib_size = lib_sizes[col];
417
- f.push(calc_factor_tmm(
418
- obs_data,
419
- &ref_data,
420
- ref_lib_size,
421
- obs_lib_size,
422
- ));
708
+ if input_matrix.nrows() * input_matrix.ncols() < PAR_CUTOFF {
709
+ let ref_data = input_matrix.column(ref_column);
710
+ let ref_lib_size = lib_sizes[ref_column];
711
+ for col in 0..input_matrix.ncols() {
712
+ let obs_data = input_matrix.column(col);
713
+ let obs_lib_size = lib_sizes[col];
714
+ f.push(calc_factor_tmm(
715
+ obs_data,
716
+ &ref_data,
717
+ ref_lib_size,
718
+ obs_lib_size,
719
+ ));
720
+ }
721
+ } else {
722
+ // Multithreaded implementation of TMM normalization
723
+ let f_temp = Arc::new(Mutex::new(Vec::<f_index>::new()));
724
+ let lib_sizes_temp = Arc::new(lib_sizes.to_owned());
725
+ let input_matrix_temp = Arc::new(input_matrix);
726
+ let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
727
+ for thread_num in 0..max_threads {
728
+ let f_temp = Arc::clone(&f_temp);
729
+ let lib_sizes_temp = Arc::clone(&lib_sizes_temp);
730
+ let input_matrix_temp = Arc::clone(&input_matrix_temp);
731
+ let handle = thread::spawn(move || {
732
+ let mut f_thread: Vec<f_index> =
733
+ Vec::with_capacity(input_matrix_temp.ncols() / max_threads);
734
+ let ref_data = input_matrix_temp.column(ref_column);
735
+ let ref_lib_size = lib_sizes_temp[ref_column];
736
+ for col in 0..input_matrix_temp.ncols() {
737
+ let remainder: usize = col % max_threads; // Calculate remainder of column number divided by max_threads to decide which thread parses this column
738
+ if remainder == thread_num {
739
+ let obs_data = input_matrix_temp.column(col);
740
+ let obs_lib_size = lib_sizes_temp[col];
741
+ f_thread.push(f_index {
742
+ f: calc_factor_tmm(obs_data, &ref_data, ref_lib_size, obs_lib_size),
743
+ ind: col,
744
+ })
745
+ }
746
+ }
747
+ f_temp.lock().unwrap().append(&mut f_thread);
748
+ });
749
+ handles.push(handle);
750
+ }
751
+ for handle in handles {
752
+ // Wait for all threads to finish before proceeding further
753
+ handle.join().unwrap();
754
+ }
755
+ let mut f_orig: Vec<f_index> = Vec::with_capacity(num_cols);
756
+ f_orig.append(&mut *f_temp.lock().unwrap());
757
+ // The vector must be sorted by `ind` because the threads append their results in a nondeterministic order
758
+ f_orig
759
+ .as_mut_slice()
760
+ .sort_by(|a, b| (a.ind).partial_cmp(&b.ind).unwrap_or(Ordering::Equal));
761
+ f = f_orig.into_iter().map(|x| x.f).collect::<Vec<f64>>();
423
762
  }
424
763
  const NATURAL_E: f64 = 2.718281828459;
425
764
  let log_f: Vec<f64> = f.clone().into_iter().map(|x| x.log(NATURAL_E)).collect();
@@ -427,6 +766,11 @@ fn tmm_normalization(
427
766
  let final_f: Vec<f64> = f.into_iter().map(|x| x / exp_mean_log_f).collect();
428
767
  final_f
429
768
  }
769
/// A TMM scaling factor tagged with the matrix column it was computed for.
///
/// The multithreaded branch of `tmm_normalization` collects these from its
/// worker threads, sorts them by `ind`, and then keeps only the `f` values,
/// so the factors end up in column order.
#[allow(non_camel_case_types)] // lower-case name kept so existing uses still resolve
struct f_index {
    /// Factor returned by `calc_factor_tmm` for this column.
    f: f64,
    /// Index of the column in the input matrix.
    ind: usize,
}
430
774
 
431
775
  fn calc_factor_tmm(
432
776
  obs_data: Matrix<f64, Dyn, Const<1>, ViewStorage<'_, f64, Dyn, Const<1>, Const<1>, Dyn>>,
@@ -537,7 +881,7 @@ fn rank_vector(input_vector: &Vec<f64>) -> Vec<f64> {
537
881
  rank: i as f64 + 1.0,
538
882
  });
539
883
  } else {
540
- frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
884
+ frac_rank = stats_functions::calculate_frac_rank(i as f64 + 1.0, num_repeats);
541
885
  ranks.push(RankOutput {
542
886
  orig_index: input_vector_sorted[i].orig_index,
543
887
  rank: frac_rank,
@@ -750,274 +1094,3 @@ fn cpm(
750
1094
  //println!("output_matrix:{:?}", output_matrix);
751
1095
  output_matrix
752
1096
  }
753
-
754
- pub fn wilcoxon_rank_sum_test(
755
- mut group1: Vec<f64>,
756
- mut group2: Vec<f64>,
757
- threshold: usize,
758
- alternative: char,
759
- correct: bool,
760
- ) -> f64 {
761
- // Check if there are any ties between the two groups
762
-
763
- let mut combined = group1.clone();
764
- combined.extend(group2.iter().cloned());
765
- combined.sort_by(|a, b| a.partial_cmp(b).unwrap());
766
- //println!("combined:{:?}", combined);
767
-
768
- group1.sort_by(|a, b| a.partial_cmp(b).unwrap());
769
- group2.sort_by(|a, b| a.partial_cmp(b).unwrap());
770
- //println!("group1:{:?}", group1);
771
- //println!("group2:{:?}", group2);
772
-
773
- let mut group1_iter = 0;
774
- let mut group2_iter = 0;
775
- let mut xy: Vec<char> = Vec::with_capacity(combined.len()); // Stores X-Y classification
776
- let mut ranks: Vec<f64> = Vec::with_capacity(combined.len()); // Stores the rank of each element
777
- let mut is_repeat = false;
778
- let mut repeat_present = false;
779
- let mut frac_rank: f64 = 0.0;
780
- let mut num_repeats: f64 = 1.0;
781
- let mut repeat_iter: f64 = 1.0;
782
- #[allow(unused_variables)]
783
- let mut weight_x: f64 = 0.0;
784
- let mut weight_y: f64 = 0.0;
785
- let mut group_char: char = 'X';
786
- let mut rank_frequencies: Vec<f64> = Vec::with_capacity(combined.len());
787
- for i in 0..combined.len() {
788
- //println!("group1_iter:{}", group1_iter);
789
- //println!("group2_iter:{}", group2_iter);
790
- //println!("item1:{}", combined[i]);
791
- //println!("is_repeat:{}", is_repeat);
792
- if group1_iter < group1.len() && combined[i] == group1[group1_iter] {
793
- xy.push('X');
794
- group1_iter += 1;
795
- group_char = 'X';
796
- } else if group2_iter < group2.len() && combined[i] == group2[group2_iter] {
797
- xy.push('Y');
798
- group2_iter += 1;
799
- group_char = 'Y';
800
- }
801
-
802
- // Computing ranks
803
- if is_repeat == false {
804
- // Check if current element has other occurences
805
- num_repeats = 1.0;
806
- for j in i + 1..combined.len() {
807
- if combined[i] == combined[j] {
808
- is_repeat = true;
809
- repeat_present = true;
810
- repeat_iter = 1.0;
811
- num_repeats += 1.0;
812
- } else {
813
- break;
814
- }
815
- }
816
- //println!("num_repeats:{}", num_repeats);
817
- if is_repeat == false {
818
- ranks.push(i as f64 + 1.0);
819
- if group_char == 'X' {
820
- weight_x += i as f64 + 1.0;
821
- } else if group_char == 'Y' {
822
- weight_y += i as f64 + 1.0;
823
- }
824
- //rank_frequencies.push(RankFreq {
825
- // rank: i as f64 + 1.0,
826
- // freq: 1,
827
- //});
828
- rank_frequencies.push(1.0);
829
- } else {
830
- frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
831
- ranks.push(frac_rank);
832
- if group_char == 'X' {
833
- weight_x += frac_rank;
834
- } else if group_char == 'Y' {
835
- weight_y += frac_rank
836
- }
837
- //rank_frequencies.push(RankFreq {
838
- // rank: frac_rank,
839
- // freq: num_repeats as usize,
840
- //});
841
- rank_frequencies.push(num_repeats);
842
- }
843
- } else if repeat_iter < num_repeats {
844
- // Repeat case
845
- ranks.push(frac_rank);
846
- repeat_iter += 1.0;
847
- if group_char == 'X' {
848
- weight_x += frac_rank;
849
- } else if group_char == 'Y' {
850
- weight_y += frac_rank
851
- }
852
- if repeat_iter == num_repeats {
853
- is_repeat = false;
854
- }
855
- } else {
856
- //println!("i:{}", i);
857
- ranks.push(i as f64 + 1.0);
858
- repeat_iter = 1.0;
859
- num_repeats = 1.0;
860
- if group_char == 'X' {
861
- weight_x += i as f64 + 1.0;
862
- } else if group_char == 'Y' {
863
- weight_y += i as f64 + 1.0;
864
- }
865
- }
866
- }
867
- //println!("rank_frequencies:{:?}", rank_frequencies);
868
- //println!("xy:{:?}", xy);
869
- //println!("ranks:{:?}", ranks);
870
- //println!("weight_x:{}", weight_x);
871
- //println!("weight_y:{}", weight_y);
872
-
873
- //u_dash (calculated below) calculates the "W Statistic" in wilcox.test function in R
874
-
875
- let u_y = weight_y - (group2.len() as f64 * (group2.len() as f64 + 1.0) / 2.0) as f64;
876
- let u_dash_y = (u_y - (group1.len() * group2.len()) as f64).abs();
877
- //println!("u_dash_y:{}", u_dash_y);
878
-
879
- let u_x = weight_x - (group1.len() as f64 * (group1.len() as f64 + 1.0) / 2.0) as f64;
880
- let _u_dash_x = (u_x - (group1.len() * group2.len()) as f64).abs();
881
- //println!("u_dash_x:{}", u_dash_x);
882
-
883
- // Calculate test_statistic
884
-
885
- //let t1 = weight_x - ((group1.len() as f64) * (group1.len() as f64 + 1.0)) / 2.0;
886
- //let t2 = weight_y - ((group2.len() as f64) * (group2.len() as f64 + 1.0)) / 2.0;
887
- //
888
- //let mut test_statistic = t1;
889
- //if t2 < t1 {
890
- // test_statistic = t2;
891
- //}
892
-
893
- //println!("test_statistic:{}", test_statistic);
894
-
895
- if group1.len() < threshold && group2.len() < threshold && repeat_present == false {
896
- // Compute exact p-values
897
-
898
- // Calculate conditional probability for weight_y
899
-
900
- if alternative == 'g' {
901
- // Alternative "greater"
902
- //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
903
- // iterate_exact_p_values(ranks, weight_y, group2.len())
904
- //} else {
905
- calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
906
- //}
907
- } else if alternative == 'l' {
908
- // Alternative "lesser"
909
- //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
910
- // iterate_exact_p_values(ranks, weight_x, group1.len())
911
- //} else {
912
- calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
913
- //}
914
- } else {
915
- // Two-sided distribution
916
- calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
917
- }
918
- } else {
919
- // Compute p-values from a normal distribution
920
- //println!("group1 length:{}", group1.len());
921
- //println!("group2 length:{}", group2.len());
922
-
923
- let mut z = u_dash_y - ((group1.len() * group2.len()) as f64) / 2.0;
924
- //println!("z_original:{}", z);
925
- let mut nties_sum: f64 = 0.0;
926
- for i in 0..rank_frequencies.len() {
927
- nties_sum += rank_frequencies[i] * rank_frequencies[i] * rank_frequencies[i]
928
- - rank_frequencies[i];
929
- }
930
-
931
- let sigma = (((group1.len() * group2.len()) as f64) / 12.0
932
- * ((group1.len() + group2.len() + 1) as f64
933
- - nties_sum
934
- / (((group1.len() + group2.len()) as f64)
935
- * ((group1.len() + group2.len() - 1) as f64))))
936
- .sqrt();
937
- //println!("sigma:{}", sigma);
938
- let mut correction: f64 = 0.0;
939
- if correct == true {
940
- if alternative == 'g' {
941
- // Alternative "greater"
942
- correction = 0.5;
943
- } else if alternative == 'l' {
944
- // Alternative "lesser"
945
- correction = -0.5;
946
- } else {
947
- // Alternative "two-sided"
948
- if z > 0.0 {
949
- correction = 0.5;
950
- } else if z < 0.0 {
951
- correction = -0.5;
952
- } else {
953
- // z=0
954
- correction = 0.0;
955
- }
956
- }
957
- }
958
- z = (z - correction) / sigma;
959
- //println!("z:{}", z);
960
- if alternative == 'g' {
961
- // Alternative "greater"
962
- //println!("greater:{}", n.cdf(weight_y));
963
- //1.0 - n.cdf(z) // Applying continuity correction
964
- r_mathlib::normal_cdf(z, 0.0, 1.0, false, false)
965
- } else if alternative == 'l' {
966
- // Alternative "lesser"
967
- //println!("lesser:{}", n.cdf(weight_x));
968
- //n.cdf(z) // Applying continuity coorection
969
- r_mathlib::normal_cdf(z, 0.0, 1.0, true, false)
970
- } else {
971
- // Alternative "two-sided"
972
- let p_g = r_mathlib::normal_cdf(z, 0.0, 1.0, false, false); // Applying continuity correction
973
- let p_l = r_mathlib::normal_cdf(z, 0.0, 1.0, true, false); // Applying continuity correction
974
- let mut p_value;
975
- if p_g < p_l {
976
- p_value = 2.0 * p_g;
977
- } else {
978
- p_value = 2.0 * p_l;
979
- }
980
- //println!("p_value:{}", p_value);
981
- if p_value > 1.0 {
982
- p_value = 1.0;
983
- }
984
- p_value
985
- }
986
- }
987
- }
988
-
989
- // To be used only when there are no ties in the input data
990
- #[allow(dead_code)]
991
- fn calculate_exact_probability(weight: f64, x: usize, y: usize, alternative: char) -> f64 {
992
- //println!("Using Wilcoxon CDF");
993
- let mut p_value;
994
- if alternative == 't' {
995
- if weight > ((x * y) as f64) / 2.0 {
996
- p_value = 2.0 * r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
997
- } else {
998
- p_value = 2.0 * r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
999
- }
1000
- if p_value > 1.0 {
1001
- p_value = 1.0;
1002
- }
1003
- } else if alternative == 'g' {
1004
- p_value = r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
1005
- } else if alternative == 'l' {
1006
- p_value = r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
1007
- } else {
1008
- // Should not happen
1009
- panic!("Unknown alternative option given, please check!");
1010
- }
1011
- //println!("p_value:{}", p_value);
1012
- p_value
1013
- }
1014
-
1015
/// Average ("fractional") rank shared by a run of tied values.
///
/// `current_rank` is the rank of the first element of the run and
/// `num_repeats` is the number of tied elements (expected to be a whole
/// number). The tied elements occupy ranks `current_rank`,
/// `current_rank + 1`, ..., `current_rank + num_repeats - 1`, whose mean is
/// `current_rank + (num_repeats - 1) / 2`; this is computed directly instead
/// of summing every rank in a loop.
///
/// A `num_repeats` below 1 describes no run at all; `current_rank` is
/// returned unchanged in that case rather than dividing by zero.
#[allow(dead_code)] // not every binary that includes this code calls it
pub fn calculate_frac_rank(current_rank: f64, num_repeats: f64) -> f64 {
    if num_repeats < 1.0 {
        return current_rank;
    }
    current_rank + (num_repeats - 1.0) / 2.0
}