@sjcrh/proteinpaint-rust 2.40.6 → 2.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +5 -6
- package/package.json +35 -35
- package/src/DEanalysis.rs +460 -387
- package/src/indel.rs +1 -1
- package/src/stats_functions.rs +270 -270
package/src/DEanalysis.rs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
// cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
|
|
2
|
-
// cd .. && cargo build --release && cat ~/sjpp/test.txt | target/release/DEanalysis
|
|
2
|
+
// cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
|
|
3
3
|
#![allow(non_snake_case)]
|
|
4
4
|
use json;
|
|
5
5
|
use nalgebra::base::dimension::Const;
|
|
@@ -8,20 +8,52 @@ use nalgebra::base::Matrix;
|
|
|
8
8
|
use nalgebra::base::VecStorage;
|
|
9
9
|
use nalgebra::DMatrix;
|
|
10
10
|
use nalgebra::ViewStorage;
|
|
11
|
-
use r_mathlib;
|
|
12
11
|
use serde::{Deserialize, Serialize};
|
|
13
12
|
use serde_json;
|
|
14
13
|
use statrs::statistics::Data;
|
|
15
14
|
use statrs::statistics::Distribution;
|
|
16
15
|
use statrs::statistics::Median;
|
|
17
16
|
use std::cmp::Ordering;
|
|
18
|
-
use std::
|
|
17
|
+
use std::fs::File;
|
|
18
|
+
use std::io::Read;
|
|
19
19
|
use std::str::FromStr;
|
|
20
|
+
use std::sync::{Arc, Mutex}; // Multithreading library
|
|
21
|
+
use std::thread;
|
|
20
22
|
use std::time::Instant;
|
|
21
23
|
//use std::cmp::Ordering;
|
|
22
24
|
//use std::env;
|
|
23
25
|
use std::io;
|
|
24
|
-
|
|
26
|
+
mod stats_functions; // Importing Wilcoxon function from stats_functions.rs
|
|
27
|
+
const PAR_CUTOFF: usize = 100000; // Cutoff for triggering multithreading processing of data
|
|
28
|
+
|
|
29
|
+
//const PAR_CUTOFF: usize = 1000000000000000;
|
|
30
|
+
#[allow(non_upper_case_globals)]
|
|
31
|
+
const max_threads: usize = 6; // Max number of threads in case the parallel processing of reads is invoked
|
|
32
|
+
|
|
33
|
+
fn binary_search(input: &Vec<usize>, y: usize) -> i64 {
|
|
34
|
+
let input_dup = &input[..];
|
|
35
|
+
let mut index: i64 = -1;
|
|
36
|
+
let mut l: usize = 0;
|
|
37
|
+
let mut r: usize = input_dup.len() - 1;
|
|
38
|
+
let mut m: usize;
|
|
39
|
+
while l <= r {
|
|
40
|
+
m = l + ((r - l) / 2);
|
|
41
|
+
if y == input_dup[m] {
|
|
42
|
+
index = m as i64;
|
|
43
|
+
break;
|
|
44
|
+
} else if y > input_dup[m] {
|
|
45
|
+
l = m + 1;
|
|
46
|
+
}
|
|
47
|
+
// If x is smaller, ignore right half
|
|
48
|
+
else {
|
|
49
|
+
if m == 0 as usize {
|
|
50
|
+
break;
|
|
51
|
+
}
|
|
52
|
+
r = m - 1;
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
index
|
|
56
|
+
}
|
|
25
57
|
|
|
26
58
|
fn input_data(
|
|
27
59
|
filename: &String,
|
|
@@ -34,9 +66,9 @@ fn input_data(
|
|
|
34
66
|
Vec<String>,
|
|
35
67
|
Vec<String>,
|
|
36
68
|
) {
|
|
37
|
-
|
|
38
|
-
let
|
|
39
|
-
let mut
|
|
69
|
+
let input_time = Instant::now();
|
|
70
|
+
//let mut rdr = csv::Reader::from_path(path).unwrap();
|
|
71
|
+
let mut file = File::open(filename).unwrap();
|
|
40
72
|
let mut num_lines: usize = 0;
|
|
41
73
|
let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
|
|
42
74
|
let mut gene_names: Vec<String> = Vec::with_capacity(65000);
|
|
@@ -44,11 +76,12 @@ fn input_data(
|
|
|
44
76
|
let mut num_columns: usize = 0;
|
|
45
77
|
|
|
46
78
|
// Check headers for samples
|
|
47
|
-
let
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
79
|
+
let mut buffer = String::new();
|
|
80
|
+
file.read_to_string(&mut buffer).unwrap();
|
|
81
|
+
// Check headers for samples
|
|
82
|
+
let lines: Vec<&str> = buffer.split('\n').collect::<Vec<&str>>();
|
|
83
|
+
let total_lines = lines.len();
|
|
84
|
+
let headers: Vec<&str> = lines[0].split('\t').collect::<Vec<&str>>();
|
|
52
85
|
//println!("headers:{:?}", headers);
|
|
53
86
|
let mut case_indexes_original: Vec<usize> = Vec::with_capacity(case_list.len());
|
|
54
87
|
let mut control_indexes_original: Vec<usize> = Vec::with_capacity(control_list.len());
|
|
@@ -68,6 +101,7 @@ fn input_data(
|
|
|
68
101
|
}
|
|
69
102
|
}
|
|
70
103
|
}
|
|
104
|
+
let num_cases = case_list.len();
|
|
71
105
|
|
|
72
106
|
for item in control_list {
|
|
73
107
|
//println!("item:{}", item);
|
|
@@ -80,70 +114,223 @@ fn input_data(
|
|
|
80
114
|
}
|
|
81
115
|
}
|
|
82
116
|
}
|
|
117
|
+
let num_controls = control_list.len();
|
|
83
118
|
//println!("case_indexes_original:{:?}", case_indexes_original);
|
|
84
119
|
//println!("control_indexes_original:{:?}", control_indexes_original);
|
|
85
|
-
|
|
120
|
+
case_indexes_original.sort();
|
|
121
|
+
case_indexes_original.dedup();
|
|
122
|
+
control_indexes_original.sort();
|
|
123
|
+
control_indexes_original.dedup();
|
|
86
124
|
let mut case_indexes: Vec<usize> = Vec::with_capacity(case_list.len());
|
|
87
125
|
let mut control_indexes: Vec<usize> = Vec::with_capacity(control_list.len());
|
|
88
|
-
|
|
89
|
-
//
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
126
|
+
if lines.len() * (case_indexes_original.len() + control_indexes_original.len()) < PAR_CUTOFF {
|
|
127
|
+
// If number of lines is below this number
|
|
128
|
+
let lines_slice = &lines[..];
|
|
129
|
+
for line_iter in 1..lines_slice.len() - 1 {
|
|
130
|
+
// Subtracting 1 from total length of lines_slice because the last one will be empty
|
|
131
|
+
let line = lines_slice[line_iter];
|
|
132
|
+
let mut index = 0;
|
|
133
|
+
for field in line.split('\t').collect::<Vec<&str>>() {
|
|
134
|
+
if index == gene_name_index.unwrap() {
|
|
135
|
+
gene_names.push(field.to_string());
|
|
136
|
+
} else if index == gene_symbol_index.unwrap() {
|
|
137
|
+
gene_symbols.push(field.to_string());
|
|
138
|
+
} else if binary_search(&case_indexes_original, index) != -1 {
|
|
139
|
+
let num = FromStr::from_str(field);
|
|
140
|
+
match num {
|
|
141
|
+
Ok(n) => {
|
|
142
|
+
//println!("n:{}", n);
|
|
143
|
+
input_vector.push(n);
|
|
144
|
+
if num_lines == 0 {
|
|
145
|
+
case_indexes.push(num_columns);
|
|
146
|
+
num_columns += 1;
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
Err(_n) => {
|
|
150
|
+
panic!(
|
|
151
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
152
|
+
field,
|
|
153
|
+
num_lines + 1,
|
|
154
|
+
index + 1
|
|
155
|
+
);
|
|
108
156
|
}
|
|
109
157
|
}
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
158
|
+
} else if binary_search(&control_indexes_original, index) != -1 {
|
|
159
|
+
let num = FromStr::from_str(field);
|
|
160
|
+
match num {
|
|
161
|
+
Ok(n) => {
|
|
162
|
+
//println!("n:{}", n);
|
|
163
|
+
input_vector.push(n);
|
|
164
|
+
if num_lines == 0 {
|
|
165
|
+
control_indexes.push(num_columns);
|
|
166
|
+
num_columns += 1;
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
Err(_n) => {
|
|
170
|
+
panic!(
|
|
171
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
172
|
+
field,
|
|
173
|
+
num_lines + 1,
|
|
174
|
+
index + 1
|
|
175
|
+
);
|
|
176
|
+
}
|
|
117
177
|
}
|
|
118
178
|
}
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
179
|
+
index += 1;
|
|
180
|
+
}
|
|
181
|
+
num_lines += 1;
|
|
182
|
+
}
|
|
183
|
+
} else {
|
|
184
|
+
// Multithreaded implementation for parsing data in parallel starts from here
|
|
185
|
+
// Generally in rust one variable only own a data at a time, but `Arc` keyword is special and allows for multiple threads to access the same data.
|
|
186
|
+
let case_indexes_original = Arc::new(case_indexes_original);
|
|
187
|
+
let control_indexes_original = Arc::new(control_indexes_original);
|
|
188
|
+
let buffer = Arc::new(buffer);
|
|
189
|
+
let case_indexes_temp = Arc::new(Mutex::new(Vec::<usize>::with_capacity(case_list.len())));
|
|
190
|
+
let control_indexes_temp =
|
|
191
|
+
Arc::new(Mutex::new(Vec::<usize>::with_capacity(control_list.len())));
|
|
192
|
+
let num_lines_temp = Arc::new(Mutex::<usize>::new(0));
|
|
193
|
+
let num_columns_temp = Arc::new(Mutex::<usize>::new(0));
|
|
194
|
+
let genes_names_temp = Arc::new(Mutex::new(Vec::<String>::new()));
|
|
195
|
+
let genes_symbols_temp = Arc::new(Mutex::new(Vec::<String>::new()));
|
|
196
|
+
let input_vector_temp = Arc::new(Mutex::new(Vec::<f64>::new()));
|
|
197
|
+
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
198
|
+
println!("Number of threads used:{}", max_threads);
|
|
199
|
+
for thread_num in 0..max_threads {
|
|
200
|
+
let case_indexes_original = Arc::clone(&case_indexes_original);
|
|
201
|
+
let control_indexes_original = Arc::clone(&control_indexes_original);
|
|
202
|
+
let case_indexes_temp = Arc::clone(&case_indexes_temp);
|
|
203
|
+
let control_indexes_temp = Arc::clone(&control_indexes_temp);
|
|
204
|
+
let input_vector_temp = Arc::clone(&input_vector_temp);
|
|
205
|
+
let genes_names_temp = Arc::clone(&genes_names_temp);
|
|
206
|
+
let genes_symbols_temp = Arc::clone(&genes_symbols_temp);
|
|
207
|
+
let num_lines_temp = Arc::clone(&num_lines_temp);
|
|
208
|
+
let num_columns_temp = Arc::clone(&num_columns_temp);
|
|
209
|
+
let buffer = Arc::clone(&buffer);
|
|
210
|
+
let handle = thread::spawn(move || {
|
|
211
|
+
let mut case_indexes_thread: Vec<usize> = Vec::with_capacity(num_cases);
|
|
212
|
+
let mut control_indexes_thread: Vec<usize> = Vec::with_capacity(num_controls);
|
|
213
|
+
let mut genes_names_thread: Vec<String> = Vec::with_capacity(65000);
|
|
214
|
+
let mut genes_symbols_thread: Vec<String> = Vec::with_capacity(65000);
|
|
215
|
+
let mut input_vector_thread: Vec<f64> = Vec::with_capacity(65000);
|
|
216
|
+
let mut num_columns_thread: usize = 0;
|
|
217
|
+
let mut num_lines_thread: usize = 0;
|
|
218
|
+
let lines: Vec<&str> = buffer.split('\n').collect();
|
|
219
|
+
//println!("case_indexes_original:{:?}", case_indexes_original);
|
|
220
|
+
//println!("control_indexes:{:?}", control_indexes);
|
|
221
|
+
for line_iter in 1..total_lines - 1 {
|
|
222
|
+
let remainder: usize = line_iter % max_threads; // Calculate remainder of line number divided by max_threads to decide which thread parses this line
|
|
223
|
+
if remainder == thread_num {
|
|
224
|
+
//println!("buffer:{}", buffer);
|
|
225
|
+
// Thread analyzing a particular line must have the same remainder as the thread_num, this avoids multiple threads from parsing the same line
|
|
226
|
+
let line = lines[line_iter];
|
|
227
|
+
let mut index = 0;
|
|
228
|
+
for field in line.split('\t').collect::<Vec<&str>>() {
|
|
229
|
+
if index == gene_name_index.unwrap() {
|
|
230
|
+
genes_names_thread.push(field.to_string());
|
|
231
|
+
} else if index == gene_symbol_index.unwrap() {
|
|
232
|
+
genes_symbols_thread.push(field.to_string());
|
|
233
|
+
} else if binary_search(&case_indexes_original, index) != -1 {
|
|
234
|
+
let num = FromStr::from_str(field);
|
|
235
|
+
match num {
|
|
236
|
+
Ok(n) => {
|
|
237
|
+
//println!("n:{}", n);
|
|
238
|
+
input_vector_thread.push(n);
|
|
239
|
+
if line_iter == 1 {
|
|
240
|
+
case_indexes_thread.push(num_columns_thread);
|
|
241
|
+
num_columns_thread += 1;
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
Err(_n) => {
|
|
245
|
+
panic!(
|
|
246
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
247
|
+
field,
|
|
248
|
+
num_lines_thread + 1,
|
|
249
|
+
index + 1
|
|
250
|
+
);
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
} else if binary_search(&control_indexes_original, index) != -1 {
|
|
254
|
+
let num = FromStr::from_str(field);
|
|
255
|
+
match num {
|
|
256
|
+
Ok(n) => {
|
|
257
|
+
//println!("n:{}", n);
|
|
258
|
+
input_vector_thread.push(n);
|
|
259
|
+
if line_iter == 1 {
|
|
260
|
+
control_indexes_thread.push(num_columns_thread);
|
|
261
|
+
num_columns_thread += 1;
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
Err(_n) => {
|
|
265
|
+
panic!(
|
|
266
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
267
|
+
field,
|
|
268
|
+
num_lines_thread + 1,
|
|
269
|
+
index + 1
|
|
270
|
+
);
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
index += 1;
|
|
128
275
|
}
|
|
129
|
-
|
|
130
|
-
Err(_n) => {
|
|
131
|
-
panic!(
|
|
132
|
-
"Number {} in line {} and column {} is not a decimal number",
|
|
133
|
-
field,
|
|
134
|
-
num_lines + 1,
|
|
135
|
-
index + 1
|
|
136
|
-
);
|
|
276
|
+
num_lines_thread += 1;
|
|
137
277
|
}
|
|
138
278
|
}
|
|
139
|
-
|
|
140
|
-
|
|
279
|
+
input_vector_temp
|
|
280
|
+
.lock()
|
|
281
|
+
.unwrap()
|
|
282
|
+
.append(&mut input_vector_thread);
|
|
283
|
+
case_indexes_temp
|
|
284
|
+
.lock()
|
|
285
|
+
.unwrap()
|
|
286
|
+
.append(&mut case_indexes_thread);
|
|
287
|
+
control_indexes_temp
|
|
288
|
+
.lock()
|
|
289
|
+
.unwrap()
|
|
290
|
+
.append(&mut control_indexes_thread);
|
|
291
|
+
genes_names_temp
|
|
292
|
+
.lock()
|
|
293
|
+
.unwrap()
|
|
294
|
+
.append(&mut genes_names_thread);
|
|
295
|
+
genes_symbols_temp
|
|
296
|
+
.lock()
|
|
297
|
+
.unwrap()
|
|
298
|
+
.append(&mut genes_symbols_thread);
|
|
299
|
+
*num_lines_temp.lock().unwrap() += num_lines_thread;
|
|
300
|
+
if num_columns_thread > 0 {
|
|
301
|
+
*num_columns_temp.lock().unwrap() += num_columns_thread;
|
|
302
|
+
}
|
|
303
|
+
drop(input_vector_temp);
|
|
304
|
+
drop(case_indexes_temp);
|
|
305
|
+
drop(control_indexes_temp);
|
|
306
|
+
drop(genes_names_temp);
|
|
307
|
+
drop(genes_symbols_temp);
|
|
308
|
+
drop(num_lines_temp);
|
|
309
|
+
drop(num_columns_temp);
|
|
310
|
+
});
|
|
311
|
+
handles.push(handle); // The handle (which contains the thread) is stored in the handles vector
|
|
312
|
+
}
|
|
313
|
+
for handle in handles {
|
|
314
|
+
// Wait for all threads to finish before proceeding further
|
|
315
|
+
handle.join().unwrap();
|
|
141
316
|
}
|
|
142
|
-
|
|
317
|
+
// Combining data from all different threads
|
|
318
|
+
input_vector.append(&mut *input_vector_temp.lock().unwrap());
|
|
319
|
+
case_indexes.append(&mut *case_indexes_temp.lock().unwrap());
|
|
320
|
+
control_indexes.append(&mut *control_indexes_temp.lock().unwrap());
|
|
321
|
+
gene_names.append(&mut *genes_names_temp.lock().unwrap());
|
|
322
|
+
gene_symbols.append(&mut *genes_symbols_temp.lock().unwrap());
|
|
323
|
+
|
|
324
|
+
num_lines += *num_lines_temp.lock().unwrap();
|
|
325
|
+
num_columns += *num_columns_temp.lock().unwrap();
|
|
143
326
|
}
|
|
144
327
|
//println!("case_indexes:{:?}", case_indexes);
|
|
145
328
|
//println!("control_indexes:{:?}", control_indexes);
|
|
146
|
-
|
|
329
|
+
//println!("num_lines:{}", num_lines);
|
|
330
|
+
//println!("num_columns:{}", num_columns);
|
|
331
|
+
//println!("num_lines * num_columns:{}", num_lines * num_columns);
|
|
332
|
+
//println!("input_vector:{:?}", input_vector.len());
|
|
333
|
+
println!("Time for inputting data:{:?}", input_time.elapsed());
|
|
147
334
|
let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
|
|
148
335
|
//println!("dm:{:?}", dm);
|
|
149
336
|
(dm, case_indexes, control_indexes, gene_names, gene_symbols)
|
|
@@ -198,6 +385,7 @@ fn main() {
|
|
|
198
385
|
let control_list: Vec<&str> = control_string.split(",").collect();
|
|
199
386
|
let (input_matrix, case_indexes, control_indexes, gene_names, gene_symbols) =
|
|
200
387
|
input_data(file_name, &case_list, &control_list);
|
|
388
|
+
let filtering_time = Instant::now();
|
|
201
389
|
let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
|
|
202
390
|
filter_by_expr(
|
|
203
391
|
&input_matrix,
|
|
@@ -206,10 +394,21 @@ fn main() {
|
|
|
206
394
|
gene_names,
|
|
207
395
|
gene_symbols,
|
|
208
396
|
);
|
|
397
|
+
println!("filtering time:{:?}", filtering_time.elapsed());
|
|
209
398
|
//println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
|
|
210
399
|
//println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
|
|
400
|
+
let cpm_normalization_time = Instant::now();
|
|
211
401
|
let mut normalized_matrix = cpm(&filtered_matrix);
|
|
402
|
+
println!(
|
|
403
|
+
"cpm normalization time:{:?}",
|
|
404
|
+
cpm_normalization_time.elapsed()
|
|
405
|
+
);
|
|
406
|
+
let tmm_normalization_time = Instant::now();
|
|
212
407
|
let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
|
|
408
|
+
println!(
|
|
409
|
+
"tmm normalization time:{:?}",
|
|
410
|
+
tmm_normalization_time.elapsed()
|
|
411
|
+
);
|
|
213
412
|
//println!("norm_factors:{:?}", norm_factors);
|
|
214
413
|
|
|
215
414
|
for col in 0..normalized_matrix.ncols() {
|
|
@@ -232,58 +431,154 @@ fn main() {
|
|
|
232
431
|
|
|
233
432
|
//println!("case_indexes:{:?}", case_indexes);
|
|
234
433
|
//println!("control_indexes:{:?}", control_indexes);
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
//println!("
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
434
|
+
let num_normalized_rows = normalized_matrix.nrows();
|
|
435
|
+
if normalized_matrix.nrows() * normalized_matrix.ncols() < PAR_CUTOFF {
|
|
436
|
+
for i in 0..normalized_matrix.nrows() {
|
|
437
|
+
let row = normalized_matrix.row(i);
|
|
438
|
+
//println!("row:{:?}", row);
|
|
439
|
+
let mut treated = Vec::<f64>::new();
|
|
440
|
+
let mut control = Vec::<f64>::new();
|
|
441
|
+
//println!("conditions:{:?}", conditions);
|
|
442
|
+
for j in 0..(case_indexes.len() + control_indexes.len()) {
|
|
443
|
+
//println!("row[(0, j)]:{}", row[(0, j)]);
|
|
444
|
+
if case_indexes.contains(&j) {
|
|
445
|
+
treated.push(row[(0, j)]);
|
|
446
|
+
//println!("{},{}", input_data_vec.0[i][j], "Diseased");
|
|
447
|
+
} else if control_indexes.contains(&j) {
|
|
448
|
+
// + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
|
|
449
|
+
control.push(row[(0, j)]);
|
|
450
|
+
//println!("{},{}", input_data_vec.0[i][j], "Control");
|
|
451
|
+
} else {
|
|
452
|
+
panic!(
|
|
453
|
+
"Column {} could not be classified into case/control",
|
|
454
|
+
j
|
|
455
|
+
);
|
|
456
|
+
}
|
|
252
457
|
}
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
true
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
if (treated_mean.unwrap() / control_mean.unwrap())
|
|
266
|
-
.log2()
|
|
267
|
-
.is_nan()
|
|
268
|
-
== false
|
|
269
|
-
&& (treated_mean.unwrap() / control_mean.unwrap())
|
|
458
|
+
//println!("treated{:?}", treated);
|
|
459
|
+
//println!("control{:?}", control);
|
|
460
|
+
let p_value = stats_functions::wilcoxon_rank_sum_test(
|
|
461
|
+
treated.clone(),
|
|
462
|
+
control.clone(),
|
|
463
|
+
THRESHOLD,
|
|
464
|
+
't',
|
|
465
|
+
true,
|
|
466
|
+
); // Setting continuity correction to true in case of normal approximation
|
|
467
|
+
let treated_mean = Data::new(treated).mean();
|
|
468
|
+
let control_mean = Data::new(control).mean();
|
|
469
|
+
if (treated_mean.unwrap() / control_mean.unwrap())
|
|
270
470
|
.log2()
|
|
271
|
-
.
|
|
471
|
+
.is_nan()
|
|
272
472
|
== false
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
473
|
+
&& (treated_mean.unwrap() / control_mean.unwrap())
|
|
474
|
+
.log2()
|
|
475
|
+
.is_infinite()
|
|
476
|
+
== false
|
|
477
|
+
{
|
|
478
|
+
p_values.push(PValueIndexes {
|
|
479
|
+
index: i,
|
|
480
|
+
gene_name: filtered_genes[i].to_owned(),
|
|
481
|
+
gene_symbol: filtered_gene_symbols[i].to_owned(),
|
|
482
|
+
fold_change: (treated_mean.unwrap() / control_mean.unwrap())
|
|
483
|
+
.log2(),
|
|
484
|
+
p_value: p_value,
|
|
485
|
+
});
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
} else {
|
|
489
|
+
// Multithreaded implementation of calculating wilcoxon p-values
|
|
490
|
+
let normalized_matrix_temp = Arc::new(normalized_matrix);
|
|
491
|
+
let filtered_genes_temp = Arc::new(filtered_genes);
|
|
492
|
+
let filtered_gene_symbols_temp = Arc::new(filtered_gene_symbols);
|
|
493
|
+
let case_indexes_temp = Arc::new(case_indexes);
|
|
494
|
+
let control_indexes_temp = Arc::new(control_indexes);
|
|
495
|
+
let p_values_temp = Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
|
|
496
|
+
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
497
|
+
for thread_num in 0..max_threads {
|
|
498
|
+
let normalized_matrix_temp = Arc::clone(&normalized_matrix_temp);
|
|
499
|
+
let case_indexes_temp = Arc::clone(&case_indexes_temp);
|
|
500
|
+
let control_indexes_temp = Arc::clone(&control_indexes_temp);
|
|
501
|
+
let p_values_temp = Arc::clone(&p_values_temp);
|
|
502
|
+
let filtered_genes_temp = Arc::clone(&filtered_genes_temp);
|
|
503
|
+
let filtered_gene_symbols_temp =
|
|
504
|
+
Arc::clone(&filtered_gene_symbols_temp);
|
|
505
|
+
let handle = thread::spawn(move || {
|
|
506
|
+
let mut p_values_thread: Vec<PValueIndexes> = Vec::with_capacity(
|
|
507
|
+
normalized_matrix_temp.nrows() / max_threads,
|
|
508
|
+
);
|
|
509
|
+
for i in 0..normalized_matrix_temp.nrows() {
|
|
510
|
+
let remainder: usize = i % max_threads; // Calculate remainder of iteration number divided by max_threads to decide which thread parses the row
|
|
511
|
+
if remainder == thread_num {
|
|
512
|
+
let row = normalized_matrix_temp.row(i);
|
|
513
|
+
//println!("row:{:?}", row);
|
|
514
|
+
let mut treated = Vec::<f64>::new();
|
|
515
|
+
let mut control = Vec::<f64>::new();
|
|
516
|
+
//println!("conditions:{:?}", conditions);
|
|
517
|
+
for j in 0..(case_indexes_temp.len()
|
|
518
|
+
+ control_indexes_temp.len())
|
|
519
|
+
{
|
|
520
|
+
//println!("row[(0, j)]:{}", row[(0, j)]);
|
|
521
|
+
if case_indexes_temp.contains(&j) {
|
|
522
|
+
treated.push(row[(0, j)]);
|
|
523
|
+
//println!("{},{}", input_data_vec.0[i][j], "Diseased");
|
|
524
|
+
} else if control_indexes_temp.contains(&j) {
|
|
525
|
+
// + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
|
|
526
|
+
control.push(row[(0, j)]);
|
|
527
|
+
//println!("{},{}", input_data_vec.0[i][j], "Control");
|
|
528
|
+
} else {
|
|
529
|
+
panic!(
|
|
530
|
+
"Column {} could not be classified into case/control",
|
|
531
|
+
j
|
|
532
|
+
);
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
//println!("treated{:?}", treated);
|
|
536
|
+
//println!("control{:?}", control);
|
|
537
|
+
let p_value = stats_functions::wilcoxon_rank_sum_test(
|
|
538
|
+
treated.clone(),
|
|
539
|
+
control.clone(),
|
|
540
|
+
THRESHOLD,
|
|
541
|
+
't',
|
|
542
|
+
true,
|
|
543
|
+
); // Setting continuity correction to true in case of normal approximation
|
|
544
|
+
let treated_mean = Data::new(treated).mean();
|
|
545
|
+
let control_mean = Data::new(control).mean();
|
|
546
|
+
if (treated_mean.unwrap() / control_mean.unwrap())
|
|
547
|
+
.log2()
|
|
548
|
+
.is_nan()
|
|
549
|
+
== false
|
|
550
|
+
&& (treated_mean.unwrap() / control_mean.unwrap())
|
|
551
|
+
.log2()
|
|
552
|
+
.is_infinite()
|
|
553
|
+
== false
|
|
554
|
+
{
|
|
555
|
+
p_values_thread.push(PValueIndexes {
|
|
556
|
+
index: i,
|
|
557
|
+
gene_name: filtered_genes_temp[i].to_owned(),
|
|
558
|
+
gene_symbol: filtered_gene_symbols_temp[i]
|
|
559
|
+
.to_owned(),
|
|
560
|
+
fold_change: (treated_mean.unwrap()
|
|
561
|
+
/ control_mean.unwrap())
|
|
562
|
+
.log2(),
|
|
563
|
+
p_value: p_value,
|
|
564
|
+
});
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
p_values_temp.lock().unwrap().append(&mut p_values_thread);
|
|
280
569
|
});
|
|
570
|
+
handles.push(handle);
|
|
571
|
+
}
|
|
572
|
+
for handle in handles {
|
|
573
|
+
// Wait for all threads to finish before proceeding further
|
|
574
|
+
handle.join().unwrap();
|
|
281
575
|
}
|
|
576
|
+
p_values.append(&mut *p_values_temp.lock().unwrap());
|
|
282
577
|
}
|
|
283
578
|
//println!("p_values:{:?}", p_values);
|
|
284
579
|
println!(
|
|
285
580
|
"Time for running {} wilcoxon tests:{:?}",
|
|
286
|
-
|
|
581
|
+
num_normalized_rows,
|
|
287
582
|
now2.elapsed()
|
|
288
583
|
);
|
|
289
584
|
let adjusted_p_values = adjust_p_values(p_values);
|
|
@@ -408,18 +703,62 @@ fn tmm_normalization(
|
|
|
408
703
|
}
|
|
409
704
|
}
|
|
410
705
|
//println!("ref_column:{}", ref_column);
|
|
411
|
-
let
|
|
412
|
-
let ref_lib_size = lib_sizes[ref_column];
|
|
706
|
+
let num_cols = input_matrix.ncols();
|
|
413
707
|
let mut f: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
|
|
414
|
-
|
|
415
|
-
let
|
|
416
|
-
let
|
|
417
|
-
|
|
418
|
-
obs_data
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
708
|
+
if input_matrix.nrows() * input_matrix.ncols() < PAR_CUTOFF {
|
|
709
|
+
let ref_data = input_matrix.column(ref_column);
|
|
710
|
+
let ref_lib_size = lib_sizes[ref_column];
|
|
711
|
+
for col in 0..input_matrix.ncols() {
|
|
712
|
+
let obs_data = input_matrix.column(col);
|
|
713
|
+
let obs_lib_size = lib_sizes[col];
|
|
714
|
+
f.push(calc_factor_tmm(
|
|
715
|
+
obs_data,
|
|
716
|
+
&ref_data,
|
|
717
|
+
ref_lib_size,
|
|
718
|
+
obs_lib_size,
|
|
719
|
+
));
|
|
720
|
+
}
|
|
721
|
+
} else {
|
|
722
|
+
// Multithreaded implementation of TMM normalization
|
|
723
|
+
let f_temp = Arc::new(Mutex::new(Vec::<f_index>::new()));
|
|
724
|
+
let lib_sizes_temp = Arc::new(lib_sizes.to_owned());
|
|
725
|
+
let input_matrix_temp = Arc::new(input_matrix);
|
|
726
|
+
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
727
|
+
for thread_num in 0..max_threads {
|
|
728
|
+
let f_temp = Arc::clone(&f_temp);
|
|
729
|
+
let lib_sizes_temp = Arc::clone(&lib_sizes_temp);
|
|
730
|
+
let input_matrix_temp = Arc::clone(&input_matrix_temp);
|
|
731
|
+
let handle = thread::spawn(move || {
|
|
732
|
+
let mut f_thread: Vec<f_index> =
|
|
733
|
+
Vec::with_capacity(input_matrix_temp.ncols() / max_threads);
|
|
734
|
+
let ref_data = input_matrix_temp.column(ref_column);
|
|
735
|
+
let ref_lib_size = lib_sizes_temp[ref_column];
|
|
736
|
+
for col in 0..input_matrix_temp.ncols() {
|
|
737
|
+
let remainder: usize = col % max_threads; // Calculate remainder of column number divided by max_threads to decide which thread parses this column
|
|
738
|
+
if remainder == thread_num {
|
|
739
|
+
let obs_data = input_matrix_temp.column(col);
|
|
740
|
+
let obs_lib_size = lib_sizes_temp[col];
|
|
741
|
+
f_thread.push(f_index {
|
|
742
|
+
f: calc_factor_tmm(obs_data, &ref_data, ref_lib_size, obs_lib_size),
|
|
743
|
+
ind: col,
|
|
744
|
+
})
|
|
745
|
+
}
|
|
746
|
+
}
|
|
747
|
+
f_temp.lock().unwrap().append(&mut f_thread);
|
|
748
|
+
});
|
|
749
|
+
handles.push(handle);
|
|
750
|
+
}
|
|
751
|
+
for handle in handles {
|
|
752
|
+
// Wait for all threads to finish before proceeding further
|
|
753
|
+
handle.join().unwrap();
|
|
754
|
+
}
|
|
755
|
+
let mut f_orig: Vec<f_index> = Vec::with_capacity(num_cols);
|
|
756
|
+
f_orig.append(&mut *f_temp.lock().unwrap());
|
|
757
|
+
// Need to sort vector because the vector will not be ordered accord to ind because of multithreading
|
|
758
|
+
f_orig
|
|
759
|
+
.as_mut_slice()
|
|
760
|
+
.sort_by(|a, b| (a.ind).partial_cmp(&b.ind).unwrap_or(Ordering::Equal));
|
|
761
|
+
f = f_orig.into_iter().map(|x| x.f).collect::<Vec<f64>>();
|
|
423
762
|
}
|
|
424
763
|
const NATURAL_E: f64 = 2.718281828459;
|
|
425
764
|
let log_f: Vec<f64> = f.clone().into_iter().map(|x| x.log(NATURAL_E)).collect();
|
|
@@ -427,6 +766,11 @@ fn tmm_normalization(
|
|
|
427
766
|
let final_f: Vec<f64> = f.into_iter().map(|x| x / exp_mean_log_f).collect();
|
|
428
767
|
final_f
|
|
429
768
|
}
|
|
769
|
+
#[allow(non_camel_case_types)]
|
|
770
|
+
struct f_index {
|
|
771
|
+
f: f64,
|
|
772
|
+
ind: usize,
|
|
773
|
+
}
|
|
430
774
|
|
|
431
775
|
fn calc_factor_tmm(
|
|
432
776
|
obs_data: Matrix<f64, Dyn, Const<1>, ViewStorage<'_, f64, Dyn, Const<1>, Const<1>, Dyn>>,
|
|
@@ -537,7 +881,7 @@ fn rank_vector(input_vector: &Vec<f64>) -> Vec<f64> {
|
|
|
537
881
|
rank: i as f64 + 1.0,
|
|
538
882
|
});
|
|
539
883
|
} else {
|
|
540
|
-
frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
|
|
884
|
+
frac_rank = stats_functions::calculate_frac_rank(i as f64 + 1.0, num_repeats);
|
|
541
885
|
ranks.push(RankOutput {
|
|
542
886
|
orig_index: input_vector_sorted[i].orig_index,
|
|
543
887
|
rank: frac_rank,
|
|
@@ -750,274 +1094,3 @@ fn cpm(
|
|
|
750
1094
|
//println!("output_matrix:{:?}", output_matrix);
|
|
751
1095
|
output_matrix
|
|
752
1096
|
}
|
|
753
|
-
|
|
754
|
-
pub fn wilcoxon_rank_sum_test(
|
|
755
|
-
mut group1: Vec<f64>,
|
|
756
|
-
mut group2: Vec<f64>,
|
|
757
|
-
threshold: usize,
|
|
758
|
-
alternative: char,
|
|
759
|
-
correct: bool,
|
|
760
|
-
) -> f64 {
|
|
761
|
-
// Check if there are any ties between the two groups
|
|
762
|
-
|
|
763
|
-
let mut combined = group1.clone();
|
|
764
|
-
combined.extend(group2.iter().cloned());
|
|
765
|
-
combined.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
766
|
-
//println!("combined:{:?}", combined);
|
|
767
|
-
|
|
768
|
-
group1.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
769
|
-
group2.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
770
|
-
//println!("group1:{:?}", group1);
|
|
771
|
-
//println!("group2:{:?}", group2);
|
|
772
|
-
|
|
773
|
-
let mut group1_iter = 0;
|
|
774
|
-
let mut group2_iter = 0;
|
|
775
|
-
let mut xy: Vec<char> = Vec::with_capacity(combined.len()); // Stores X-Y classification
|
|
776
|
-
let mut ranks: Vec<f64> = Vec::with_capacity(combined.len()); // Stores the rank of each element
|
|
777
|
-
let mut is_repeat = false;
|
|
778
|
-
let mut repeat_present = false;
|
|
779
|
-
let mut frac_rank: f64 = 0.0;
|
|
780
|
-
let mut num_repeats: f64 = 1.0;
|
|
781
|
-
let mut repeat_iter: f64 = 1.0;
|
|
782
|
-
#[allow(unused_variables)]
|
|
783
|
-
let mut weight_x: f64 = 0.0;
|
|
784
|
-
let mut weight_y: f64 = 0.0;
|
|
785
|
-
let mut group_char: char = 'X';
|
|
786
|
-
let mut rank_frequencies: Vec<f64> = Vec::with_capacity(combined.len());
|
|
787
|
-
for i in 0..combined.len() {
|
|
788
|
-
//println!("group1_iter:{}", group1_iter);
|
|
789
|
-
//println!("group2_iter:{}", group2_iter);
|
|
790
|
-
//println!("item1:{}", combined[i]);
|
|
791
|
-
//println!("is_repeat:{}", is_repeat);
|
|
792
|
-
if group1_iter < group1.len() && combined[i] == group1[group1_iter] {
|
|
793
|
-
xy.push('X');
|
|
794
|
-
group1_iter += 1;
|
|
795
|
-
group_char = 'X';
|
|
796
|
-
} else if group2_iter < group2.len() && combined[i] == group2[group2_iter] {
|
|
797
|
-
xy.push('Y');
|
|
798
|
-
group2_iter += 1;
|
|
799
|
-
group_char = 'Y';
|
|
800
|
-
}
|
|
801
|
-
|
|
802
|
-
// Computing ranks
|
|
803
|
-
if is_repeat == false {
|
|
804
|
-
// Check if current element has other occurences
|
|
805
|
-
num_repeats = 1.0;
|
|
806
|
-
for j in i + 1..combined.len() {
|
|
807
|
-
if combined[i] == combined[j] {
|
|
808
|
-
is_repeat = true;
|
|
809
|
-
repeat_present = true;
|
|
810
|
-
repeat_iter = 1.0;
|
|
811
|
-
num_repeats += 1.0;
|
|
812
|
-
} else {
|
|
813
|
-
break;
|
|
814
|
-
}
|
|
815
|
-
}
|
|
816
|
-
//println!("num_repeats:{}", num_repeats);
|
|
817
|
-
if is_repeat == false {
|
|
818
|
-
ranks.push(i as f64 + 1.0);
|
|
819
|
-
if group_char == 'X' {
|
|
820
|
-
weight_x += i as f64 + 1.0;
|
|
821
|
-
} else if group_char == 'Y' {
|
|
822
|
-
weight_y += i as f64 + 1.0;
|
|
823
|
-
}
|
|
824
|
-
//rank_frequencies.push(RankFreq {
|
|
825
|
-
// rank: i as f64 + 1.0,
|
|
826
|
-
// freq: 1,
|
|
827
|
-
//});
|
|
828
|
-
rank_frequencies.push(1.0);
|
|
829
|
-
} else {
|
|
830
|
-
frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
|
|
831
|
-
ranks.push(frac_rank);
|
|
832
|
-
if group_char == 'X' {
|
|
833
|
-
weight_x += frac_rank;
|
|
834
|
-
} else if group_char == 'Y' {
|
|
835
|
-
weight_y += frac_rank
|
|
836
|
-
}
|
|
837
|
-
//rank_frequencies.push(RankFreq {
|
|
838
|
-
// rank: frac_rank,
|
|
839
|
-
// freq: num_repeats as usize,
|
|
840
|
-
//});
|
|
841
|
-
rank_frequencies.push(num_repeats);
|
|
842
|
-
}
|
|
843
|
-
} else if repeat_iter < num_repeats {
|
|
844
|
-
// Repeat case
|
|
845
|
-
ranks.push(frac_rank);
|
|
846
|
-
repeat_iter += 1.0;
|
|
847
|
-
if group_char == 'X' {
|
|
848
|
-
weight_x += frac_rank;
|
|
849
|
-
} else if group_char == 'Y' {
|
|
850
|
-
weight_y += frac_rank
|
|
851
|
-
}
|
|
852
|
-
if repeat_iter == num_repeats {
|
|
853
|
-
is_repeat = false;
|
|
854
|
-
}
|
|
855
|
-
} else {
|
|
856
|
-
//println!("i:{}", i);
|
|
857
|
-
ranks.push(i as f64 + 1.0);
|
|
858
|
-
repeat_iter = 1.0;
|
|
859
|
-
num_repeats = 1.0;
|
|
860
|
-
if group_char == 'X' {
|
|
861
|
-
weight_x += i as f64 + 1.0;
|
|
862
|
-
} else if group_char == 'Y' {
|
|
863
|
-
weight_y += i as f64 + 1.0;
|
|
864
|
-
}
|
|
865
|
-
}
|
|
866
|
-
}
|
|
867
|
-
//println!("rank_frequencies:{:?}", rank_frequencies);
|
|
868
|
-
//println!("xy:{:?}", xy);
|
|
869
|
-
//println!("ranks:{:?}", ranks);
|
|
870
|
-
//println!("weight_x:{}", weight_x);
|
|
871
|
-
//println!("weight_y:{}", weight_y);
|
|
872
|
-
|
|
873
|
-
//u_dash (calculated below) calculates the "W Statistic" in wilcox.test function in R
|
|
874
|
-
|
|
875
|
-
let u_y = weight_y - (group2.len() as f64 * (group2.len() as f64 + 1.0) / 2.0) as f64;
|
|
876
|
-
let u_dash_y = (u_y - (group1.len() * group2.len()) as f64).abs();
|
|
877
|
-
//println!("u_dash_y:{}", u_dash_y);
|
|
878
|
-
|
|
879
|
-
let u_x = weight_x - (group1.len() as f64 * (group1.len() as f64 + 1.0) / 2.0) as f64;
|
|
880
|
-
let _u_dash_x = (u_x - (group1.len() * group2.len()) as f64).abs();
|
|
881
|
-
//println!("u_dash_x:{}", u_dash_x);
|
|
882
|
-
|
|
883
|
-
// Calculate test_statistic
|
|
884
|
-
|
|
885
|
-
//let t1 = weight_x - ((group1.len() as f64) * (group1.len() as f64 + 1.0)) / 2.0;
|
|
886
|
-
//let t2 = weight_y - ((group2.len() as f64) * (group2.len() as f64 + 1.0)) / 2.0;
|
|
887
|
-
//
|
|
888
|
-
//let mut test_statistic = t1;
|
|
889
|
-
//if t2 < t1 {
|
|
890
|
-
// test_statistic = t2;
|
|
891
|
-
//}
|
|
892
|
-
|
|
893
|
-
//println!("test_statistic:{}", test_statistic);
|
|
894
|
-
|
|
895
|
-
if group1.len() < threshold && group2.len() < threshold && repeat_present == false {
|
|
896
|
-
// Compute exact p-values
|
|
897
|
-
|
|
898
|
-
// Calculate conditional probability for weight_y
|
|
899
|
-
|
|
900
|
-
if alternative == 'g' {
|
|
901
|
-
// Alternative "greater"
|
|
902
|
-
//if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
|
|
903
|
-
// iterate_exact_p_values(ranks, weight_y, group2.len())
|
|
904
|
-
//} else {
|
|
905
|
-
calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
|
|
906
|
-
//}
|
|
907
|
-
} else if alternative == 'l' {
|
|
908
|
-
// Alternative "lesser"
|
|
909
|
-
//if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
|
|
910
|
-
// iterate_exact_p_values(ranks, weight_x, group1.len())
|
|
911
|
-
//} else {
|
|
912
|
-
calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
|
|
913
|
-
//}
|
|
914
|
-
} else {
|
|
915
|
-
// Two-sided distribution
|
|
916
|
-
calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
|
|
917
|
-
}
|
|
918
|
-
} else {
|
|
919
|
-
// Compute p-values from a normal distribution
|
|
920
|
-
//println!("group1 length:{}", group1.len());
|
|
921
|
-
//println!("group2 length:{}", group2.len());
|
|
922
|
-
|
|
923
|
-
let mut z = u_dash_y - ((group1.len() * group2.len()) as f64) / 2.0;
|
|
924
|
-
//println!("z_original:{}", z);
|
|
925
|
-
let mut nties_sum: f64 = 0.0;
|
|
926
|
-
for i in 0..rank_frequencies.len() {
|
|
927
|
-
nties_sum += rank_frequencies[i] * rank_frequencies[i] * rank_frequencies[i]
|
|
928
|
-
- rank_frequencies[i];
|
|
929
|
-
}
|
|
930
|
-
|
|
931
|
-
let sigma = (((group1.len() * group2.len()) as f64) / 12.0
|
|
932
|
-
* ((group1.len() + group2.len() + 1) as f64
|
|
933
|
-
- nties_sum
|
|
934
|
-
/ (((group1.len() + group2.len()) as f64)
|
|
935
|
-
* ((group1.len() + group2.len() - 1) as f64))))
|
|
936
|
-
.sqrt();
|
|
937
|
-
//println!("sigma:{}", sigma);
|
|
938
|
-
let mut correction: f64 = 0.0;
|
|
939
|
-
if correct == true {
|
|
940
|
-
if alternative == 'g' {
|
|
941
|
-
// Alternative "greater"
|
|
942
|
-
correction = 0.5;
|
|
943
|
-
} else if alternative == 'l' {
|
|
944
|
-
// Alternative "lesser"
|
|
945
|
-
correction = -0.5;
|
|
946
|
-
} else {
|
|
947
|
-
// Alternative "two-sided"
|
|
948
|
-
if z > 0.0 {
|
|
949
|
-
correction = 0.5;
|
|
950
|
-
} else if z < 0.0 {
|
|
951
|
-
correction = -0.5;
|
|
952
|
-
} else {
|
|
953
|
-
// z=0
|
|
954
|
-
correction = 0.0;
|
|
955
|
-
}
|
|
956
|
-
}
|
|
957
|
-
}
|
|
958
|
-
z = (z - correction) / sigma;
|
|
959
|
-
//println!("z:{}", z);
|
|
960
|
-
if alternative == 'g' {
|
|
961
|
-
// Alternative "greater"
|
|
962
|
-
//println!("greater:{}", n.cdf(weight_y));
|
|
963
|
-
//1.0 - n.cdf(z) // Applying continuity correction
|
|
964
|
-
r_mathlib::normal_cdf(z, 0.0, 1.0, false, false)
|
|
965
|
-
} else if alternative == 'l' {
|
|
966
|
-
// Alternative "lesser"
|
|
967
|
-
//println!("lesser:{}", n.cdf(weight_x));
|
|
968
|
-
//n.cdf(z) // Applying continuity coorection
|
|
969
|
-
r_mathlib::normal_cdf(z, 0.0, 1.0, true, false)
|
|
970
|
-
} else {
|
|
971
|
-
// Alternative "two-sided"
|
|
972
|
-
let p_g = r_mathlib::normal_cdf(z, 0.0, 1.0, false, false); // Applying continuity correction
|
|
973
|
-
let p_l = r_mathlib::normal_cdf(z, 0.0, 1.0, true, false); // Applying continuity correction
|
|
974
|
-
let mut p_value;
|
|
975
|
-
if p_g < p_l {
|
|
976
|
-
p_value = 2.0 * p_g;
|
|
977
|
-
} else {
|
|
978
|
-
p_value = 2.0 * p_l;
|
|
979
|
-
}
|
|
980
|
-
//println!("p_value:{}", p_value);
|
|
981
|
-
if p_value > 1.0 {
|
|
982
|
-
p_value = 1.0;
|
|
983
|
-
}
|
|
984
|
-
p_value
|
|
985
|
-
}
|
|
986
|
-
}
|
|
987
|
-
}
|
|
988
|
-
|
|
989
|
-
// To be used only when there are no ties in the input data
|
|
990
|
-
#[allow(dead_code)]
|
|
991
|
-
fn calculate_exact_probability(weight: f64, x: usize, y: usize, alternative: char) -> f64 {
|
|
992
|
-
//println!("Using Wilcoxon CDF");
|
|
993
|
-
let mut p_value;
|
|
994
|
-
if alternative == 't' {
|
|
995
|
-
if weight > ((x * y) as f64) / 2.0 {
|
|
996
|
-
p_value = 2.0 * r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
|
|
997
|
-
} else {
|
|
998
|
-
p_value = 2.0 * r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
|
|
999
|
-
}
|
|
1000
|
-
if p_value > 1.0 {
|
|
1001
|
-
p_value = 1.0;
|
|
1002
|
-
}
|
|
1003
|
-
} else if alternative == 'g' {
|
|
1004
|
-
p_value = r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
|
|
1005
|
-
} else if alternative == 'l' {
|
|
1006
|
-
p_value = r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
|
|
1007
|
-
} else {
|
|
1008
|
-
// Should not happen
|
|
1009
|
-
panic!("Unknown alternative option given, please check!");
|
|
1010
|
-
}
|
|
1011
|
-
//println!("p_value:{}", p_value);
|
|
1012
|
-
p_value
|
|
1013
|
-
}
|
|
1014
|
-
|
|
1015
|
-
/// Returns the fractional (mid) rank shared by a run of tied values.
///
/// Averages the `num_repeats` consecutive integer ranks starting at
/// `current_rank` — e.g. two tied values occupying ranks 3 and 4 each
/// receive rank 3.5.
#[allow(dead_code)]
pub fn calculate_frac_rank(current_rank: f64, num_repeats: f64) -> f64 {
    // Sum the consecutive ranks in the tie run, then divide by its length.
    let total: f64 = (0..num_repeats as usize)
        .map(|offset| current_rank + offset as f64)
        .sum();
    total / num_repeats
}
|