@sjcrh/proteinpaint-rust 2.27.0 → 2.29.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +7 -2
- package/package.json +1 -1
- package/src/DEanalysis.rs +1010 -0
- package/src/stats_functions.rs +275 -0
- package/src/wilcoxon.rs +2 -272
package/src/DEanalysis.rs
@@ -0,0 +1,1010 @@
// cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/expression
#![allow(non_snake_case)]
use json;
use nalgebra::base::dimension::Const;
use nalgebra::base::dimension::Dyn;
use nalgebra::base::Matrix;
use nalgebra::base::VecStorage;
use nalgebra::DMatrix;
use nalgebra::ViewStorage;
use r_mathlib;
use serde::{Deserialize, Serialize};
use serde_json;
use statrs::statistics::Data;
use statrs::statistics::Distribution;
use statrs::statistics::Median;
use std::cmp::Ordering;
use std::path::Path;
use std::str::FromStr;
use std::time::Instant;
//use std::cmp::Ordering;
//use std::env;
use std::io;
//mod stats_functions; // Importing Wilcoxon function from stats_functions.rs

fn input_data(
    filename: &String,
    case_list: &Vec<&str>,
    control_list: &Vec<&str>,
) -> (
    Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
    Vec<usize>,
    Vec<usize>,
    Vec<String>,
    Vec<String>,
) {
    // Build the CSV reader and iterate over each record.
    let path = Path::new(filename);
    let mut rdr = csv::Reader::from_path(path).unwrap();
    let mut num_lines: usize = 0;
    let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
    let mut gene_names: Vec<String> = Vec::with_capacity(65000);
    let mut gene_symbols: Vec<String> = Vec::with_capacity(65000);
    let mut num_columns: usize = 0;

    // Check headers for samples
    let header_line = rdr.headers().unwrap();
    let mut headers: Vec<&str> = Vec::with_capacity(1500);
    for field in header_line.iter() {
        headers = field.split('\t').collect::<Vec<&str>>();
    }
    //println!("headers:{:?}", headers);
    let mut case_indexes_original: Vec<usize> = Vec::with_capacity(case_list.len());
    let mut control_indexes_original: Vec<usize> = Vec::with_capacity(control_list.len());
    let gene_name_index = headers.iter().position(|r| r == &"geneID");
    let gene_symbol_index = headers.iter().position(|r| r == &"geneSymbol");

    for item in case_list {
        //println!("item:{}", item);
        let index = headers.iter().position(|r| r == item);
        match index {
            Some(n) => case_indexes_original.push(n),
            None => {
                // When sample not found, give error stating the sample name is not found
                panic!("Case sample not found:{}", item);
            }
        }
    }

    for item in control_list {
        //println!("item:{}", item);
        let index = headers.iter().position(|r| r == item);
        match index {
            Some(n) => control_indexes_original.push(n),
            None => {
                // When sample not found, give error stating the sample name is not found
                panic!("Control sample not found:{}", item);
            }
        }
    }
    //println!("case_indexes_original:{:?}", case_indexes_original);
    //println!("control_indexes_original:{:?}", control_indexes_original);

    let mut case_indexes: Vec<usize> = Vec::with_capacity(case_list.len());
    let mut control_indexes: Vec<usize> = Vec::with_capacity(control_list.len());
    for result in rdr.records() {
        // The iterator yields Result<StringRecord, Error>, so we check the
        // error here.
        let record = result.unwrap();
        //println!("record:{:?}", record);
        let mut index = 0;
        for field in record[0].split('\t').collect::<Vec<&str>>() {
            if index == gene_name_index.unwrap() {
                gene_names.push(field.to_string());
            } else if index == gene_symbol_index.unwrap() {
                gene_symbols.push(field.to_string());
            } else if case_indexes_original.contains(&index) {
                let num = FromStr::from_str(field);
                match num {
                    Ok(n) => {
                        //println!("n:{}", n);
                        input_vector.push(n);
                        if num_lines == 0 {
                            case_indexes.push(num_columns);
                            num_columns += 1;
                        }
                    }
                    Err(_n) => {
                        panic!(
                            "Number {} in line {} and column {} is not a decimal number",
                            field,
                            num_lines + 1,
                            index + 1
                        );
                    }
                }
            } else if control_indexes_original.contains(&index) {
                let num = FromStr::from_str(field);
                match num {
                    Ok(n) => {
                        //println!("n:{}", n);
                        input_vector.push(n);
                        if num_lines == 0 {
                            control_indexes.push(num_columns);
                            num_columns += 1;
                        }
                    }
                    Err(_n) => {
                        panic!(
                            "Number {} in line {} and column {} is not a decimal number",
                            field,
                            num_lines + 1,
                            index + 1
                        );
                    }
                }
            }
            index += 1;
        }
        num_lines += 1;
    }
    //println!("case_indexes:{:?}", case_indexes);
    //println!("control_indexes:{:?}", control_indexes);

    let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
    //println!("dm:{:?}", dm);
    (dm, case_indexes, control_indexes, gene_names, gene_symbols)
}

#[allow(dead_code)]
#[derive(Debug, Serialize, Deserialize)]
struct AdjustedPValueIndexes {
    index: usize,
    gene_name: String,
    gene_symbol: String,
    fold_change: f64,
    original_p_value: f64,
    adjusted_p_value: f64,
}

struct PValueIndexes {
    index: usize,
    gene_name: String,
    gene_symbol: String,
    fold_change: f64,
    p_value: f64,
}

fn main() {
    //env::set_var("RUST_BACKTRACE", "full");
    let mut input = String::new();
    //env::set_var("RUST_BACKTRACE", "1");
    match io::stdin().read_line(&mut input) {
        // Accepting the piped input from nodejs (or command line from testing)
        Ok(_bytes_read) => {
            //println!("{} bytes read", bytes_read);
            //println!("{}", input);
            let input_json = json::parse(&input);
            match input_json {
                Ok(json_string) => {
                    let now = Instant::now();
                    let case_string = &json_string["case"].to_owned().as_str().unwrap().to_string();
                    let control_string = &json_string["control"]
                        .to_owned()
                        .as_str()
                        .unwrap()
                        .to_string();
                    let file_name = &json_string["input_file"]
                        .to_owned()
                        .as_str()
                        .unwrap()
                        .to_string()
                        .split(",")
                        .collect();
                    let case_list: Vec<&str> = case_string.split(",").collect();
                    let control_list: Vec<&str> = control_string.split(",").collect();
                    let (input_matrix, case_indexes, control_indexes, gene_names, gene_symbols) =
                        input_data(file_name, &case_list, &control_list);
                    let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
                        filter_by_expr(
                            &input_matrix,
                            case_list.len(),
                            control_list.len(),
                            gene_names,
                            gene_symbols,
                        );
                    //println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
                    //println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
                    let mut normalized_matrix = cpm(&filtered_matrix);
                    let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
                    //println!("norm_factors:{:?}", norm_factors);

                    for col in 0..normalized_matrix.ncols() {
                        let norm_factor = norm_factors[col];
                        for row in 0..normalized_matrix.nrows() {
                            normalized_matrix[(row, col)] =
                                normalized_matrix[(row, col)] / norm_factor;
                        }
                    }
                    //println!("normalized_matrix:{:?}", normalized_matrix);
                    println!("Number of cases:{}", case_list.len());
                    println!("Number of controls:{}", control_list.len());
                    println!("Time for pre-processing:{:?}", now.elapsed());
                    // Using Wilcoxon test for differential gene expression

                    let now2 = Instant::now();
                    let mut p_values: Vec<PValueIndexes> =
                        Vec::with_capacity(normalized_matrix.nrows());
                    const THRESHOLD: usize = 50; // This determines whether the Wilcoxon exact test or the normal test will be used based on sample size.

                    //println!("case_indexes:{:?}", case_indexes);
                    //println!("control_indexes:{:?}", control_indexes);
                    for i in 0..normalized_matrix.nrows() {
                        let row = normalized_matrix.row(i);
                        //println!("row:{:?}", row);
                        let mut treated = Vec::<f64>::new();
                        let mut control = Vec::<f64>::new();
                        //println!("conditions:{:?}", conditions);
                        for j in 0..(case_indexes.len() + control_indexes.len()) {
                            //println!("row[(0, j)]:{}", row[(0, j)]);
                            if case_indexes.contains(&j) {
                                treated.push(row[(0, j)]);
                                //println!("{},{}", input_data_vec.0[i][j], "Diseased");
                            } else if control_indexes.contains(&j) {
                                // + 1 was added because in the input file the first column of the first row is blank, as the first column consists of gene names
                                control.push(row[(0, j)]);
                                //println!("{},{}", input_data_vec.0[i][j], "Control");
                            } else {
                                panic!("Column {} could not be classified into case/control", j);
                            }
                        }
                        //println!("treated{:?}", treated);
                        //println!("control{:?}", control);
                        let p_value = wilcoxon_rank_sum_test(
                            treated.clone(),
                            control.clone(),
                            THRESHOLD,
                            't',
                            true,
                        ); // Setting continuity correction to true in case of normal approximation
                        let treated_mean = Data::new(treated).mean();
                        let control_mean = Data::new(control).mean();
                        p_values.push(PValueIndexes {
                            index: i,
                            gene_name: filtered_genes[i].to_owned(),
                            gene_symbol: filtered_gene_symbols[i].to_owned(),
                            fold_change: (treated_mean.unwrap() / control_mean.unwrap()).log2(),
                            p_value: p_value,
                        });
                    }
                    //println!("p_values:{:?}", p_values);
                    println!(
                        "Time for running {} wilcoxon tests:{:?}",
                        normalized_matrix.nrows(),
                        now2.elapsed()
                    );
                    let adjusted_p_values = adjust_p_values(p_values);
                    println!("adjusted_p_values:{}", adjusted_p_values);
                    //let fold_changes =
                    //    calculate_fold_change(normalized_matrix, case_indexes, control_indexes);
                }
                Err(error) => println!("Incorrect json: {}", error),
            }
        }
        Err(error) => println!("Piping error: {}", error),
    }
}

fn adjust_p_values(mut original_p_values: Vec<PValueIndexes>) -> String {
    // Sorting p-values in ascending order
    original_p_values.as_mut_slice().sort_by(|a, b| {
        (a.p_value)
            .partial_cmp(&b.p_value)
            .unwrap_or(Ordering::Equal)
    });

    let mut adjusted_p_values: Vec<AdjustedPValueIndexes> =
        Vec::with_capacity(original_p_values.len());
    let mut old_p_value: f64 = 0.0;
    let mut rank: f64 = original_p_values.len() as f64;
    for j in 0..original_p_values.len() {
        let i = original_p_values.len() - j - 1;

        //println!("p_val:{}", p_val);
        let mut adjusted_p_val: f64 =
            original_p_values[i].p_value * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
        if adjusted_p_val > 1.0 {
            // p_value should NEVER be greater than 1
            adjusted_p_val = 1.0;
        }
        //println!("Original p_value:{}", original_p_values[i].p_value);
        //println!("Raw adjusted p_value:{}", adjusted_p_value);
        if i != original_p_values.len() - 1 {
            if adjusted_p_val > old_p_value {
                adjusted_p_val = old_p_value;
            }
        }
        old_p_value = adjusted_p_val;
        //println!("adjusted_p_value:{}", adjusted_p_val);
        rank -= 1.0;

        adjusted_p_values.push(AdjustedPValueIndexes {
            index: original_p_values[i].index,
            fold_change: original_p_values[i].fold_change,
            gene_name: original_p_values[i].gene_name.to_owned(),
            gene_symbol: original_p_values[i].gene_symbol.to_owned(),
            original_p_value: (-1.0) * original_p_values[i].p_value.log10(),
            adjusted_p_value: (-1.0) * adjusted_p_val.log10(),
        });
    }
    adjusted_p_values.sort_by(|a, b| a.index.cmp(&b.index));

    let mut output_string = "[".to_string();
    for i in 0..adjusted_p_values.len() {
        output_string += &serde_json::to_string(&adjusted_p_values[i]).unwrap();
        if i != adjusted_p_values.len() - 1 {
            output_string += &",".to_string();
        }
    }
    output_string += &"]".to_string();
    output_string
}
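
Aside (illustrative, not part of the package source): adjust_p_values above is a Benjamini-Hochberg step-up adjustment: each p-value is multiplied by N/rank, capped at 1, and made monotone while walking from the largest p-value down, before being reported on a -log10 scale. A minimal standalone sketch of the same procedure on a plain Vec<f64>, using only the standard library:

// Benjamini-Hochberg adjustment of a p-value vector (standalone sketch, not package code).
fn bh_adjust(p: &[f64]) -> Vec<f64> {
    let n = p.len();
    // Indices sorted by ascending p-value.
    let mut order: Vec<usize> = (0..n).collect();
    order.sort_by(|&a, &b| p[a].partial_cmp(&p[b]).unwrap());

    let mut adjusted = vec![0.0; n];
    let mut running_min = f64::INFINITY;
    // Walk from the largest p-value down, enforcing monotonicity and the cap at 1.
    for (i, &idx) in order.iter().enumerate().rev() {
        let rank = (i + 1) as f64;
        let candidate = (p[idx] * n as f64 / rank).min(1.0);
        running_min = running_min.min(candidate);
        adjusted[idx] = running_min;
    }
    adjusted
}

fn main() {
    let p = vec![0.01, 0.04, 0.03, 0.005];
    // Prints [0.02, 0.04, 0.04, 0.02], matching R's p.adjust(p, method = "BH").
    println!("{:?}", bh_adjust(&p));
}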

#[allow(dead_code)]
fn adjust_p_values_bonferroni(original_p_values: Vec<PValueIndexes>) -> Vec<AdjustedPValueIndexes> {
    let mut adjusted_p_values: Vec<AdjustedPValueIndexes> =
        Vec::with_capacity(original_p_values.len());
    for i in 0..original_p_values.len() {
        let mut adjusted_p_value: f64 =
            original_p_values[i].p_value * original_p_values.len() as f64; // In bonferroni correction, multiplying p_value by number of tests (excluding those with low sample sizes)
        if adjusted_p_value > 1.0 {
            // p_value should NEVER be greater than 1
            adjusted_p_value = 1.0;
        }
        adjusted_p_values.push(AdjustedPValueIndexes {
            index: original_p_values[i].index,
            gene_name: original_p_values[i].gene_name.to_owned(),
            gene_symbol: original_p_values[i].gene_symbol.to_owned(),
            fold_change: original_p_values[i].fold_change,
            original_p_value: (-1.0) * original_p_values[i].p_value.log10(),
            adjusted_p_value: (-1.0) * adjusted_p_value.log10(),
        });
    }
    adjusted_p_values
}

fn tmm_normalization(
    input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
    lib_sizes: &Vec<f64>,
) -> Vec<f64> {
    //println!("Unnormalized matrix:{:?}", input_matrix);
    let f75 = calc_factor_quantile(&input_matrix, lib_sizes);
    //println!("f75:{:?}", f75);
    let mut ref_column = 0;
    if Data::new(f75.clone()).median() < 1e-20 {
        let mut max = 0.0;
        for col in 0..input_matrix.ncols() {
            let mut col_sum = 0.0;
            for row in 0..input_matrix.nrows() {
                col_sum += (input_matrix[(row, col)] as f64).sqrt();
            }
            if col_sum > max {
                max = col_sum;
                ref_column = col;
            }
        }
    } else {
        let mut min = f64::INFINITY;
        let f75_mean = Data::new(f75.clone()).mean();
        for i in 0..f75.len() {
            let num = (f75[i] - f75_mean.unwrap()).abs();
            if num < min {
                min = num;
                ref_column = i;
            }
        }
    }
    //println!("ref_column:{}", ref_column);
    let ref_data = input_matrix.column(ref_column);
    let ref_lib_size = lib_sizes[ref_column];
    let mut f: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
    for col in 0..input_matrix.ncols() {
        let obs_data = input_matrix.column(col);
        let obs_lib_size = lib_sizes[col];
        f.push(calc_factor_tmm(
            obs_data,
            &ref_data,
            ref_lib_size,
            obs_lib_size,
        ));
    }
    const NATURAL_E: f64 = 2.718281828459;
    let log_f: Vec<f64> = f.clone().into_iter().map(|x| x.log(NATURAL_E)).collect();
    let exp_mean_log_f = Data::new(log_f).mean().unwrap().exp();
    let final_f: Vec<f64> = f.into_iter().map(|x| x / exp_mean_log_f).collect();
    final_f
}
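
Aside (illustrative, not part of the package source): the final step of tmm_normalization divides every factor by exp(mean(log f)), so the returned factors have a geometric mean of 1, mirroring the R code this file ports. A small standalone check of that centering step:

// Center scale factors so their geometric mean (and product) is 1 (standalone sketch).
fn center_factors(f: &[f64]) -> Vec<f64> {
    let mean_log = f.iter().map(|x| x.ln()).sum::<f64>() / f.len() as f64;
    let geo_mean = mean_log.exp();
    f.iter().map(|x| x / geo_mean).collect()
}

fn main() {
    let centered = center_factors(&[0.8, 1.0, 1.5]);
    // The product of the centered factors is 1 up to floating-point error.
    println!("{:?} -> product {}", centered, centered.iter().product::<f64>());
}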

fn calc_factor_tmm(
    obs_data: Matrix<f64, Dyn, Const<1>, ViewStorage<'_, f64, Dyn, Const<1>, Const<1>, Dyn>>,
    ref_data: &Matrix<f64, Dyn, Const<1>, ViewStorage<'_, f64, Dyn, Const<1>, Const<1>, Dyn>>,
    n_r: f64,
    n_o: f64,
) -> f64 {
    let mut log_r: Vec<f64> = Vec::with_capacity(obs_data.nrows());
    let mut abs_e: Vec<f64> = Vec::with_capacity(obs_data.nrows());
    let mut v: Vec<f64> = Vec::with_capacity(obs_data.nrows());
    const A_CUTOFF: f64 = -1e10; // Value of constant from R implementation

    let mut max_log_r: f64 = 0.0;
    for i in 0..obs_data.nrows() {
        let obs_f = obs_data[(i, 0)] as f64;
        let ref_f = ref_data[(i, 0)] as f64;
        let obs_n_o = obs_f / n_o;
        let ref_n_r = ref_f / n_r;
        let logr = (obs_n_o / ref_n_r).log2();
        let abse = (obs_n_o.log2() + ref_n_r.log2()) / 2.0;
        if logr != f64::INFINITY && abse != f64::INFINITY && abse > A_CUTOFF {
            log_r.push(logr);
            if logr.abs() > max_log_r {
                max_log_r = logr.abs();
            }
            abs_e.push(abse);
            v.push(((n_o - obs_f) / n_o) / obs_f + ((n_r - ref_f) / n_r) / ref_f);
        }
    }
    //println!("log_r:{:?}", log_r);
    //println!("abs_e:{:?}", abs_e);
    //println!("v:{:?}", v);

    if max_log_r < 1e-6 {
        // Value of constant from R implementation
        1.0
    } else {
        const LOG_RATIO_TRIM: f64 = 0.3; // Value of constant from R implementation
        const SUM_TRIM: f64 = 0.05; // Value of constant from R implementation
        let n = log_r.len() as f64;
        let lo_l = (n * LOG_RATIO_TRIM).floor() + 1.0;
        let hi_l = n + 1.0 - lo_l;
        let lo_s = (n * SUM_TRIM).floor() + 1.0;
        let hi_s = n + 1.0 - lo_s;

        let log_r_log = rank_vector(&log_r);
        let abs_e_log = rank_vector(&abs_e);
        let mut num: f64 = 0.0;
        let mut den: f64 = 0.0;
        for i in 0..log_r.len() {
            if log_r_log[i] >= lo_l
                && log_r_log[i] <= hi_l
                && abs_e_log[i] >= lo_s
                && abs_e_log[i] <= hi_s
            {
                num += log_r[i] / v[i];
                den += 1.0 / v[i];
            }
        }
        f64::powf(2.0, num / den)
    }
}

#[derive(PartialEq, PartialOrd)]
struct RankInput {
    val: f64,
    orig_index: usize,
}

struct RankOutput {
    orig_index: usize,
    rank: f64,
}

fn rank_vector(input_vector: &Vec<f64>) -> Vec<f64> {
    let mut input_vector_sorted: Vec<RankInput> = Vec::with_capacity(input_vector.len());
    for i in 0..input_vector.len() {
        input_vector_sorted.push(RankInput {
            val: input_vector[i],
            orig_index: i,
        })
    }
    input_vector_sorted.sort_by(|a, b| a.val.partial_cmp(&b.val).unwrap());

    let mut ranks: Vec<RankOutput> = Vec::with_capacity(input_vector_sorted.len()); // Stores the rank of each element
    let mut is_repeat = false;
    let mut frac_rank: f64 = 0.0;
    let mut num_repeats: f64 = 1.0;
    let mut repeat_iter: f64 = 1.0;
    for i in 0..input_vector_sorted.len() {
        // Computing ranks
        if is_repeat == false {
            // Check if current element has other occurrences
            num_repeats = 1.0;
            for j in i + 1..input_vector_sorted.len() {
                if input_vector_sorted[i].val == input_vector_sorted[j].val {
                    is_repeat = true;
                    repeat_iter = 1.0;
                    num_repeats += 1.0;
                } else {
                    break;
                }
            }
            //println!("num_repeats:{}", num_repeats);
            if is_repeat == false {
                ranks.push(RankOutput {
                    orig_index: input_vector_sorted[i].orig_index,
                    rank: i as f64 + 1.0,
                });
            } else {
                frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
                ranks.push(RankOutput {
                    orig_index: input_vector_sorted[i].orig_index,
                    rank: frac_rank,
                });
            }
        } else if repeat_iter < num_repeats {
            // Repeat case
            ranks.push(RankOutput {
                orig_index: input_vector_sorted[i].orig_index,
                rank: frac_rank,
            });
            repeat_iter += 1.0;
            if repeat_iter == num_repeats {
                is_repeat = false;
            }
        } else {
            //println!("i:{}", i);
            ranks.push(RankOutput {
                orig_index: input_vector_sorted[i].orig_index,
                rank: i as f64 + 1.0,
            });
            repeat_iter = 1.0;
            num_repeats = 1.0;
        }
    }
    ranks.sort_by(|a, b| a.orig_index.cmp(&b.orig_index));
    let output_vec: Vec<f64> = ranks.into_iter().map(|x| x.rank).collect();
    output_vec
}
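
Aside (illustrative, not part of the package source): rank_vector assigns the average (mid) rank to tied values, the same convention as R's rank() with ties.method = "average". A compact standalone version of the idea:

// Average (fractional) ranks with ties, as used for the TMM trimming above (standalone sketch).
fn midranks(values: &[f64]) -> Vec<f64> {
    let mut order: Vec<usize> = (0..values.len()).collect();
    order.sort_by(|&a, &b| values[a].partial_cmp(&values[b]).unwrap());

    let mut ranks = vec![0.0; values.len()];
    let mut i = 0;
    while i < order.len() {
        // Find the run of equal values starting at sorted position i.
        let mut j = i;
        while j + 1 < order.len() && values[order[j + 1]] == values[order[i]] {
            j += 1;
        }
        // Average of the 1-based ranks i+1 ..= j+1, shared by every member of the run.
        let avg = ((i + 1 + j + 1) as f64) / 2.0;
        for k in i..=j {
            ranks[order[k]] = avg;
        }
        i = j + 1;
    }
    ranks
}

fn main() {
    // The two 2.0 values share rank 2.5; prints [1.0, 2.5, 2.5, 4.0].
    println!("{:?}", midranks(&[1.0, 2.0, 2.0, 5.0]));
}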

fn calc_factor_quantile(
    input_matrix: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
    lib_sizes: &Vec<f64>,
) -> Vec<f64> {
    let mut f = Vec::with_capacity(input_matrix.ncols());
    const P: f64 = 0.75; // Value of constant from R implementation
    for j in 0..input_matrix.ncols() {
        let mut row_vec = Vec::with_capacity(input_matrix.nrows());
        for i in 0..input_matrix.nrows() {
            row_vec.push(input_matrix[(i, j)] as f64);
        }
        //println!("row_vec:{:?}", row_vec);
        let quan = calc_quantile(row_vec, P);
        //println!("quan:{}", quan);
        let num = quan / lib_sizes[j];
        f.push(num);
        //if num == 0.0 {
        //    println!("One or more quantiles are zero");
        //}
    }
    //println!("quantiles:{:?}", f);
    f
}

fn calc_quantile(mut input: Vec<f64>, p: f64) -> f64 {
    let index: f64 = 1.0 + ((input.len() - 1) as f64) * p;
    let lo: f64 = index.floor();
    let hi: f64 = index.ceil();
    input.sort_by(|a, b| a.partial_cmp(&b).unwrap()); // In the R implementation a "partial sort" is carried out, which is supposed to be faster. This might be very slow for a very large number of genes; need to test this with a large number of genes later.
    let qs = input[lo as usize - 1];
    let h: f64 = index - lo;
    let qs_final = (1.0 - h) * qs + h * input[hi as usize - 1];
    qs_final
}
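
Aside (illustrative, not part of the package source): calc_quantile is the type-7 quantile used by R's default quantile(): the fractional index is 1 + (n - 1) * p and the result interpolates linearly between the two neighbouring order statistics. The same rule written with a 0-based index, as a standalone sketch:

// Type-7 quantile (R's default), the same interpolation as calc_quantile above (standalone sketch).
fn quantile_type7(mut xs: Vec<f64>, p: f64) -> f64 {
    xs.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let h = (xs.len() - 1) as f64 * p; // 0-based fractional position
    let lo = h.floor() as usize;
    let hi = h.ceil() as usize;
    xs[lo] + (h - lo as f64) * (xs[hi] - xs[lo])
}

fn main() {
    // h = 3 * 0.75 = 2.25, so the result is 3 + 0.25 * (10 - 3) = 4.75,
    // matching R's quantile(c(1, 2, 3, 10), 0.75).
    println!("{}", quantile_type7(vec![1.0, 2.0, 3.0, 10.0], 0.75));
}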

fn filter_by_expr(
    raw_data: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
    num_diseased: usize,
    num_control: usize,
    gene_names: Vec<String>,
    gene_symbols: Vec<String>,
) -> (
    Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
    Vec<f64>,
    Vec<String>,
    Vec<String>,
) {
    // Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>
    #[allow(non_upper_case_globals)]
    const MIN_COUNT: f64 = 10.0; // Value of constant from R implementation
    const MIN_TOTAL_COUNT: f64 = 15.0; // Value of constant from R implementation
    const LARGE_N: f64 = 10.0; // Value of constant from R implementation
    const MIN_PROP: f64 = 0.7; // Value of constant from R implementation

    let mut min_sample_size;
    if num_control < num_diseased {
        min_sample_size = num_control as f64
    } else {
        min_sample_size = num_diseased as f64
    }
    if min_sample_size == 0.0 {
        panic!("Only one condition present in groups");
    }

    if min_sample_size > LARGE_N {
        min_sample_size = LARGE_N + (min_sample_size - LARGE_N) * MIN_PROP;
    }

    let mut lib_sizes = Vec::<f64>::new();
    let lib_sizes_vector = raw_data.row_sum();
    //println!("lib_sizes_vector:{:?}", lib_sizes_vector);
    for i in 0..lib_sizes_vector.ncols() {
        lib_sizes.push(lib_sizes_vector[(0, i)].into());
    }
    //println!("lib_sizes:{:?}", lib_sizes);
    //println!("min_sample_size:{}", min_sample_size);
    let median_lib_size = Data::new(lib_sizes.clone()).median();
    let cpm_cutoff = (MIN_COUNT / median_lib_size) * 1000000.0;
    //println!("cpm_cutoff:{}", cpm_cutoff);
    let cpm_matrix = cpm(&raw_data);
    const TOL: f64 = 1e-14; // Value of constant from R implementation

    //let mut keep_cpm = Vec::<bool>::new();
    //let mut keep_total = Vec::<bool>::new();
    //let mut positive_cpm: usize = 0;
    //let mut positive_total: usize = 0;
    let mut positives = Vec::<usize>::new();
    let row_sums = raw_data.column_sum();
    for row in 0..cpm_matrix.nrows() {
        let mut trues = 0.0;
        for col in 0..cpm_matrix.ncols() {
            if cpm_matrix[(row, col)] >= cpm_cutoff {
                trues += 1.0;
            }
        }
        let mut keep_cpm_bool = false;
        if trues >= min_sample_size - TOL {
            keep_cpm_bool = true;
            //keep_cpm.push(keep_cpm_bool);
            //positive_cpm += 1;
        }
        //else {
        //    keep_cpm.push(false)
        //}

        let mut keep_total_bool = false;
        if row_sums[(row, 0)] as f64 >= MIN_TOTAL_COUNT - TOL {
            keep_total_bool = true;
            //keep_total.push(keep_total_bool);
            //positive_total += 1;
        }
        //else {
        //    keep_total.push(false)
        //}

        if keep_cpm_bool == true && keep_total_bool == true {
            positives.push(row);
        }
    }
    //println!("row_sums:{:?}", row_sums);
    //println!("keep_cpm:{:?}", keep_cpm);
    //println!("positive_cpm:{}", positive_cpm);
    //println!("negative_cpm:{}", keep_cpm.len() - positive_cpm);
    //println!("keep_total:{:?}", keep_total);
    //println!("positive_total:{}", positive_total);
    //println!("negative_total:{}", keep_total.len() - positive_total);
    let mut blank = Vec::with_capacity(positives.len() * (num_diseased + num_control));
    for _i in 0..positives.len() * (num_diseased + num_control) {
        blank.push(0.0);
    }
    let mut filtered_matrix = DMatrix::from_vec(positives.len(), num_diseased + num_control, blank);
    let mut filtered_genes: Vec<String> = Vec::with_capacity(positives.len());
    let mut filtered_gene_symbols: Vec<String> = Vec::with_capacity(positives.len());
    let mut i = 0;
    for index in positives {
        let row = raw_data.row(index);
        filtered_genes.push(gene_names[index].to_owned());
        filtered_gene_symbols.push(gene_symbols[index].to_owned());
        let mut j = 0;
        for item in &row {
            filtered_matrix[(i, j)] = *item;
            j += 1;
        }
        i += 1
    }

    // Modifying lib sizes with only those rows that have been retained
    let modified_lib_sizes_vector = filtered_matrix.row_sum();
    let mut modified_lib_sizes: Vec<f64> = Vec::with_capacity(modified_lib_sizes_vector.ncols());
    //println!("lib_sizes_vector:{:?}", lib_sizes_vector);
    for i in 0..modified_lib_sizes_vector.ncols() {
        modified_lib_sizes.push(modified_lib_sizes_vector[(0, i)].into());
    }
    //println!("filtered_matrix:{:?}", filtered_matrix);
    (
        filtered_matrix,
        modified_lib_sizes,
        filtered_genes,
        filtered_gene_symbols,
    )
}
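
Aside (illustrative, not part of the package source): the keep/drop rule inside filter_by_expr follows the filterByExpr-style defaults spelled out by the constants above: a gene is kept only when enough samples reach a CPM cutoff derived from MIN_COUNT and the median library size, and its total count reaches MIN_TOTAL_COUNT. A condensed standalone version of the per-gene decision, where cpm_cutoff and min_sample_size are assumed to be precomputed as in the function above:

// Per-gene keep rule in the spirit of the filter above (standalone sketch).
fn keep_gene(cpm_row: &[f64], counts_row: &[f64], cpm_cutoff: f64, min_sample_size: f64) -> bool {
    const MIN_TOTAL_COUNT: f64 = 15.0;
    const TOL: f64 = 1e-14;
    let n_above = cpm_row.iter().filter(|&&c| c >= cpm_cutoff).count() as f64;
    let total: f64 = counts_row.iter().sum();
    n_above >= min_sample_size - TOL && total >= MIN_TOTAL_COUNT - TOL
}

fn main() {
    // Three samples; suppose the smaller group has 3 samples and the CPM cutoff is 5.0.
    let cpm_row = [6.2, 7.9, 5.4];
    let counts_row = [12.0, 15.0, 11.0];
    println!("{}", keep_gene(&cpm_row, &counts_row, 5.0, 3.0)); // true
}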

fn cpm(
    input_matrix: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
) -> Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>> {
    //let mut blank = Vec::<f64>::new();
    let mut blank = Vec::with_capacity(input_matrix.nrows() * input_matrix.ncols());
    for _i in 0..input_matrix.nrows() * input_matrix.ncols() {
        blank.push(0.0);
    }
    let mut output_matrix = DMatrix::from_vec(input_matrix.nrows(), input_matrix.ncols(), blank);
    let column_sums = input_matrix.row_sum();
    for col in 0..input_matrix.ncols() {
        let norm_factor = column_sums[(0, col)];
        for row in 0..input_matrix.nrows() {
            output_matrix[(row, col)] =
                (input_matrix[(row, col)] as f64 * 1000000.0) / norm_factor as f64;
        }
    }
    //println!("output_matrix:{:?}", output_matrix);
    output_matrix
}
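
Aside (illustrative, not part of the package source): cpm rescales each column so the entries are counts per million of that sample's column total. A standalone sketch of the same scaling on plain nested Vecs:

// Counts-per-million by column, the same scaling as the cpm() function above (standalone sketch).
fn cpm_columns(counts: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let ncols = counts[0].len();
    // Column totals (per-sample library sizes).
    let mut col_sums = vec![0.0; ncols];
    for row in counts {
        for (j, v) in row.iter().enumerate() {
            col_sums[j] += v;
        }
    }
    counts
        .iter()
        .map(|row| {
            row.iter()
                .enumerate()
                .map(|(j, v)| v * 1_000_000.0 / col_sums[j])
                .collect()
        })
        .collect()
}

fn main() {
    // Two genes x two samples; each column sums to 100, so every value scales by 10_000.
    let counts = vec![vec![90.0, 10.0], vec![10.0, 90.0]];
    println!("{:?}", cpm_columns(&counts)); // [[900000, 100000], [100000, 900000]]
}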

pub fn wilcoxon_rank_sum_test(
    mut group1: Vec<f64>,
    mut group2: Vec<f64>,
    threshold: usize,
    alternative: char,
    correct: bool,
) -> f64 {
    // Check if there are any ties between the two groups

    let mut combined = group1.clone();
    combined.extend(group2.iter().cloned());
    combined.sort_by(|a, b| a.partial_cmp(b).unwrap());
    //println!("combined:{:?}", combined);

    group1.sort_by(|a, b| a.partial_cmp(b).unwrap());
    group2.sort_by(|a, b| a.partial_cmp(b).unwrap());
    //println!("group1:{:?}", group1);
    //println!("group2:{:?}", group2);

    let mut group1_iter = 0;
    let mut group2_iter = 0;
    let mut xy: Vec<char> = Vec::with_capacity(combined.len()); // Stores X-Y classification
    let mut ranks: Vec<f64> = Vec::with_capacity(combined.len()); // Stores the rank of each element
    let mut is_repeat = false;
    let mut repeat_present = false;
    let mut frac_rank: f64 = 0.0;
    let mut num_repeats: f64 = 1.0;
    let mut repeat_iter: f64 = 1.0;
    #[allow(unused_variables)]
    let mut weight_x: f64 = 0.0;
    let mut weight_y: f64 = 0.0;
    let mut group_char: char = 'X';
    let mut rank_frequencies: Vec<f64> = Vec::with_capacity(combined.len());
    for i in 0..combined.len() {
        //println!("group1_iter:{}", group1_iter);
        //println!("group2_iter:{}", group2_iter);
        //println!("item1:{}", combined[i]);
        //println!("is_repeat:{}", is_repeat);
        if group1_iter < group1.len() && combined[i] == group1[group1_iter] {
            xy.push('X');
            group1_iter += 1;
            group_char = 'X';
        } else if group2_iter < group2.len() && combined[i] == group2[group2_iter] {
            xy.push('Y');
            group2_iter += 1;
            group_char = 'Y';
        }

        // Computing ranks
        if is_repeat == false {
            // Check if current element has other occurrences
            num_repeats = 1.0;
            for j in i + 1..combined.len() {
                if combined[i] == combined[j] {
                    is_repeat = true;
                    repeat_present = true;
                    repeat_iter = 1.0;
                    num_repeats += 1.0;
                } else {
                    break;
                }
            }
            //println!("num_repeats:{}", num_repeats);
            if is_repeat == false {
                ranks.push(i as f64 + 1.0);
                if group_char == 'X' {
                    weight_x += i as f64 + 1.0;
                } else if group_char == 'Y' {
                    weight_y += i as f64 + 1.0;
                }
                //rank_frequencies.push(RankFreq {
                //    rank: i as f64 + 1.0,
                //    freq: 1,
                //});
                rank_frequencies.push(1.0);
            } else {
                frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
                ranks.push(frac_rank);
                if group_char == 'X' {
                    weight_x += frac_rank;
                } else if group_char == 'Y' {
                    weight_y += frac_rank
                }
                //rank_frequencies.push(RankFreq {
                //    rank: frac_rank,
                //    freq: num_repeats as usize,
                //});
                rank_frequencies.push(num_repeats);
            }
        } else if repeat_iter < num_repeats {
            // Repeat case
            ranks.push(frac_rank);
            repeat_iter += 1.0;
            if group_char == 'X' {
                weight_x += frac_rank;
            } else if group_char == 'Y' {
                weight_y += frac_rank
            }
            if repeat_iter == num_repeats {
                is_repeat = false;
            }
        } else {
            //println!("i:{}", i);
            ranks.push(i as f64 + 1.0);
            repeat_iter = 1.0;
            num_repeats = 1.0;
            if group_char == 'X' {
                weight_x += i as f64 + 1.0;
            } else if group_char == 'Y' {
                weight_y += i as f64 + 1.0;
            }
        }
    }
    //println!("rank_frequencies:{:?}", rank_frequencies);
    //println!("xy:{:?}", xy);
    //println!("ranks:{:?}", ranks);
    //println!("weight_x:{}", weight_x);
    //println!("weight_y:{}", weight_y);

    // u_dash (calculated below) is the "W statistic" reported by the wilcox.test function in R

    let u_y = weight_y - (group2.len() as f64 * (group2.len() as f64 + 1.0) / 2.0) as f64;
    let u_dash_y = (u_y - (group1.len() * group2.len()) as f64).abs();
    //println!("u_dash_y:{}", u_dash_y);

    let u_x = weight_x - (group1.len() as f64 * (group1.len() as f64 + 1.0) / 2.0) as f64;
    let _u_dash_x = (u_x - (group1.len() * group2.len()) as f64).abs();
    //println!("u_dash_x:{}", u_dash_x);

    // Calculate test_statistic

    //let t1 = weight_x - ((group1.len() as f64) * (group1.len() as f64 + 1.0)) / 2.0;
    //let t2 = weight_y - ((group2.len() as f64) * (group2.len() as f64 + 1.0)) / 2.0;
    //
    //let mut test_statistic = t1;
    //if t2 < t1 {
    //    test_statistic = t2;
    //}

    //println!("test_statistic:{}", test_statistic);

    if group1.len() < threshold && group2.len() < threshold && repeat_present == false {
        // Compute exact p-values

        // Calculate conditional probability for weight_y

        if alternative == 'g' {
            // Alternative "greater"
            //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
            //    iterate_exact_p_values(ranks, weight_y, group2.len())
            //} else {
            calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
            //}
        } else if alternative == 'l' {
            // Alternative "lesser"
            //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
            //    iterate_exact_p_values(ranks, weight_x, group1.len())
            //} else {
            calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
            //}
        } else {
            // Two-sided distribution
            calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
        }
    } else {
        // Compute p-values from a normal distribution
        //println!("group1 length:{}", group1.len());
        //println!("group2 length:{}", group2.len());

        let mut z = u_dash_y - ((group1.len() * group2.len()) as f64) / 2.0;
        //println!("z_original:{}", z);
        let mut nties_sum: f64 = 0.0;
        for i in 0..rank_frequencies.len() {
            nties_sum += rank_frequencies[i] * rank_frequencies[i] * rank_frequencies[i]
                - rank_frequencies[i];
        }

        let sigma = (((group1.len() * group2.len()) as f64) / 12.0
            * ((group1.len() + group2.len() + 1) as f64
                - nties_sum
                    / (((group1.len() + group2.len()) as f64)
                        * ((group1.len() + group2.len() - 1) as f64))))
            .sqrt();
        //println!("sigma:{}", sigma);
        let mut correction: f64 = 0.0;
        if correct == true {
            if alternative == 'g' {
                // Alternative "greater"
                correction = 0.5;
            } else if alternative == 'l' {
                // Alternative "lesser"
                correction = -0.5;
            } else {
                // Alternative "two-sided"
                if z > 0.0 {
                    correction = 0.5;
                } else if z < 0.0 {
                    correction = -0.5;
                } else {
                    // z=0
                    correction = 0.0;
                }
            }
        }
        z = (z - correction) / sigma;
        //println!("z:{}", z);
        if alternative == 'g' {
            // Alternative "greater"
            //println!("greater:{}", n.cdf(weight_y));
            //1.0 - n.cdf(z) // Applying continuity correction
            r_mathlib::normal_cdf(z, 0.0, 1.0, false, false)
        } else if alternative == 'l' {
            // Alternative "lesser"
            //println!("lesser:{}", n.cdf(weight_x));
            //n.cdf(z) // Applying continuity correction
            r_mathlib::normal_cdf(z, 0.0, 1.0, true, false)
        } else {
            // Alternative "two-sided"
            let p_g = r_mathlib::normal_cdf(z, 0.0, 1.0, false, false); // Applying continuity correction
            let p_l = r_mathlib::normal_cdf(z, 0.0, 1.0, true, false); // Applying continuity correction
            let mut p_value;
            if p_g < p_l {
                p_value = 2.0 * p_g;
            } else {
                p_value = 2.0 * p_l;
            }
            //println!("p_value:{}", p_value);
            if p_value > 1.0 {
                p_value = 1.0;
            }
            p_value
        }
    }
}
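
Aside (illustrative, not part of the package source): in the normal-approximation branch above, the group rank sum is converted to a U-type statistic and then to z = (U - n1*n2/2 ± 0.5) / sigma, with sigma corrected for ties, before r_mathlib::normal_cdf turns z into a p-value. A standalone sketch that computes U and the tie-free z for two small groups; the p-value step is omitted because the standard library has no normal CDF:

// Rank-sum U statistic and its normal-approximation z-score, no ties assumed (standalone sketch).
fn u_and_z(group1: &[f64], group2: &[f64]) -> (f64, f64) {
    let n1 = group1.len() as f64;
    let n2 = group2.len() as f64;
    // Pool and sort; with no ties the ranks are simply 1..n1+n2.
    let mut pooled: Vec<(f64, bool)> = group1.iter().map(|&v| (v, true)).collect();
    pooled.extend(group2.iter().map(|&v| (v, false)));
    pooled.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
    let mut rank_sum_1 = 0.0;
    for (i, (_, is_group1)) in pooled.iter().enumerate() {
        if *is_group1 {
            rank_sum_1 += (i + 1) as f64;
        }
    }
    let u1 = rank_sum_1 - n1 * (n1 + 1.0) / 2.0;
    let sigma = (n1 * n2 * (n1 + n2 + 1.0) / 12.0).sqrt();
    let z = (u1 - n1 * n2 / 2.0) / sigma; // continuity correction omitted for brevity
    (u1, z)
}

fn main() {
    let (u, z) = u_and_z(&[1.2, 3.4, 5.6], &[2.3, 4.5, 6.7, 8.9]);
    println!("U = {}, z = {:.3}", u, z); // U = 3, z = -1.061
}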

// To be used only when there are no ties in the input data
#[allow(dead_code)]
fn calculate_exact_probability(weight: f64, x: usize, y: usize, alternative: char) -> f64 {
    //println!("Using Wilcoxon CDF");
    let mut p_value;
    if alternative == 't' {
        if weight > ((x * y) as f64) / 2.0 {
            p_value = 2.0 * r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
        } else {
            p_value = 2.0 * r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
        }
        if p_value > 1.0 {
            p_value = 1.0;
        }
    } else if alternative == 'g' {
        p_value = r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
    } else if alternative == 'l' {
        p_value = r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
    } else {
        // Should not happen
        panic!("Unknown alternative option given, please check!");
    }
    //println!("p_value:{}", p_value);
    p_value
}

#[allow(dead_code)]
pub fn calculate_frac_rank(current_rank: f64, num_repeats: f64) -> f64 {
    let mut sum = 0.0;
    for i in 0..num_repeats as usize {
        let rank = current_rank + i as f64;
        sum += rank;
    }
    sum / num_repeats
}