@sjcrh/proteinpaint-rust 2.44.0 → 2.55.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +4 -4
- package/package.json +4 -4
- package/src/DEanalysis.rs +4 -276
- package/src/gdcmaf.rs +1 -1
- package/src/stats_functions.rs +270 -270
package/Cargo.toml
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
[package]
|
|
2
|
-
name = "
|
|
2
|
+
name = "PP_rust_utilities"
|
|
3
3
|
version = "0.1.0"
|
|
4
4
|
authors = ["rpaul1 <rpaul1@stjude.org>"]
|
|
5
5
|
edition = "2018"
|
|
@@ -76,9 +76,9 @@ path="src/gdcmaf.rs"
|
|
|
76
76
|
name="topGeneByExpressionVariance"
|
|
77
77
|
path="src/topGeneByExpressionVariance.rs"
|
|
78
78
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
79
|
+
[[bin]]
|
|
80
|
+
name="wilcoxon"
|
|
81
|
+
path="src/wilcoxon.rs"
|
|
82
82
|
|
|
83
83
|
[[bin]]
|
|
84
84
|
name="DEanalysis"
|
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.44.0",
|
|
2
|
+
"version": "2.55.0",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"description": "Rust-based utilities for proteinpaint",
|
|
5
5
|
"main": "index.js",
|
|
@@ -7,9 +7,9 @@
|
|
|
7
7
|
"proteinpaint-rust": "index.js"
|
|
8
8
|
},
|
|
9
9
|
"scripts": {
|
|
10
|
-
"dev": "cargo
|
|
11
|
-
"build": "cargo
|
|
12
|
-
"postinstall": "if [ ! -d ./test ] & [ ! -d ./target/release ]; then cargo
|
|
10
|
+
"dev": "cargo build --release",
|
|
11
|
+
"build": "cargo build --release",
|
|
12
|
+
"postinstall": "if [ ! -d ./test ] & [ ! -d ./target/release ]; then cargo build --release; fi",
|
|
13
13
|
"test": "tape **/test/*.spec.js",
|
|
14
14
|
"test:unit": "tape **/test/*.unit.spec.js",
|
|
15
15
|
"test:integration": "echo 'TODO: rust integration tests'"
|
package/src/DEanalysis.rs
CHANGED
|
@@ -8,7 +8,6 @@ use nalgebra::base::Matrix;
|
|
|
8
8
|
use nalgebra::base::VecStorage;
|
|
9
9
|
use nalgebra::DMatrix;
|
|
10
10
|
use nalgebra::ViewStorage;
|
|
11
|
-
use r_mathlib;
|
|
12
11
|
use serde::{Deserialize, Serialize};
|
|
13
12
|
use serde_json;
|
|
14
13
|
use statrs::statistics::Data;
|
|
@@ -24,7 +23,7 @@ use std::time::Instant;
|
|
|
24
23
|
//use std::cmp::Ordering;
|
|
25
24
|
//use std::env;
|
|
26
25
|
use std::io;
|
|
27
|
-
|
|
26
|
+
mod stats_functions; // Importing Wilcoxon function from stats_functions.rs
|
|
28
27
|
const PAR_CUTOFF: usize = 100000; // Cutoff for triggering multithreading processing of data
|
|
29
28
|
|
|
30
29
|
//const PAR_CUTOFF: usize = 1000000000000000;
|
|
@@ -458,7 +457,7 @@ fn main() {
|
|
|
458
457
|
}
|
|
459
458
|
//println!("treated{:?}", treated);
|
|
460
459
|
//println!("control{:?}", control);
|
|
461
|
-
let p_value = wilcoxon_rank_sum_test(
|
|
460
|
+
let p_value = stats_functions::wilcoxon_rank_sum_test(
|
|
462
461
|
treated.clone(),
|
|
463
462
|
control.clone(),
|
|
464
463
|
THRESHOLD,
|
|
@@ -535,7 +534,7 @@ fn main() {
|
|
|
535
534
|
}
|
|
536
535
|
//println!("treated{:?}", treated);
|
|
537
536
|
//println!("control{:?}", control);
|
|
538
|
-
let p_value = wilcoxon_rank_sum_test(
|
|
537
|
+
let p_value = stats_functions::wilcoxon_rank_sum_test(
|
|
539
538
|
treated.clone(),
|
|
540
539
|
control.clone(),
|
|
541
540
|
THRESHOLD,
|
|
@@ -882,7 +881,7 @@ fn rank_vector(input_vector: &Vec<f64>) -> Vec<f64> {
|
|
|
882
881
|
rank: i as f64 + 1.0,
|
|
883
882
|
});
|
|
884
883
|
} else {
|
|
885
|
-
frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
|
|
884
|
+
frac_rank = stats_functions::calculate_frac_rank(i as f64 + 1.0, num_repeats);
|
|
886
885
|
ranks.push(RankOutput {
|
|
887
886
|
orig_index: input_vector_sorted[i].orig_index,
|
|
888
887
|
rank: frac_rank,
|
|
@@ -1095,274 +1094,3 @@ fn cpm(
|
|
|
1095
1094
|
//println!("output_matrix:{:?}", output_matrix);
|
|
1096
1095
|
output_matrix
|
|
1097
1096
|
}
|
|
1098
|
-
|
|
1099
|
-
pub fn wilcoxon_rank_sum_test(
|
|
1100
|
-
mut group1: Vec<f64>,
|
|
1101
|
-
mut group2: Vec<f64>,
|
|
1102
|
-
threshold: usize,
|
|
1103
|
-
alternative: char,
|
|
1104
|
-
correct: bool,
|
|
1105
|
-
) -> f64 {
|
|
1106
|
-
// Check if there are any ties between the two groups
|
|
1107
|
-
|
|
1108
|
-
let mut combined = group1.clone();
|
|
1109
|
-
combined.extend(group2.iter().cloned());
|
|
1110
|
-
combined.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
1111
|
-
//println!("combined:{:?}", combined);
|
|
1112
|
-
|
|
1113
|
-
group1.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
1114
|
-
group2.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
1115
|
-
//println!("group1:{:?}", group1);
|
|
1116
|
-
//println!("group2:{:?}", group2);
|
|
1117
|
-
|
|
1118
|
-
let mut group1_iter = 0;
|
|
1119
|
-
let mut group2_iter = 0;
|
|
1120
|
-
let mut xy: Vec<char> = Vec::with_capacity(combined.len()); // Stores X-Y classification
|
|
1121
|
-
let mut ranks: Vec<f64> = Vec::with_capacity(combined.len()); // Stores the rank of each element
|
|
1122
|
-
let mut is_repeat = false;
|
|
1123
|
-
let mut repeat_present = false;
|
|
1124
|
-
let mut frac_rank: f64 = 0.0;
|
|
1125
|
-
let mut num_repeats: f64 = 1.0;
|
|
1126
|
-
let mut repeat_iter: f64 = 1.0;
|
|
1127
|
-
#[allow(unused_variables)]
|
|
1128
|
-
let mut weight_x: f64 = 0.0;
|
|
1129
|
-
let mut weight_y: f64 = 0.0;
|
|
1130
|
-
let mut group_char: char = 'X';
|
|
1131
|
-
let mut rank_frequencies: Vec<f64> = Vec::with_capacity(combined.len());
|
|
1132
|
-
for i in 0..combined.len() {
|
|
1133
|
-
//println!("group1_iter:{}", group1_iter);
|
|
1134
|
-
//println!("group2_iter:{}", group2_iter);
|
|
1135
|
-
//println!("item1:{}", combined[i]);
|
|
1136
|
-
//println!("is_repeat:{}", is_repeat);
|
|
1137
|
-
if group1_iter < group1.len() && combined[i] == group1[group1_iter] {
|
|
1138
|
-
xy.push('X');
|
|
1139
|
-
group1_iter += 1;
|
|
1140
|
-
group_char = 'X';
|
|
1141
|
-
} else if group2_iter < group2.len() && combined[i] == group2[group2_iter] {
|
|
1142
|
-
xy.push('Y');
|
|
1143
|
-
group2_iter += 1;
|
|
1144
|
-
group_char = 'Y';
|
|
1145
|
-
}
|
|
1146
|
-
|
|
1147
|
-
// Computing ranks
|
|
1148
|
-
if is_repeat == false {
|
|
1149
|
-
// Check if current element has other occurences
|
|
1150
|
-
num_repeats = 1.0;
|
|
1151
|
-
for j in i + 1..combined.len() {
|
|
1152
|
-
if combined[i] == combined[j] {
|
|
1153
|
-
is_repeat = true;
|
|
1154
|
-
repeat_present = true;
|
|
1155
|
-
repeat_iter = 1.0;
|
|
1156
|
-
num_repeats += 1.0;
|
|
1157
|
-
} else {
|
|
1158
|
-
break;
|
|
1159
|
-
}
|
|
1160
|
-
}
|
|
1161
|
-
//println!("num_repeats:{}", num_repeats);
|
|
1162
|
-
if is_repeat == false {
|
|
1163
|
-
ranks.push(i as f64 + 1.0);
|
|
1164
|
-
if group_char == 'X' {
|
|
1165
|
-
weight_x += i as f64 + 1.0;
|
|
1166
|
-
} else if group_char == 'Y' {
|
|
1167
|
-
weight_y += i as f64 + 1.0;
|
|
1168
|
-
}
|
|
1169
|
-
//rank_frequencies.push(RankFreq {
|
|
1170
|
-
// rank: i as f64 + 1.0,
|
|
1171
|
-
// freq: 1,
|
|
1172
|
-
//});
|
|
1173
|
-
rank_frequencies.push(1.0);
|
|
1174
|
-
} else {
|
|
1175
|
-
frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
|
|
1176
|
-
ranks.push(frac_rank);
|
|
1177
|
-
if group_char == 'X' {
|
|
1178
|
-
weight_x += frac_rank;
|
|
1179
|
-
} else if group_char == 'Y' {
|
|
1180
|
-
weight_y += frac_rank
|
|
1181
|
-
}
|
|
1182
|
-
//rank_frequencies.push(RankFreq {
|
|
1183
|
-
// rank: frac_rank,
|
|
1184
|
-
// freq: num_repeats as usize,
|
|
1185
|
-
//});
|
|
1186
|
-
rank_frequencies.push(num_repeats);
|
|
1187
|
-
}
|
|
1188
|
-
} else if repeat_iter < num_repeats {
|
|
1189
|
-
// Repeat case
|
|
1190
|
-
ranks.push(frac_rank);
|
|
1191
|
-
repeat_iter += 1.0;
|
|
1192
|
-
if group_char == 'X' {
|
|
1193
|
-
weight_x += frac_rank;
|
|
1194
|
-
} else if group_char == 'Y' {
|
|
1195
|
-
weight_y += frac_rank
|
|
1196
|
-
}
|
|
1197
|
-
if repeat_iter == num_repeats {
|
|
1198
|
-
is_repeat = false;
|
|
1199
|
-
}
|
|
1200
|
-
} else {
|
|
1201
|
-
//println!("i:{}", i);
|
|
1202
|
-
ranks.push(i as f64 + 1.0);
|
|
1203
|
-
repeat_iter = 1.0;
|
|
1204
|
-
num_repeats = 1.0;
|
|
1205
|
-
if group_char == 'X' {
|
|
1206
|
-
weight_x += i as f64 + 1.0;
|
|
1207
|
-
} else if group_char == 'Y' {
|
|
1208
|
-
weight_y += i as f64 + 1.0;
|
|
1209
|
-
}
|
|
1210
|
-
}
|
|
1211
|
-
}
|
|
1212
|
-
//println!("rank_frequencies:{:?}", rank_frequencies);
|
|
1213
|
-
//println!("xy:{:?}", xy);
|
|
1214
|
-
//println!("ranks:{:?}", ranks);
|
|
1215
|
-
//println!("weight_x:{}", weight_x);
|
|
1216
|
-
//println!("weight_y:{}", weight_y);
|
|
1217
|
-
|
|
1218
|
-
//u_dash (calculated below) calculates the "W Statistic" in wilcox.test function in R
|
|
1219
|
-
|
|
1220
|
-
let u_y = weight_y - (group2.len() as f64 * (group2.len() as f64 + 1.0) / 2.0) as f64;
|
|
1221
|
-
let u_dash_y = (u_y - (group1.len() * group2.len()) as f64).abs();
|
|
1222
|
-
//println!("u_dash_y:{}", u_dash_y);
|
|
1223
|
-
|
|
1224
|
-
let u_x = weight_x - (group1.len() as f64 * (group1.len() as f64 + 1.0) / 2.0) as f64;
|
|
1225
|
-
let _u_dash_x = (u_x - (group1.len() * group2.len()) as f64).abs();
|
|
1226
|
-
//println!("u_dash_x:{}", u_dash_x);
|
|
1227
|
-
|
|
1228
|
-
// Calculate test_statistic
|
|
1229
|
-
|
|
1230
|
-
//let t1 = weight_x - ((group1.len() as f64) * (group1.len() as f64 + 1.0)) / 2.0;
|
|
1231
|
-
//let t2 = weight_y - ((group2.len() as f64) * (group2.len() as f64 + 1.0)) / 2.0;
|
|
1232
|
-
//
|
|
1233
|
-
//let mut test_statistic = t1;
|
|
1234
|
-
//if t2 < t1 {
|
|
1235
|
-
// test_statistic = t2;
|
|
1236
|
-
//}
|
|
1237
|
-
|
|
1238
|
-
//println!("test_statistic:{}", test_statistic);
|
|
1239
|
-
|
|
1240
|
-
if group1.len() < threshold && group2.len() < threshold && repeat_present == false {
|
|
1241
|
-
// Compute exact p-values
|
|
1242
|
-
|
|
1243
|
-
// Calculate conditional probability for weight_y
|
|
1244
|
-
|
|
1245
|
-
if alternative == 'g' {
|
|
1246
|
-
// Alternative "greater"
|
|
1247
|
-
//if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
|
|
1248
|
-
// iterate_exact_p_values(ranks, weight_y, group2.len())
|
|
1249
|
-
//} else {
|
|
1250
|
-
calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
|
|
1251
|
-
//}
|
|
1252
|
-
} else if alternative == 'l' {
|
|
1253
|
-
// Alternative "lesser"
|
|
1254
|
-
//if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
|
|
1255
|
-
// iterate_exact_p_values(ranks, weight_x, group1.len())
|
|
1256
|
-
//} else {
|
|
1257
|
-
calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
|
|
1258
|
-
//}
|
|
1259
|
-
} else {
|
|
1260
|
-
// Two-sided distribution
|
|
1261
|
-
calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
|
|
1262
|
-
}
|
|
1263
|
-
} else {
|
|
1264
|
-
// Compute p-values from a normal distribution
|
|
1265
|
-
//println!("group1 length:{}", group1.len());
|
|
1266
|
-
//println!("group2 length:{}", group2.len());
|
|
1267
|
-
|
|
1268
|
-
let mut z = u_dash_y - ((group1.len() * group2.len()) as f64) / 2.0;
|
|
1269
|
-
//println!("z_original:{}", z);
|
|
1270
|
-
let mut nties_sum: f64 = 0.0;
|
|
1271
|
-
for i in 0..rank_frequencies.len() {
|
|
1272
|
-
nties_sum += rank_frequencies[i] * rank_frequencies[i] * rank_frequencies[i]
|
|
1273
|
-
- rank_frequencies[i];
|
|
1274
|
-
}
|
|
1275
|
-
|
|
1276
|
-
let sigma = (((group1.len() * group2.len()) as f64) / 12.0
|
|
1277
|
-
* ((group1.len() + group2.len() + 1) as f64
|
|
1278
|
-
- nties_sum
|
|
1279
|
-
/ (((group1.len() + group2.len()) as f64)
|
|
1280
|
-
* ((group1.len() + group2.len() - 1) as f64))))
|
|
1281
|
-
.sqrt();
|
|
1282
|
-
//println!("sigma:{}", sigma);
|
|
1283
|
-
let mut correction: f64 = 0.0;
|
|
1284
|
-
if correct == true {
|
|
1285
|
-
if alternative == 'g' {
|
|
1286
|
-
// Alternative "greater"
|
|
1287
|
-
correction = 0.5;
|
|
1288
|
-
} else if alternative == 'l' {
|
|
1289
|
-
// Alternative "lesser"
|
|
1290
|
-
correction = -0.5;
|
|
1291
|
-
} else {
|
|
1292
|
-
// Alternative "two-sided"
|
|
1293
|
-
if z > 0.0 {
|
|
1294
|
-
correction = 0.5;
|
|
1295
|
-
} else if z < 0.0 {
|
|
1296
|
-
correction = -0.5;
|
|
1297
|
-
} else {
|
|
1298
|
-
// z=0
|
|
1299
|
-
correction = 0.0;
|
|
1300
|
-
}
|
|
1301
|
-
}
|
|
1302
|
-
}
|
|
1303
|
-
z = (z - correction) / sigma;
|
|
1304
|
-
//println!("z:{}", z);
|
|
1305
|
-
if alternative == 'g' {
|
|
1306
|
-
// Alternative "greater"
|
|
1307
|
-
//println!("greater:{}", n.cdf(weight_y));
|
|
1308
|
-
//1.0 - n.cdf(z) // Applying continuity correction
|
|
1309
|
-
r_mathlib::normal_cdf(z, 0.0, 1.0, false, false)
|
|
1310
|
-
} else if alternative == 'l' {
|
|
1311
|
-
// Alternative "lesser"
|
|
1312
|
-
//println!("lesser:{}", n.cdf(weight_x));
|
|
1313
|
-
//n.cdf(z) // Applying continuity coorection
|
|
1314
|
-
r_mathlib::normal_cdf(z, 0.0, 1.0, true, false)
|
|
1315
|
-
} else {
|
|
1316
|
-
// Alternative "two-sided"
|
|
1317
|
-
let p_g = r_mathlib::normal_cdf(z, 0.0, 1.0, false, false); // Applying continuity correction
|
|
1318
|
-
let p_l = r_mathlib::normal_cdf(z, 0.0, 1.0, true, false); // Applying continuity correction
|
|
1319
|
-
let mut p_value;
|
|
1320
|
-
if p_g < p_l {
|
|
1321
|
-
p_value = 2.0 * p_g;
|
|
1322
|
-
} else {
|
|
1323
|
-
p_value = 2.0 * p_l;
|
|
1324
|
-
}
|
|
1325
|
-
//println!("p_value:{}", p_value);
|
|
1326
|
-
if p_value > 1.0 {
|
|
1327
|
-
p_value = 1.0;
|
|
1328
|
-
}
|
|
1329
|
-
p_value
|
|
1330
|
-
}
|
|
1331
|
-
}
|
|
1332
|
-
}
|
|
1333
|
-
|
|
1334
|
-
// To be used only when there are no ties in the input data
|
|
1335
|
-
#[allow(dead_code)]
|
|
1336
|
-
fn calculate_exact_probability(weight: f64, x: usize, y: usize, alternative: char) -> f64 {
|
|
1337
|
-
//println!("Using Wilcoxon CDF");
|
|
1338
|
-
let mut p_value;
|
|
1339
|
-
if alternative == 't' {
|
|
1340
|
-
if weight > ((x * y) as f64) / 2.0 {
|
|
1341
|
-
p_value = 2.0 * r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
|
|
1342
|
-
} else {
|
|
1343
|
-
p_value = 2.0 * r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
|
|
1344
|
-
}
|
|
1345
|
-
if p_value > 1.0 {
|
|
1346
|
-
p_value = 1.0;
|
|
1347
|
-
}
|
|
1348
|
-
} else if alternative == 'g' {
|
|
1349
|
-
p_value = r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
|
|
1350
|
-
} else if alternative == 'l' {
|
|
1351
|
-
p_value = r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
|
|
1352
|
-
} else {
|
|
1353
|
-
// Should not happen
|
|
1354
|
-
panic!("Unknown alternative option given, please check!");
|
|
1355
|
-
}
|
|
1356
|
-
//println!("p_value:{}", p_value);
|
|
1357
|
-
p_value
|
|
1358
|
-
}
|
|
1359
|
-
|
|
1360
|
-
#[allow(dead_code)]
|
|
1361
|
-
pub fn calculate_frac_rank(current_rank: f64, num_repeats: f64) -> f64 {
|
|
1362
|
-
let mut sum = 0.0;
|
|
1363
|
-
for i in 0..num_repeats as usize {
|
|
1364
|
-
let rank = current_rank + i as f64;
|
|
1365
|
-
sum += rank;
|
|
1366
|
-
}
|
|
1367
|
-
sum / num_repeats
|
|
1368
|
-
}
|
package/src/gdcmaf.rs
CHANGED
|
@@ -42,8 +42,8 @@ fn select_maf_col(d:String,columns:&Vec<String>) -> Vec<u8> {
|
|
|
42
42
|
for x in header_indices.iter() {
|
|
43
43
|
maf_out_lst.push(maf_cont_lst[*x].to_string());
|
|
44
44
|
};
|
|
45
|
-
maf_out_lst.push("\n".to_string());
|
|
46
45
|
maf_str.push_str(maf_out_lst.join("\t").as_str());
|
|
46
|
+
maf_str.push_str("\n");
|
|
47
47
|
}
|
|
48
48
|
};
|
|
49
49
|
maf_str.as_bytes().to_vec()
|
package/src/stats_functions.rs
CHANGED
|
@@ -141,274 +141,274 @@ fn chi_square_test(
|
|
|
141
141
|
}
|
|
142
142
|
}
|
|
143
143
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
//
|
|
322
|
-
// let sigma = (((group1.len() * group2.len()) as f64) / 12.0
|
|
323
|
-
// * ((group1.len() + group2.len() + 1) as f64
|
|
324
|
-
// - nties_sum
|
|
325
|
-
// / (((group1.len() + group2.len()) as f64)
|
|
326
|
-
// * ((group1.len() + group2.len() - 1) as f64))))
|
|
327
|
-
// .sqrt();
|
|
328
|
-
// //println!("sigma:{}", sigma);
|
|
329
|
-
// let mut correction: f64 = 0.0;
|
|
330
|
-
// if correct == true {
|
|
331
|
-
// if alternative == 'g' {
|
|
332
|
-
// // Alternative "greater"
|
|
333
|
-
// correction = 0.5;
|
|
334
|
-
// } else if alternative == 'l' {
|
|
335
|
-
// // Alternative "lesser"
|
|
336
|
-
// correction = -0.5;
|
|
337
|
-
// } else {
|
|
338
|
-
// // Alternative "two-sided"
|
|
339
|
-
// if z > 0.0 {
|
|
340
|
-
// correction = 0.5;
|
|
341
|
-
// } else if z < 0.0 {
|
|
342
|
-
// correction = -0.5;
|
|
343
|
-
// } else {
|
|
344
|
-
// // z=0
|
|
345
|
-
// correction = 0.0;
|
|
346
|
-
// }
|
|
347
|
-
// }
|
|
348
|
-
// }
|
|
349
|
-
// z = (z - correction) / sigma;
|
|
350
|
-
// //println!("z:{}", z);
|
|
351
|
-
// if alternative == 'g' {
|
|
352
|
-
// // Alternative "greater"
|
|
353
|
-
// //println!("greater:{}", n.cdf(weight_y));
|
|
354
|
-
// //1.0 - n.cdf(z) // Applying continuity correction
|
|
355
|
-
// r_mathlib::normal_cdf(z, 0.0, 1.0, false, false)
|
|
356
|
-
// } else if alternative == 'l' {
|
|
357
|
-
// // Alternative "lesser"
|
|
358
|
-
// //println!("lesser:{}", n.cdf(weight_x));
|
|
359
|
-
// //n.cdf(z) // Applying continuity coorection
|
|
360
|
-
// r_mathlib::normal_cdf(z, 0.0, 1.0, true, false)
|
|
361
|
-
// } else {
|
|
362
|
-
// // Alternative "two-sided"
|
|
363
|
-
// let p_g = r_mathlib::normal_cdf(z, 0.0, 1.0, false, false); // Applying continuity correction
|
|
364
|
-
// let p_l = r_mathlib::normal_cdf(z, 0.0, 1.0, true, false); // Applying continuity correction
|
|
365
|
-
// let mut p_value;
|
|
366
|
-
// if p_g < p_l {
|
|
367
|
-
// p_value = 2.0 * p_g;
|
|
368
|
-
// } else {
|
|
369
|
-
// p_value = 2.0 * p_l;
|
|
370
|
-
// }
|
|
371
|
-
// //println!("p_value:{}", p_value);
|
|
372
|
-
// if p_value > 1.0 {
|
|
373
|
-
// p_value = 1.0;
|
|
374
|
-
// }
|
|
375
|
-
// p_value
|
|
376
|
-
// }
|
|
377
|
-
// }
|
|
378
|
-
//}
|
|
144
|
+
#[allow(dead_code)]
|
|
145
|
+
pub fn wilcoxon_rank_sum_test(
|
|
146
|
+
mut group1: Vec<f64>,
|
|
147
|
+
mut group2: Vec<f64>,
|
|
148
|
+
threshold: usize,
|
|
149
|
+
alternative: char,
|
|
150
|
+
correct: bool,
|
|
151
|
+
) -> f64 {
|
|
152
|
+
// Check if there are any ties between the two groups
|
|
153
|
+
|
|
154
|
+
let mut combined = group1.clone();
|
|
155
|
+
combined.extend(group2.iter().cloned());
|
|
156
|
+
combined.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
157
|
+
//println!("combined:{:?}", combined);
|
|
158
|
+
|
|
159
|
+
group1.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
160
|
+
group2.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
161
|
+
//println!("group1:{:?}", group1);
|
|
162
|
+
//println!("group2:{:?}", group2);
|
|
163
|
+
|
|
164
|
+
let mut group1_iter = 0;
|
|
165
|
+
let mut group2_iter = 0;
|
|
166
|
+
let mut xy: Vec<char> = Vec::with_capacity(combined.len()); // Stores X-Y classification
|
|
167
|
+
let mut ranks: Vec<f64> = Vec::with_capacity(combined.len()); // Stores the rank of each element
|
|
168
|
+
let mut is_repeat = false;
|
|
169
|
+
let mut repeat_present = false;
|
|
170
|
+
let mut frac_rank: f64 = 0.0;
|
|
171
|
+
let mut num_repeats: f64 = 1.0;
|
|
172
|
+
let mut repeat_iter: f64 = 1.0;
|
|
173
|
+
#[allow(unused_variables)]
|
|
174
|
+
let mut weight_x: f64 = 0.0;
|
|
175
|
+
let mut weight_y: f64 = 0.0;
|
|
176
|
+
let mut group_char: char = 'X';
|
|
177
|
+
let mut rank_frequencies: Vec<f64> = Vec::with_capacity(combined.len());
|
|
178
|
+
for i in 0..combined.len() {
|
|
179
|
+
//println!("group1_iter:{}", group1_iter);
|
|
180
|
+
//println!("group2_iter:{}", group2_iter);
|
|
181
|
+
//println!("item1:{}", combined[i]);
|
|
182
|
+
//println!("is_repeat:{}", is_repeat);
|
|
183
|
+
if group1_iter < group1.len() && combined[i] == group1[group1_iter] {
|
|
184
|
+
xy.push('X');
|
|
185
|
+
group1_iter += 1;
|
|
186
|
+
group_char = 'X';
|
|
187
|
+
} else if group2_iter < group2.len() && combined[i] == group2[group2_iter] {
|
|
188
|
+
xy.push('Y');
|
|
189
|
+
group2_iter += 1;
|
|
190
|
+
group_char = 'Y';
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// Computing ranks
|
|
194
|
+
if is_repeat == false {
|
|
195
|
+
// Check if current element has other occurences
|
|
196
|
+
num_repeats = 1.0;
|
|
197
|
+
for j in i + 1..combined.len() {
|
|
198
|
+
if combined[i] == combined[j] {
|
|
199
|
+
is_repeat = true;
|
|
200
|
+
repeat_present = true;
|
|
201
|
+
repeat_iter = 1.0;
|
|
202
|
+
num_repeats += 1.0;
|
|
203
|
+
} else {
|
|
204
|
+
break;
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
//println!("num_repeats:{}", num_repeats);
|
|
208
|
+
if is_repeat == false {
|
|
209
|
+
ranks.push(i as f64 + 1.0);
|
|
210
|
+
if group_char == 'X' {
|
|
211
|
+
weight_x += i as f64 + 1.0;
|
|
212
|
+
} else if group_char == 'Y' {
|
|
213
|
+
weight_y += i as f64 + 1.0;
|
|
214
|
+
}
|
|
215
|
+
//rank_frequencies.push(RankFreq {
|
|
216
|
+
// rank: i as f64 + 1.0,
|
|
217
|
+
// freq: 1,
|
|
218
|
+
//});
|
|
219
|
+
rank_frequencies.push(1.0);
|
|
220
|
+
} else {
|
|
221
|
+
frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
|
|
222
|
+
ranks.push(frac_rank);
|
|
223
|
+
if group_char == 'X' {
|
|
224
|
+
weight_x += frac_rank;
|
|
225
|
+
} else if group_char == 'Y' {
|
|
226
|
+
weight_y += frac_rank
|
|
227
|
+
}
|
|
228
|
+
//rank_frequencies.push(RankFreq {
|
|
229
|
+
// rank: frac_rank,
|
|
230
|
+
// freq: num_repeats as usize,
|
|
231
|
+
//});
|
|
232
|
+
rank_frequencies.push(num_repeats);
|
|
233
|
+
}
|
|
234
|
+
} else if repeat_iter < num_repeats {
|
|
235
|
+
// Repeat case
|
|
236
|
+
ranks.push(frac_rank);
|
|
237
|
+
repeat_iter += 1.0;
|
|
238
|
+
if group_char == 'X' {
|
|
239
|
+
weight_x += frac_rank;
|
|
240
|
+
} else if group_char == 'Y' {
|
|
241
|
+
weight_y += frac_rank
|
|
242
|
+
}
|
|
243
|
+
if repeat_iter == num_repeats {
|
|
244
|
+
is_repeat = false;
|
|
245
|
+
}
|
|
246
|
+
} else {
|
|
247
|
+
//println!("i:{}", i);
|
|
248
|
+
ranks.push(i as f64 + 1.0);
|
|
249
|
+
repeat_iter = 1.0;
|
|
250
|
+
num_repeats = 1.0;
|
|
251
|
+
if group_char == 'X' {
|
|
252
|
+
weight_x += i as f64 + 1.0;
|
|
253
|
+
} else if group_char == 'Y' {
|
|
254
|
+
weight_y += i as f64 + 1.0;
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
//println!("rank_frequencies:{:?}", rank_frequencies);
|
|
259
|
+
//println!("xy:{:?}", xy);
|
|
260
|
+
//println!("ranks:{:?}", ranks);
|
|
261
|
+
//println!("weight_x:{}", weight_x);
|
|
262
|
+
//println!("weight_y:{}", weight_y);
|
|
263
|
+
|
|
264
|
+
//u_dash (calculated below) calculates the "W Statistic" in wilcox.test function in R
|
|
265
|
+
|
|
266
|
+
let u_y = weight_y - (group2.len() as f64 * (group2.len() as f64 + 1.0) / 2.0) as f64;
|
|
267
|
+
let u_dash_y = (u_y - (group1.len() * group2.len()) as f64).abs();
|
|
268
|
+
//println!("u_dash_y:{}", u_dash_y);
|
|
269
|
+
|
|
270
|
+
let u_x = weight_x - (group1.len() as f64 * (group1.len() as f64 + 1.0) / 2.0) as f64;
|
|
271
|
+
let _u_dash_x = (u_x - (group1.len() * group2.len()) as f64).abs();
|
|
272
|
+
//println!("u_dash_x:{}", u_dash_x);
|
|
273
|
+
|
|
274
|
+
// Calculate test_statistic
|
|
275
|
+
|
|
276
|
+
//let t1 = weight_x - ((group1.len() as f64) * (group1.len() as f64 + 1.0)) / 2.0;
|
|
277
|
+
//let t2 = weight_y - ((group2.len() as f64) * (group2.len() as f64 + 1.0)) / 2.0;
|
|
278
|
+
//
|
|
279
|
+
//let mut test_statistic = t1;
|
|
280
|
+
//if t2 < t1 {
|
|
281
|
+
// test_statistic = t2;
|
|
282
|
+
//}
|
|
283
|
+
|
|
284
|
+
//println!("test_statistic:{}", test_statistic);
|
|
285
|
+
|
|
286
|
+
if group1.len() < threshold && group2.len() < threshold && repeat_present == false {
|
|
287
|
+
// Compute exact p-values
|
|
288
|
+
|
|
289
|
+
// Calculate conditional probability for weight_y
|
|
290
|
+
|
|
291
|
+
if alternative == 'g' {
|
|
292
|
+
// Alternative "greater"
|
|
293
|
+
//if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
|
|
294
|
+
// iterate_exact_p_values(ranks, weight_y, group2.len())
|
|
295
|
+
//} else {
|
|
296
|
+
calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
|
|
297
|
+
//}
|
|
298
|
+
} else if alternative == 'l' {
|
|
299
|
+
// Alternative "lesser"
|
|
300
|
+
//if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
|
|
301
|
+
// iterate_exact_p_values(ranks, weight_x, group1.len())
|
|
302
|
+
//} else {
|
|
303
|
+
calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
|
|
304
|
+
//}
|
|
305
|
+
} else {
|
|
306
|
+
// Two-sided distribution
|
|
307
|
+
calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
|
|
308
|
+
}
|
|
309
|
+
} else {
|
|
310
|
+
// Compute p-values from a normal distribution
|
|
311
|
+
//println!("group1 length:{}", group1.len());
|
|
312
|
+
//println!("group2 length:{}", group2.len());
|
|
313
|
+
|
|
314
|
+
let mut z = u_dash_y - ((group1.len() * group2.len()) as f64) / 2.0;
|
|
315
|
+
//println!("z_original:{}", z);
|
|
316
|
+
let mut nties_sum: f64 = 0.0;
|
|
317
|
+
for i in 0..rank_frequencies.len() {
|
|
318
|
+
nties_sum += rank_frequencies[i] * rank_frequencies[i] * rank_frequencies[i]
|
|
319
|
+
- rank_frequencies[i];
|
|
320
|
+
}
|
|
379
321
|
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
//
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
//
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
//
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
//
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
//
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
//
|
|
409
|
-
|
|
410
|
-
//
|
|
411
|
-
//
|
|
412
|
-
//
|
|
413
|
-
|
|
414
|
-
|
|
322
|
+
let sigma = (((group1.len() * group2.len()) as f64) / 12.0
|
|
323
|
+
* ((group1.len() + group2.len() + 1) as f64
|
|
324
|
+
- nties_sum
|
|
325
|
+
/ (((group1.len() + group2.len()) as f64)
|
|
326
|
+
* ((group1.len() + group2.len() - 1) as f64))))
|
|
327
|
+
.sqrt();
|
|
328
|
+
//println!("sigma:{}", sigma);
|
|
329
|
+
let mut correction: f64 = 0.0;
|
|
330
|
+
if correct == true {
|
|
331
|
+
if alternative == 'g' {
|
|
332
|
+
// Alternative "greater"
|
|
333
|
+
correction = 0.5;
|
|
334
|
+
} else if alternative == 'l' {
|
|
335
|
+
// Alternative "lesser"
|
|
336
|
+
correction = -0.5;
|
|
337
|
+
} else {
|
|
338
|
+
// Alternative "two-sided"
|
|
339
|
+
if z > 0.0 {
|
|
340
|
+
correction = 0.5;
|
|
341
|
+
} else if z < 0.0 {
|
|
342
|
+
correction = -0.5;
|
|
343
|
+
} else {
|
|
344
|
+
// z=0
|
|
345
|
+
correction = 0.0;
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
}
|
|
349
|
+
z = (z - correction) / sigma;
|
|
350
|
+
//println!("z:{}", z);
|
|
351
|
+
if alternative == 'g' {
|
|
352
|
+
// Alternative "greater"
|
|
353
|
+
//println!("greater:{}", n.cdf(weight_y));
|
|
354
|
+
//1.0 - n.cdf(z) // Applying continuity correction
|
|
355
|
+
r_mathlib::normal_cdf(z, 0.0, 1.0, false, false)
|
|
356
|
+
} else if alternative == 'l' {
|
|
357
|
+
// Alternative "lesser"
|
|
358
|
+
//println!("lesser:{}", n.cdf(weight_x));
|
|
359
|
+
//n.cdf(z) // Applying continuity coorection
|
|
360
|
+
r_mathlib::normal_cdf(z, 0.0, 1.0, true, false)
|
|
361
|
+
} else {
|
|
362
|
+
// Alternative "two-sided"
|
|
363
|
+
let p_g = r_mathlib::normal_cdf(z, 0.0, 1.0, false, false); // Applying continuity correction
|
|
364
|
+
let p_l = r_mathlib::normal_cdf(z, 0.0, 1.0, true, false); // Applying continuity correction
|
|
365
|
+
let mut p_value;
|
|
366
|
+
if p_g < p_l {
|
|
367
|
+
p_value = 2.0 * p_g;
|
|
368
|
+
} else {
|
|
369
|
+
p_value = 2.0 * p_l;
|
|
370
|
+
}
|
|
371
|
+
//println!("p_value:{}", p_value);
|
|
372
|
+
if p_value > 1.0 {
|
|
373
|
+
p_value = 1.0;
|
|
374
|
+
}
|
|
375
|
+
p_value
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
// To be used only when there are no ties in the input data
|
|
381
|
+
#[allow(dead_code)]
|
|
382
|
+
fn calculate_exact_probability(weight: f64, x: usize, y: usize, alternative: char) -> f64 {
|
|
383
|
+
//println!("Using Wilcoxon CDF");
|
|
384
|
+
let mut p_value;
|
|
385
|
+
if alternative == 't' {
|
|
386
|
+
if weight > ((x * y) as f64) / 2.0 {
|
|
387
|
+
p_value = 2.0 * r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
|
|
388
|
+
} else {
|
|
389
|
+
p_value = 2.0 * r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
|
|
390
|
+
}
|
|
391
|
+
if p_value > 1.0 {
|
|
392
|
+
p_value = 1.0;
|
|
393
|
+
}
|
|
394
|
+
} else if alternative == 'g' {
|
|
395
|
+
p_value = r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
|
|
396
|
+
} else if alternative == 'l' {
|
|
397
|
+
p_value = r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
|
|
398
|
+
} else {
|
|
399
|
+
// Should not happen
|
|
400
|
+
panic!("Unknown alternative option given, please check!");
|
|
401
|
+
}
|
|
402
|
+
//println!("p_value:{}", p_value);
|
|
403
|
+
p_value
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
#[allow(dead_code)]
|
|
407
|
+
pub fn calculate_frac_rank(current_rank: f64, num_repeats: f64) -> f64 {
|
|
408
|
+
let mut sum = 0.0;
|
|
409
|
+
for i in 0..num_repeats as usize {
|
|
410
|
+
let rank = current_rank + i as f64;
|
|
411
|
+
sum += rank;
|
|
412
|
+
}
|
|
413
|
+
sum / num_repeats
|
|
414
|
+
}
|