@sjcrh/proteinpaint-rust 2.44.0 → 2.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -1,5 +1,5 @@
1
1
  [package]
2
- name = "bam"
2
+ name = "PP_rust_utilities"
3
3
  version = "0.1.0"
4
4
  authors = ["rpaul1 <rpaul1@stjude.org>"]
5
5
  edition = "2018"
@@ -76,9 +76,9 @@ path="src/gdcmaf.rs"
76
76
  name="topGeneByExpressionVariance"
77
77
  path="src/topGeneByExpressionVariance.rs"
78
78
 
79
- #[[bin]]
80
- #name="wilcoxon"
81
- #path="src/wilcoxon.rs"
79
+ [[bin]]
80
+ name="wilcoxon"
81
+ path="src/wilcoxon.rs"
82
82
 
83
83
  [[bin]]
84
84
  name="DEanalysis"
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.44.0",
2
+ "version": "2.49.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
@@ -7,9 +7,9 @@
7
7
  "proteinpaint-rust": "index.js"
8
8
  },
9
9
  "scripts": {
10
- "dev": "cargo clean && cargo build --release",
11
- "build": "cargo clean && cargo build --release",
12
- "postinstall": "if [ ! -d ./test ] & [ ! -d ./target/release ]; then cargo clean && cargo build --release; fi",
10
+ "dev": "cargo build --release",
11
+ "build": "cargo build --release",
12
+ "postinstall": "if [ ! -d ./test ] & [ ! -d ./target/release ]; then cargo build --release; fi",
13
13
  "test": "tape **/test/*.spec.js",
14
14
  "test:unit": "tape **/test/*.unit.spec.js",
15
15
  "test:integration": "echo 'TODO: rust integration tests'"
package/src/DEanalysis.rs CHANGED
@@ -8,7 +8,6 @@ use nalgebra::base::Matrix;
8
8
  use nalgebra::base::VecStorage;
9
9
  use nalgebra::DMatrix;
10
10
  use nalgebra::ViewStorage;
11
- use r_mathlib;
12
11
  use serde::{Deserialize, Serialize};
13
12
  use serde_json;
14
13
  use statrs::statistics::Data;
@@ -24,7 +23,7 @@ use std::time::Instant;
24
23
  //use std::cmp::Ordering;
25
24
  //use std::env;
26
25
  use std::io;
27
- //mod stats_functions; // Importing Wilcoxon function from stats_functions.rs
26
+ mod stats_functions; // Importing Wilcoxon function from stats_functions.rs
28
27
  const PAR_CUTOFF: usize = 100000; // Cutoff for triggering multithreading processing of data
29
28
 
30
29
  //const PAR_CUTOFF: usize = 1000000000000000;
@@ -458,7 +457,7 @@ fn main() {
458
457
  }
459
458
  //println!("treated{:?}", treated);
460
459
  //println!("control{:?}", control);
461
- let p_value = wilcoxon_rank_sum_test(
460
+ let p_value = stats_functions::wilcoxon_rank_sum_test(
462
461
  treated.clone(),
463
462
  control.clone(),
464
463
  THRESHOLD,
@@ -535,7 +534,7 @@ fn main() {
535
534
  }
536
535
  //println!("treated{:?}", treated);
537
536
  //println!("control{:?}", control);
538
- let p_value = wilcoxon_rank_sum_test(
537
+ let p_value = stats_functions::wilcoxon_rank_sum_test(
539
538
  treated.clone(),
540
539
  control.clone(),
541
540
  THRESHOLD,
@@ -882,7 +881,7 @@ fn rank_vector(input_vector: &Vec<f64>) -> Vec<f64> {
882
881
  rank: i as f64 + 1.0,
883
882
  });
884
883
  } else {
885
- frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
884
+ frac_rank = stats_functions::calculate_frac_rank(i as f64 + 1.0, num_repeats);
886
885
  ranks.push(RankOutput {
887
886
  orig_index: input_vector_sorted[i].orig_index,
888
887
  rank: frac_rank,
@@ -1095,274 +1094,3 @@ fn cpm(
1095
1094
  //println!("output_matrix:{:?}", output_matrix);
1096
1095
  output_matrix
1097
1096
  }
1098
-
1099
- pub fn wilcoxon_rank_sum_test(
1100
- mut group1: Vec<f64>,
1101
- mut group2: Vec<f64>,
1102
- threshold: usize,
1103
- alternative: char,
1104
- correct: bool,
1105
- ) -> f64 {
1106
- // Check if there are any ties between the two groups
1107
-
1108
- let mut combined = group1.clone();
1109
- combined.extend(group2.iter().cloned());
1110
- combined.sort_by(|a, b| a.partial_cmp(b).unwrap());
1111
- //println!("combined:{:?}", combined);
1112
-
1113
- group1.sort_by(|a, b| a.partial_cmp(b).unwrap());
1114
- group2.sort_by(|a, b| a.partial_cmp(b).unwrap());
1115
- //println!("group1:{:?}", group1);
1116
- //println!("group2:{:?}", group2);
1117
-
1118
- let mut group1_iter = 0;
1119
- let mut group2_iter = 0;
1120
- let mut xy: Vec<char> = Vec::with_capacity(combined.len()); // Stores X-Y classification
1121
- let mut ranks: Vec<f64> = Vec::with_capacity(combined.len()); // Stores the rank of each element
1122
- let mut is_repeat = false;
1123
- let mut repeat_present = false;
1124
- let mut frac_rank: f64 = 0.0;
1125
- let mut num_repeats: f64 = 1.0;
1126
- let mut repeat_iter: f64 = 1.0;
1127
- #[allow(unused_variables)]
1128
- let mut weight_x: f64 = 0.0;
1129
- let mut weight_y: f64 = 0.0;
1130
- let mut group_char: char = 'X';
1131
- let mut rank_frequencies: Vec<f64> = Vec::with_capacity(combined.len());
1132
- for i in 0..combined.len() {
1133
- //println!("group1_iter:{}", group1_iter);
1134
- //println!("group2_iter:{}", group2_iter);
1135
- //println!("item1:{}", combined[i]);
1136
- //println!("is_repeat:{}", is_repeat);
1137
- if group1_iter < group1.len() && combined[i] == group1[group1_iter] {
1138
- xy.push('X');
1139
- group1_iter += 1;
1140
- group_char = 'X';
1141
- } else if group2_iter < group2.len() && combined[i] == group2[group2_iter] {
1142
- xy.push('Y');
1143
- group2_iter += 1;
1144
- group_char = 'Y';
1145
- }
1146
-
1147
- // Computing ranks
1148
- if is_repeat == false {
1149
- // Check if current element has other occurences
1150
- num_repeats = 1.0;
1151
- for j in i + 1..combined.len() {
1152
- if combined[i] == combined[j] {
1153
- is_repeat = true;
1154
- repeat_present = true;
1155
- repeat_iter = 1.0;
1156
- num_repeats += 1.0;
1157
- } else {
1158
- break;
1159
- }
1160
- }
1161
- //println!("num_repeats:{}", num_repeats);
1162
- if is_repeat == false {
1163
- ranks.push(i as f64 + 1.0);
1164
- if group_char == 'X' {
1165
- weight_x += i as f64 + 1.0;
1166
- } else if group_char == 'Y' {
1167
- weight_y += i as f64 + 1.0;
1168
- }
1169
- //rank_frequencies.push(RankFreq {
1170
- // rank: i as f64 + 1.0,
1171
- // freq: 1,
1172
- //});
1173
- rank_frequencies.push(1.0);
1174
- } else {
1175
- frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
1176
- ranks.push(frac_rank);
1177
- if group_char == 'X' {
1178
- weight_x += frac_rank;
1179
- } else if group_char == 'Y' {
1180
- weight_y += frac_rank
1181
- }
1182
- //rank_frequencies.push(RankFreq {
1183
- // rank: frac_rank,
1184
- // freq: num_repeats as usize,
1185
- //});
1186
- rank_frequencies.push(num_repeats);
1187
- }
1188
- } else if repeat_iter < num_repeats {
1189
- // Repeat case
1190
- ranks.push(frac_rank);
1191
- repeat_iter += 1.0;
1192
- if group_char == 'X' {
1193
- weight_x += frac_rank;
1194
- } else if group_char == 'Y' {
1195
- weight_y += frac_rank
1196
- }
1197
- if repeat_iter == num_repeats {
1198
- is_repeat = false;
1199
- }
1200
- } else {
1201
- //println!("i:{}", i);
1202
- ranks.push(i as f64 + 1.0);
1203
- repeat_iter = 1.0;
1204
- num_repeats = 1.0;
1205
- if group_char == 'X' {
1206
- weight_x += i as f64 + 1.0;
1207
- } else if group_char == 'Y' {
1208
- weight_y += i as f64 + 1.0;
1209
- }
1210
- }
1211
- }
1212
- //println!("rank_frequencies:{:?}", rank_frequencies);
1213
- //println!("xy:{:?}", xy);
1214
- //println!("ranks:{:?}", ranks);
1215
- //println!("weight_x:{}", weight_x);
1216
- //println!("weight_y:{}", weight_y);
1217
-
1218
- //u_dash (calculated below) calculates the "W Statistic" in wilcox.test function in R
1219
-
1220
- let u_y = weight_y - (group2.len() as f64 * (group2.len() as f64 + 1.0) / 2.0) as f64;
1221
- let u_dash_y = (u_y - (group1.len() * group2.len()) as f64).abs();
1222
- //println!("u_dash_y:{}", u_dash_y);
1223
-
1224
- let u_x = weight_x - (group1.len() as f64 * (group1.len() as f64 + 1.0) / 2.0) as f64;
1225
- let _u_dash_x = (u_x - (group1.len() * group2.len()) as f64).abs();
1226
- //println!("u_dash_x:{}", u_dash_x);
1227
-
1228
- // Calculate test_statistic
1229
-
1230
- //let t1 = weight_x - ((group1.len() as f64) * (group1.len() as f64 + 1.0)) / 2.0;
1231
- //let t2 = weight_y - ((group2.len() as f64) * (group2.len() as f64 + 1.0)) / 2.0;
1232
- //
1233
- //let mut test_statistic = t1;
1234
- //if t2 < t1 {
1235
- // test_statistic = t2;
1236
- //}
1237
-
1238
- //println!("test_statistic:{}", test_statistic);
1239
-
1240
- if group1.len() < threshold && group2.len() < threshold && repeat_present == false {
1241
- // Compute exact p-values
1242
-
1243
- // Calculate conditional probability for weight_y
1244
-
1245
- if alternative == 'g' {
1246
- // Alternative "greater"
1247
- //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
1248
- // iterate_exact_p_values(ranks, weight_y, group2.len())
1249
- //} else {
1250
- calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
1251
- //}
1252
- } else if alternative == 'l' {
1253
- // Alternative "lesser"
1254
- //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
1255
- // iterate_exact_p_values(ranks, weight_x, group1.len())
1256
- //} else {
1257
- calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
1258
- //}
1259
- } else {
1260
- // Two-sided distribution
1261
- calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
1262
- }
1263
- } else {
1264
- // Compute p-values from a normal distribution
1265
- //println!("group1 length:{}", group1.len());
1266
- //println!("group2 length:{}", group2.len());
1267
-
1268
- let mut z = u_dash_y - ((group1.len() * group2.len()) as f64) / 2.0;
1269
- //println!("z_original:{}", z);
1270
- let mut nties_sum: f64 = 0.0;
1271
- for i in 0..rank_frequencies.len() {
1272
- nties_sum += rank_frequencies[i] * rank_frequencies[i] * rank_frequencies[i]
1273
- - rank_frequencies[i];
1274
- }
1275
-
1276
- let sigma = (((group1.len() * group2.len()) as f64) / 12.0
1277
- * ((group1.len() + group2.len() + 1) as f64
1278
- - nties_sum
1279
- / (((group1.len() + group2.len()) as f64)
1280
- * ((group1.len() + group2.len() - 1) as f64))))
1281
- .sqrt();
1282
- //println!("sigma:{}", sigma);
1283
- let mut correction: f64 = 0.0;
1284
- if correct == true {
1285
- if alternative == 'g' {
1286
- // Alternative "greater"
1287
- correction = 0.5;
1288
- } else if alternative == 'l' {
1289
- // Alternative "lesser"
1290
- correction = -0.5;
1291
- } else {
1292
- // Alternative "two-sided"
1293
- if z > 0.0 {
1294
- correction = 0.5;
1295
- } else if z < 0.0 {
1296
- correction = -0.5;
1297
- } else {
1298
- // z=0
1299
- correction = 0.0;
1300
- }
1301
- }
1302
- }
1303
- z = (z - correction) / sigma;
1304
- //println!("z:{}", z);
1305
- if alternative == 'g' {
1306
- // Alternative "greater"
1307
- //println!("greater:{}", n.cdf(weight_y));
1308
- //1.0 - n.cdf(z) // Applying continuity correction
1309
- r_mathlib::normal_cdf(z, 0.0, 1.0, false, false)
1310
- } else if alternative == 'l' {
1311
- // Alternative "lesser"
1312
- //println!("lesser:{}", n.cdf(weight_x));
1313
- //n.cdf(z) // Applying continuity coorection
1314
- r_mathlib::normal_cdf(z, 0.0, 1.0, true, false)
1315
- } else {
1316
- // Alternative "two-sided"
1317
- let p_g = r_mathlib::normal_cdf(z, 0.0, 1.0, false, false); // Applying continuity correction
1318
- let p_l = r_mathlib::normal_cdf(z, 0.0, 1.0, true, false); // Applying continuity correction
1319
- let mut p_value;
1320
- if p_g < p_l {
1321
- p_value = 2.0 * p_g;
1322
- } else {
1323
- p_value = 2.0 * p_l;
1324
- }
1325
- //println!("p_value:{}", p_value);
1326
- if p_value > 1.0 {
1327
- p_value = 1.0;
1328
- }
1329
- p_value
1330
- }
1331
- }
1332
- }
1333
-
1334
- // To be used only when there are no ties in the input data
1335
- #[allow(dead_code)]
1336
- fn calculate_exact_probability(weight: f64, x: usize, y: usize, alternative: char) -> f64 {
1337
- //println!("Using Wilcoxon CDF");
1338
- let mut p_value;
1339
- if alternative == 't' {
1340
- if weight > ((x * y) as f64) / 2.0 {
1341
- p_value = 2.0 * r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
1342
- } else {
1343
- p_value = 2.0 * r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
1344
- }
1345
- if p_value > 1.0 {
1346
- p_value = 1.0;
1347
- }
1348
- } else if alternative == 'g' {
1349
- p_value = r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
1350
- } else if alternative == 'l' {
1351
- p_value = r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
1352
- } else {
1353
- // Should not happen
1354
- panic!("Unknown alternative option given, please check!");
1355
- }
1356
- //println!("p_value:{}", p_value);
1357
- p_value
1358
- }
1359
-
1360
- #[allow(dead_code)]
1361
- pub fn calculate_frac_rank(current_rank: f64, num_repeats: f64) -> f64 {
1362
- let mut sum = 0.0;
1363
- for i in 0..num_repeats as usize {
1364
- let rank = current_rank + i as f64;
1365
- sum += rank;
1366
- }
1367
- sum / num_repeats
1368
- }
package/src/stats_functions.rs CHANGED
@@ -141,274 +141,274 @@ fn chi_square_test(
141
141
  }
142
142
  }
143
143
 
144
- //#[allow(dead_code)]
145
- //pub fn wilcoxon_rank_sum_test(
146
- // mut group1: Vec<f64>,
147
- // mut group2: Vec<f64>,
148
- // threshold: usize,
149
- // alternative: char,
150
- // correct: bool,
151
- //) -> f64 {
152
- // // Check if there are any ties between the two groups
153
- //
154
- // let mut combined = group1.clone();
155
- // combined.extend(group2.iter().cloned());
156
- // combined.sort_by(|a, b| a.partial_cmp(b).unwrap());
157
- // //println!("combined:{:?}", combined);
158
- //
159
- // group1.sort_by(|a, b| a.partial_cmp(b).unwrap());
160
- // group2.sort_by(|a, b| a.partial_cmp(b).unwrap());
161
- // //println!("group1:{:?}", group1);
162
- // //println!("group2:{:?}", group2);
163
- //
164
- // let mut group1_iter = 0;
165
- // let mut group2_iter = 0;
166
- // let mut xy: Vec<char> = Vec::with_capacity(combined.len()); // Stores X-Y classification
167
- // let mut ranks: Vec<f64> = Vec::with_capacity(combined.len()); // Stores the rank of each element
168
- // let mut is_repeat = false;
169
- // let mut repeat_present = false;
170
- // let mut frac_rank: f64 = 0.0;
171
- // let mut num_repeats: f64 = 1.0;
172
- // let mut repeat_iter: f64 = 1.0;
173
- // #[allow(unused_variables)]
174
- // let mut weight_x: f64 = 0.0;
175
- // let mut weight_y: f64 = 0.0;
176
- // let mut group_char: char = 'X';
177
- // let mut rank_frequencies: Vec<f64> = Vec::with_capacity(combined.len());
178
- // for i in 0..combined.len() {
179
- // //println!("group1_iter:{}", group1_iter);
180
- // //println!("group2_iter:{}", group2_iter);
181
- // //println!("item1:{}", combined[i]);
182
- // //println!("is_repeat:{}", is_repeat);
183
- // if group1_iter < group1.len() && combined[i] == group1[group1_iter] {
184
- // xy.push('X');
185
- // group1_iter += 1;
186
- // group_char = 'X';
187
- // } else if group2_iter < group2.len() && combined[i] == group2[group2_iter] {
188
- // xy.push('Y');
189
- // group2_iter += 1;
190
- // group_char = 'Y';
191
- // }
192
- //
193
- // // Computing ranks
194
- // if is_repeat == false {
195
- // // Check if current element has other occurences
196
- // num_repeats = 1.0;
197
- // for j in i + 1..combined.len() {
198
- // if combined[i] == combined[j] {
199
- // is_repeat = true;
200
- // repeat_present = true;
201
- // repeat_iter = 1.0;
202
- // num_repeats += 1.0;
203
- // } else {
204
- // break;
205
- // }
206
- // }
207
- // //println!("num_repeats:{}", num_repeats);
208
- // if is_repeat == false {
209
- // ranks.push(i as f64 + 1.0);
210
- // if group_char == 'X' {
211
- // weight_x += i as f64 + 1.0;
212
- // } else if group_char == 'Y' {
213
- // weight_y += i as f64 + 1.0;
214
- // }
215
- // //rank_frequencies.push(RankFreq {
216
- // // rank: i as f64 + 1.0,
217
- // // freq: 1,
218
- // //});
219
- // rank_frequencies.push(1.0);
220
- // } else {
221
- // frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
222
- // ranks.push(frac_rank);
223
- // if group_char == 'X' {
224
- // weight_x += frac_rank;
225
- // } else if group_char == 'Y' {
226
- // weight_y += frac_rank
227
- // }
228
- // //rank_frequencies.push(RankFreq {
229
- // // rank: frac_rank,
230
- // // freq: num_repeats as usize,
231
- // //});
232
- // rank_frequencies.push(num_repeats);
233
- // }
234
- // } else if repeat_iter < num_repeats {
235
- // // Repeat case
236
- // ranks.push(frac_rank);
237
- // repeat_iter += 1.0;
238
- // if group_char == 'X' {
239
- // weight_x += frac_rank;
240
- // } else if group_char == 'Y' {
241
- // weight_y += frac_rank
242
- // }
243
- // if repeat_iter == num_repeats {
244
- // is_repeat = false;
245
- // }
246
- // } else {
247
- // //println!("i:{}", i);
248
- // ranks.push(i as f64 + 1.0);
249
- // repeat_iter = 1.0;
250
- // num_repeats = 1.0;
251
- // if group_char == 'X' {
252
- // weight_x += i as f64 + 1.0;
253
- // } else if group_char == 'Y' {
254
- // weight_y += i as f64 + 1.0;
255
- // }
256
- // }
257
- // }
258
- // //println!("rank_frequencies:{:?}", rank_frequencies);
259
- // //println!("xy:{:?}", xy);
260
- // //println!("ranks:{:?}", ranks);
261
- // //println!("weight_x:{}", weight_x);
262
- // //println!("weight_y:{}", weight_y);
263
- //
264
- // //u_dash (calculated below) calculates the "W Statistic" in wilcox.test function in R
265
- //
266
- // let u_y = weight_y - (group2.len() as f64 * (group2.len() as f64 + 1.0) / 2.0) as f64;
267
- // let u_dash_y = (u_y - (group1.len() * group2.len()) as f64).abs();
268
- // //println!("u_dash_y:{}", u_dash_y);
269
- //
270
- // let u_x = weight_x - (group1.len() as f64 * (group1.len() as f64 + 1.0) / 2.0) as f64;
271
- // let _u_dash_x = (u_x - (group1.len() * group2.len()) as f64).abs();
272
- // //println!("u_dash_x:{}", u_dash_x);
273
- //
274
- // // Calculate test_statistic
275
- //
276
- // //let t1 = weight_x - ((group1.len() as f64) * (group1.len() as f64 + 1.0)) / 2.0;
277
- // //let t2 = weight_y - ((group2.len() as f64) * (group2.len() as f64 + 1.0)) / 2.0;
278
- // //
279
- // //let mut test_statistic = t1;
280
- // //if t2 < t1 {
281
- // // test_statistic = t2;
282
- // //}
283
- //
284
- // //println!("test_statistic:{}", test_statistic);
285
- //
286
- // if group1.len() < threshold && group2.len() < threshold && repeat_present == false {
287
- // // Compute exact p-values
288
- //
289
- // // Calculate conditional probability for weight_y
290
- //
291
- // if alternative == 'g' {
292
- // // Alternative "greater"
293
- // //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
294
- // // iterate_exact_p_values(ranks, weight_y, group2.len())
295
- // //} else {
296
- // calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
297
- // //}
298
- // } else if alternative == 'l' {
299
- // // Alternative "lesser"
300
- // //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
301
- // // iterate_exact_p_values(ranks, weight_x, group1.len())
302
- // //} else {
303
- // calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
304
- // //}
305
- // } else {
306
- // // Two-sided distribution
307
- // calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
308
- // }
309
- // } else {
310
- // // Compute p-values from a normal distribution
311
- // //println!("group1 length:{}", group1.len());
312
- // //println!("group2 length:{}", group2.len());
313
- //
314
- // let mut z = u_dash_y - ((group1.len() * group2.len()) as f64) / 2.0;
315
- // //println!("z_original:{}", z);
316
- // let mut nties_sum: f64 = 0.0;
317
- // for i in 0..rank_frequencies.len() {
318
- // nties_sum += rank_frequencies[i] * rank_frequencies[i] * rank_frequencies[i]
319
- // - rank_frequencies[i];
320
- // }
321
- //
322
- // let sigma = (((group1.len() * group2.len()) as f64) / 12.0
323
- // * ((group1.len() + group2.len() + 1) as f64
324
- // - nties_sum
325
- // / (((group1.len() + group2.len()) as f64)
326
- // * ((group1.len() + group2.len() - 1) as f64))))
327
- // .sqrt();
328
- // //println!("sigma:{}", sigma);
329
- // let mut correction: f64 = 0.0;
330
- // if correct == true {
331
- // if alternative == 'g' {
332
- // // Alternative "greater"
333
- // correction = 0.5;
334
- // } else if alternative == 'l' {
335
- // // Alternative "lesser"
336
- // correction = -0.5;
337
- // } else {
338
- // // Alternative "two-sided"
339
- // if z > 0.0 {
340
- // correction = 0.5;
341
- // } else if z < 0.0 {
342
- // correction = -0.5;
343
- // } else {
344
- // // z=0
345
- // correction = 0.0;
346
- // }
347
- // }
348
- // }
349
- // z = (z - correction) / sigma;
350
- // //println!("z:{}", z);
351
- // if alternative == 'g' {
352
- // // Alternative "greater"
353
- // //println!("greater:{}", n.cdf(weight_y));
354
- // //1.0 - n.cdf(z) // Applying continuity correction
355
- // r_mathlib::normal_cdf(z, 0.0, 1.0, false, false)
356
- // } else if alternative == 'l' {
357
- // // Alternative "lesser"
358
- // //println!("lesser:{}", n.cdf(weight_x));
359
- // //n.cdf(z) // Applying continuity coorection
360
- // r_mathlib::normal_cdf(z, 0.0, 1.0, true, false)
361
- // } else {
362
- // // Alternative "two-sided"
363
- // let p_g = r_mathlib::normal_cdf(z, 0.0, 1.0, false, false); // Applying continuity correction
364
- // let p_l = r_mathlib::normal_cdf(z, 0.0, 1.0, true, false); // Applying continuity correction
365
- // let mut p_value;
366
- // if p_g < p_l {
367
- // p_value = 2.0 * p_g;
368
- // } else {
369
- // p_value = 2.0 * p_l;
370
- // }
371
- // //println!("p_value:{}", p_value);
372
- // if p_value > 1.0 {
373
- // p_value = 1.0;
374
- // }
375
- // p_value
376
- // }
377
- // }
378
- //}
144
+ #[allow(dead_code)]
145
+ pub fn wilcoxon_rank_sum_test(
146
+ mut group1: Vec<f64>,
147
+ mut group2: Vec<f64>,
148
+ threshold: usize,
149
+ alternative: char,
150
+ correct: bool,
151
+ ) -> f64 {
152
+ // Check if there are any ties between the two groups
153
+
154
+ let mut combined = group1.clone();
155
+ combined.extend(group2.iter().cloned());
156
+ combined.sort_by(|a, b| a.partial_cmp(b).unwrap());
157
+ //println!("combined:{:?}", combined);
158
+
159
+ group1.sort_by(|a, b| a.partial_cmp(b).unwrap());
160
+ group2.sort_by(|a, b| a.partial_cmp(b).unwrap());
161
+ //println!("group1:{:?}", group1);
162
+ //println!("group2:{:?}", group2);
163
+
164
+ let mut group1_iter = 0;
165
+ let mut group2_iter = 0;
166
+ let mut xy: Vec<char> = Vec::with_capacity(combined.len()); // Stores X-Y classification
167
+ let mut ranks: Vec<f64> = Vec::with_capacity(combined.len()); // Stores the rank of each element
168
+ let mut is_repeat = false;
169
+ let mut repeat_present = false;
170
+ let mut frac_rank: f64 = 0.0;
171
+ let mut num_repeats: f64 = 1.0;
172
+ let mut repeat_iter: f64 = 1.0;
173
+ #[allow(unused_variables)]
174
+ let mut weight_x: f64 = 0.0;
175
+ let mut weight_y: f64 = 0.0;
176
+ let mut group_char: char = 'X';
177
+ let mut rank_frequencies: Vec<f64> = Vec::with_capacity(combined.len());
178
+ for i in 0..combined.len() {
179
+ //println!("group1_iter:{}", group1_iter);
180
+ //println!("group2_iter:{}", group2_iter);
181
+ //println!("item1:{}", combined[i]);
182
+ //println!("is_repeat:{}", is_repeat);
183
+ if group1_iter < group1.len() && combined[i] == group1[group1_iter] {
184
+ xy.push('X');
185
+ group1_iter += 1;
186
+ group_char = 'X';
187
+ } else if group2_iter < group2.len() && combined[i] == group2[group2_iter] {
188
+ xy.push('Y');
189
+ group2_iter += 1;
190
+ group_char = 'Y';
191
+ }
192
+
193
+ // Computing ranks
194
+ if is_repeat == false {
195
+ // Check if current element has other occurences
196
+ num_repeats = 1.0;
197
+ for j in i + 1..combined.len() {
198
+ if combined[i] == combined[j] {
199
+ is_repeat = true;
200
+ repeat_present = true;
201
+ repeat_iter = 1.0;
202
+ num_repeats += 1.0;
203
+ } else {
204
+ break;
205
+ }
206
+ }
207
+ //println!("num_repeats:{}", num_repeats);
208
+ if is_repeat == false {
209
+ ranks.push(i as f64 + 1.0);
210
+ if group_char == 'X' {
211
+ weight_x += i as f64 + 1.0;
212
+ } else if group_char == 'Y' {
213
+ weight_y += i as f64 + 1.0;
214
+ }
215
+ //rank_frequencies.push(RankFreq {
216
+ // rank: i as f64 + 1.0,
217
+ // freq: 1,
218
+ //});
219
+ rank_frequencies.push(1.0);
220
+ } else {
221
+ frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
222
+ ranks.push(frac_rank);
223
+ if group_char == 'X' {
224
+ weight_x += frac_rank;
225
+ } else if group_char == 'Y' {
226
+ weight_y += frac_rank
227
+ }
228
+ //rank_frequencies.push(RankFreq {
229
+ // rank: frac_rank,
230
+ // freq: num_repeats as usize,
231
+ //});
232
+ rank_frequencies.push(num_repeats);
233
+ }
234
+ } else if repeat_iter < num_repeats {
235
+ // Repeat case
236
+ ranks.push(frac_rank);
237
+ repeat_iter += 1.0;
238
+ if group_char == 'X' {
239
+ weight_x += frac_rank;
240
+ } else if group_char == 'Y' {
241
+ weight_y += frac_rank
242
+ }
243
+ if repeat_iter == num_repeats {
244
+ is_repeat = false;
245
+ }
246
+ } else {
247
+ //println!("i:{}", i);
248
+ ranks.push(i as f64 + 1.0);
249
+ repeat_iter = 1.0;
250
+ num_repeats = 1.0;
251
+ if group_char == 'X' {
252
+ weight_x += i as f64 + 1.0;
253
+ } else if group_char == 'Y' {
254
+ weight_y += i as f64 + 1.0;
255
+ }
256
+ }
257
+ }
258
+ //println!("rank_frequencies:{:?}", rank_frequencies);
259
+ //println!("xy:{:?}", xy);
260
+ //println!("ranks:{:?}", ranks);
261
+ //println!("weight_x:{}", weight_x);
262
+ //println!("weight_y:{}", weight_y);
263
+
264
+ //u_dash (calculated below) calculates the "W Statistic" in wilcox.test function in R
265
+
266
+ let u_y = weight_y - (group2.len() as f64 * (group2.len() as f64 + 1.0) / 2.0) as f64;
267
+ let u_dash_y = (u_y - (group1.len() * group2.len()) as f64).abs();
268
+ //println!("u_dash_y:{}", u_dash_y);
269
+
270
+ let u_x = weight_x - (group1.len() as f64 * (group1.len() as f64 + 1.0) / 2.0) as f64;
271
+ let _u_dash_x = (u_x - (group1.len() * group2.len()) as f64).abs();
272
+ //println!("u_dash_x:{}", u_dash_x);
273
+
274
+ // Calculate test_statistic
275
+
276
+ //let t1 = weight_x - ((group1.len() as f64) * (group1.len() as f64 + 1.0)) / 2.0;
277
+ //let t2 = weight_y - ((group2.len() as f64) * (group2.len() as f64 + 1.0)) / 2.0;
278
+ //
279
+ //let mut test_statistic = t1;
280
+ //if t2 < t1 {
281
+ // test_statistic = t2;
282
+ //}
283
+
284
+ //println!("test_statistic:{}", test_statistic);
285
+
286
+ if group1.len() < threshold && group2.len() < threshold && repeat_present == false {
287
+ // Compute exact p-values
288
+
289
+ // Calculate conditional probability for weight_y
290
+
291
+ if alternative == 'g' {
292
+ // Alternative "greater"
293
+ //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
294
+ // iterate_exact_p_values(ranks, weight_y, group2.len())
295
+ //} else {
296
+ calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
297
+ //}
298
+ } else if alternative == 'l' {
299
+ // Alternative "lesser"
300
+ //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
301
+ // iterate_exact_p_values(ranks, weight_x, group1.len())
302
+ //} else {
303
+ calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
304
+ //}
305
+ } else {
306
+ // Two-sided distribution
307
+ calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
308
+ }
309
+ } else {
310
+ // Compute p-values from a normal distribution
311
+ //println!("group1 length:{}", group1.len());
312
+ //println!("group2 length:{}", group2.len());
313
+
314
+ let mut z = u_dash_y - ((group1.len() * group2.len()) as f64) / 2.0;
315
+ //println!("z_original:{}", z);
316
+ let mut nties_sum: f64 = 0.0;
317
+ for i in 0..rank_frequencies.len() {
318
+ nties_sum += rank_frequencies[i] * rank_frequencies[i] * rank_frequencies[i]
319
+ - rank_frequencies[i];
320
+ }
379
321
 
380
- //// To be used only when there are no ties in the input data
381
- //#[allow(dead_code)]
382
- //fn calculate_exact_probability(weight: f64, x: usize, y: usize, alternative: char) -> f64 {
383
- // //println!("Using Wilcoxon CDF");
384
- // let mut p_value;
385
- // if alternative == 't' {
386
- // if weight > ((x * y) as f64) / 2.0 {
387
- // p_value = 2.0 * r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
388
- // } else {
389
- // p_value = 2.0 * r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
390
- // }
391
- // if p_value > 1.0 {
392
- // p_value = 1.0;
393
- // }
394
- // } else if alternative == 'g' {
395
- // p_value = r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
396
- // } else if alternative == 'l' {
397
- // p_value = r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
398
- // } else {
399
- // // Should not happen
400
- // panic!("Unknown alternative option given, please check!");
401
- // }
402
- // //println!("p_value:{}", p_value);
403
- // p_value
404
- //}
405
- //
406
- //#[allow(dead_code)]
407
- //pub fn calculate_frac_rank(current_rank: f64, num_repeats: f64) -> f64 {
408
- // let mut sum = 0.0;
409
- // for i in 0..num_repeats as usize {
410
- // let rank = current_rank + i as f64;
411
- // sum += rank;
412
- // }
413
- // sum / num_repeats
414
- //}
322
+ let sigma = (((group1.len() * group2.len()) as f64) / 12.0
323
+ * ((group1.len() + group2.len() + 1) as f64
324
+ - nties_sum
325
+ / (((group1.len() + group2.len()) as f64)
326
+ * ((group1.len() + group2.len() - 1) as f64))))
327
+ .sqrt();
328
+ //println!("sigma:{}", sigma);
329
+ let mut correction: f64 = 0.0;
330
+ if correct == true {
331
+ if alternative == 'g' {
332
+ // Alternative "greater"
333
+ correction = 0.5;
334
+ } else if alternative == 'l' {
335
+ // Alternative "lesser"
336
+ correction = -0.5;
337
+ } else {
338
+ // Alternative "two-sided"
339
+ if z > 0.0 {
340
+ correction = 0.5;
341
+ } else if z < 0.0 {
342
+ correction = -0.5;
343
+ } else {
344
+ // z=0
345
+ correction = 0.0;
346
+ }
347
+ }
348
+ }
349
+ z = (z - correction) / sigma;
350
+ //println!("z:{}", z);
351
+ if alternative == 'g' {
352
+ // Alternative "greater"
353
+ //println!("greater:{}", n.cdf(weight_y));
354
+ //1.0 - n.cdf(z) // Applying continuity correction
355
+ r_mathlib::normal_cdf(z, 0.0, 1.0, false, false)
356
+ } else if alternative == 'l' {
357
+ // Alternative "lesser"
358
+ //println!("lesser:{}", n.cdf(weight_x));
359
+ //n.cdf(z) // Applying continuity coorection
360
+ r_mathlib::normal_cdf(z, 0.0, 1.0, true, false)
361
+ } else {
362
+ // Alternative "two-sided"
363
+ let p_g = r_mathlib::normal_cdf(z, 0.0, 1.0, false, false); // Applying continuity correction
364
+ let p_l = r_mathlib::normal_cdf(z, 0.0, 1.0, true, false); // Applying continuity correction
365
+ let mut p_value;
366
+ if p_g < p_l {
367
+ p_value = 2.0 * p_g;
368
+ } else {
369
+ p_value = 2.0 * p_l;
370
+ }
371
+ //println!("p_value:{}", p_value);
372
+ if p_value > 1.0 {
373
+ p_value = 1.0;
374
+ }
375
+ p_value
376
+ }
377
+ }
378
+ }
379
+
380
+ // To be used only when there are no ties in the input data
381
+ #[allow(dead_code)]
382
+ fn calculate_exact_probability(weight: f64, x: usize, y: usize, alternative: char) -> f64 {
383
+ //println!("Using Wilcoxon CDF");
384
+ let mut p_value;
385
+ if alternative == 't' {
386
+ if weight > ((x * y) as f64) / 2.0 {
387
+ p_value = 2.0 * r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
388
+ } else {
389
+ p_value = 2.0 * r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
390
+ }
391
+ if p_value > 1.0 {
392
+ p_value = 1.0;
393
+ }
394
+ } else if alternative == 'g' {
395
+ p_value = r_mathlib::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
396
+ } else if alternative == 'l' {
397
+ p_value = r_mathlib::wilcox_cdf(weight, x as f64, y as f64, true, false);
398
+ } else {
399
+ // Should not happen
400
+ panic!("Unknown alternative option given, please check!");
401
+ }
402
+ //println!("p_value:{}", p_value);
403
+ p_value
404
+ }
405
+
406
/// Fractional rank shared by a run of tied values.
///
/// Sums the ranks `current_rank`, `current_rank + 1`, ... for `num_repeats as usize`
/// terms and divides that sum by `num_repeats`.
#[allow(dead_code)]
pub fn calculate_frac_rank(current_rank: f64, num_repeats: f64) -> f64 {
    let tie_count = num_repeats as usize;
    let rank_total =
        (0..tie_count).fold(0.0_f64, |acc, offset| acc + (current_rank + offset as f64));
    rank_total / num_repeats
}