@sjcrh/proteinpaint-rust 2.11.1 → 2.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,427 @@
1
+ /*
2
+ ##########################
3
+ # Wilcoxon rank sum test #
4
+ ##########################
5
+
6
+ ##########################
7
+ # Documentation
8
+ ##########################
9
+
10
+ This Wilcoxon test implementation aims to replicate the methodology used in R's wilcox.test() function
11
+
12
+ #########
13
+ # Usage #
14
+ #########
15
+
16
+ # Usage: cd .. && cargo build --release && time echo '[{"group1_id":"European Ancestry","group1_values":[3.7,2.5,5.9,13.1,1,10.6,3.2,3,6.5,15.5,2.6,16.5,2.6,4,8.6,8.3,1.9,7.9,7.9,6.1,17.6,3.1,3,1.5,8.1,18.2,-1.8,3.6,6,1.9,8.9,3.2,0.3,-1,11.2,6.2,16.2,7.5,9,9.4,18.9,0.1,11.5,10.1,12.5,14.6,1.5,17.3,15.4,7.6,2.4,13.5,3.8,17],"group2_id":"African Ancestry","group2_values":[11.5,5.1,21.1,4.4,-0.04]},{"group1_id":"European Ancestry","group1_values":[3.7,2.5,5.9,13.1,1,10.6,3.2,3,6.5,15.5,2.6,16.5,2.6,4,8.6,8.3,1.9,7.9,7.9,6.1,17.6,3.1,3,1.5,8.1,18.2,-1.8,3.6,6,1.9,8.9,3.2,0.3,-1,11.2,6.2,16.2,7.5,9,9.4,18.9,0.1,11.5,10.1,12.5,14.6,1.5,17.3,15.4,7.6,2.4,13.5,3.8,17],"group2_id":"Asian Ancestry","group2_values":[1.7]},{"group1_id":"African Ancestry","group1_values":[11.5,5.1,21.1,4.4,-0.04],"group2_id":"Asian Ancestry","group2_values":[]}]' | target/release/wilcoxon
17
+
18
+ # Several examples are present in test_examples.rs. This can be tested using the command: cd .. && cargo build --release && time cargo test
19
+
20
+ # Input data is in JSON format and is read in from <in.json> file.
21
+ # Results are written in JSON format to stdout.
22
+
23
+ # Input JSON specifications:
24
+ # [{
25
+ # group1_id: group1 id,
26
+ # group1_values: [] group1 data values,
27
+ # group2_id: group2 id,
28
+ # group2_values: [] group2 data values
29
+ # }]
30
+ #
31
+ # Output JSON specifications:
32
+ # [{
33
+ # group1_id: group1 id,
34
+ # group1_values: [] group1 data values,
35
+ # group2_id: group2 id,
36
+ # group2_values: [] group2 data values,
37
+ # pvalue: p-value of test
38
+ # }]
39
+
40
+
41
+ ########
42
+ # Code #
43
+ ########
44
+ */
45
+
46
+ use json;
47
+ use r_stats;
48
+ use serde::{Deserialize, Serialize};
49
+ use std::io;
50
+
51
+ mod test_examples; // Contains examples to test the wilcoxon rank sum test
52
+
53
// One entry of the output JSON array: echoes the two input groups and
// carries the computed p-value (None/null when the test was not performed).
#[derive(Debug, Serialize, Deserialize)]
struct OutputJson {
    // Output JSON data structure
    group1_id: String,         // id (label) of the first group
    group2_id: String,         // id (label) of the second group
    group1_values: Vec<f64>,   // data values of the first group, echoed back
    group2_values: Vec<f64>,   // data values of the second group, echoed back
    pvalue: Option<f64>,       // p-value of the test; None when a group was empty
}
62
+
63
+ //#[derive(Debug)]
64
+ //struct RankFreq {
65
+ // rank: f64,
66
+ // freq: usize,
67
+ //}
68
+
69
+ fn main() {
70
+ let mut input = String::new();
71
+ match io::stdin().read_line(&mut input) {
72
+ // Accepting the piped input from nodejs (or command line from testing)
73
+ Ok(_n) => {
74
+ //println!("{} bytes read", n);
75
+ //println!("input:{}", input);
76
+ const THRESHOLD: usize = 50; // Decrease this number so as to invoke the normal approximation for lower sample sizes. This would speed up the test at the cost of sacrificing accuracy.
77
+ let input_json = json::parse(&input);
78
+ match input_json {
79
+ Ok(json_string) => {
80
+ //println!("{} bytes read", n);
81
+ //println!("json_string:{}", json_string);
82
+
83
+ let mut output_string = "[".to_string();
84
+ for i in 0..json_string.len() {
85
+ //println!("group1_id:{}", json_string[i]["group1_id"]);
86
+ //println!("group2_id:{}", json_string[i]["group2_id"]);
87
+ //println!("group1_values:{}", json_string[i]["group1_values"]);
88
+ //println!("group2_values:{}", json_string[i]["group2_values"]);
89
+ let mut vec1 = Vec::<f64>::new();
90
+ let mut vec2 = Vec::<f64>::new();
91
+
92
+ for arr_iter in 0..json_string[i]["group1_values"].len() {
93
+ vec1.push(json_string[i]["group1_values"][arr_iter].as_f64().unwrap());
94
+ }
95
+ for arr_iter in 0..json_string[i]["group2_values"].len() {
96
+ vec2.push(json_string[i]["group2_values"][arr_iter].as_f64().unwrap());
97
+ }
98
+ //println!("vec1:{:?}", vec1);
99
+ //println!("vec2:{:?}", vec2);
100
+
101
+ if vec1.len() == 0 || vec2.len() == 0 {
102
+ // If one of the vectors has a length of zero, wilcoxon test is not performed and a pvalue of NULL is given.
103
+ output_string += &serde_json::to_string(&OutputJson {
104
+ group1_id: json_string[i]["group1_id"]
105
+ .as_str()
106
+ .unwrap()
107
+ .to_string(),
108
+ group2_id: json_string[i]["group2_id"]
109
+ .as_str()
110
+ .unwrap()
111
+ .to_string(),
112
+ group1_values: vec1,
113
+ group2_values: vec2,
114
+ pvalue: None,
115
+ })
116
+ .unwrap();
117
+ output_string += &",".to_string();
118
+ } else {
119
+ let pvalue: f64 = wilcoxon_rank_sum_test(
120
+ vec1.clone(),
121
+ vec2.clone(),
122
+ THRESHOLD,
123
+ 't', // two-sided test
124
+ true,
125
+ );
126
+
127
+ //if pvalue > 0.01 {
128
+ // pvalue = format!("{:.4}", pvalue).parse().unwrap();
129
+ //}
130
+ //println!("pvalue:{}", pvalue);
131
+ output_string += &serde_json::to_string(&OutputJson {
132
+ group1_id: json_string[i]["group1_id"]
133
+ .as_str()
134
+ .unwrap()
135
+ .to_string(),
136
+ group2_id: json_string[i]["group2_id"]
137
+ .as_str()
138
+ .unwrap()
139
+ .to_string(),
140
+ group1_values: vec1,
141
+ group2_values: vec2,
142
+ pvalue: Some(pvalue),
143
+ })
144
+ .unwrap();
145
+ output_string += &",".to_string();
146
+ }
147
+ }
148
+ output_string.pop();
149
+ output_string += &"]".to_string();
150
+ println!("{}", output_string);
151
+ }
152
+ Err(error) => println!("Incorrect json: {}", error),
153
+ }
154
+ }
155
+ Err(error) => println!("Piping error: {}", error),
156
+ }
157
+ }
158
+
159
+ fn wilcoxon_rank_sum_test(
160
+ mut group1: Vec<f64>,
161
+ mut group2: Vec<f64>,
162
+ threshold: usize,
163
+ alternative: char,
164
+ correct: bool,
165
+ ) -> f64 {
166
+ // Check if there are any ties between the two groups
167
+
168
+ let mut combined = group1.clone();
169
+ combined.extend(group2.iter().cloned());
170
+ combined.sort_by(|a, b| a.partial_cmp(b).unwrap());
171
+ //println!("combined:{:?}", combined);
172
+
173
+ group1.sort_by(|a, b| a.partial_cmp(b).unwrap());
174
+ group2.sort_by(|a, b| a.partial_cmp(b).unwrap());
175
+ //println!("group1:{:?}", group1);
176
+ //println!("group2:{:?}", group2);
177
+
178
+ let mut group1_iter = 0;
179
+ let mut group2_iter = 0;
180
+ let mut xy = Vec::<char>::new(); // Stores X-Y classification
181
+ let mut ranks = Vec::<f64>::new(); // Stores the rank of each element
182
+ let mut is_repeat = false;
183
+ let mut repeat_present = false;
184
+ let mut frac_rank: f64 = 0.0;
185
+ let mut num_repeats: f64 = 1.0;
186
+ let mut repeat_iter: f64 = 1.0;
187
+ #[allow(unused_variables)]
188
+ let mut weight_x: f64 = 0.0;
189
+ let mut weight_y: f64 = 0.0;
190
+ let mut group_char: char = 'X';
191
+ let mut rank_frequencies = Vec::<f64>::new();
192
+ for i in 0..combined.len() {
193
+ //println!("group1_iter:{}", group1_iter);
194
+ //println!("group2_iter:{}", group2_iter);
195
+ //println!("item1:{}", combined[i]);
196
+ //println!("is_repeat:{}", is_repeat);
197
+ if group1_iter < group1.len() && combined[i] == group1[group1_iter] {
198
+ xy.push('X');
199
+ group1_iter += 1;
200
+ group_char = 'X';
201
+ } else if group2_iter < group2.len() && combined[i] == group2[group2_iter] {
202
+ xy.push('Y');
203
+ group2_iter += 1;
204
+ group_char = 'Y';
205
+ }
206
+
207
+ // Computing ranks
208
+ if is_repeat == false {
209
+ // Check if current element has other occurences
210
+ num_repeats = 1.0;
211
+ for j in i + 1..combined.len() {
212
+ if combined[i] == combined[j] {
213
+ is_repeat = true;
214
+ repeat_present = true;
215
+ repeat_iter = 1.0;
216
+ num_repeats += 1.0;
217
+ } else {
218
+ break;
219
+ }
220
+ }
221
+ //println!("num_repeats:{}", num_repeats);
222
+ if is_repeat == false {
223
+ ranks.push(i as f64 + 1.0);
224
+ if group_char == 'X' {
225
+ weight_x += i as f64 + 1.0;
226
+ } else if group_char == 'Y' {
227
+ weight_y += i as f64 + 1.0;
228
+ }
229
+ //rank_frequencies.push(RankFreq {
230
+ // rank: i as f64 + 1.0,
231
+ // freq: 1,
232
+ //});
233
+ rank_frequencies.push(1.0);
234
+ } else {
235
+ frac_rank = calculate_frac_rank(i as f64 + 1.0, num_repeats);
236
+ ranks.push(frac_rank);
237
+ if group_char == 'X' {
238
+ weight_x += frac_rank;
239
+ } else if group_char == 'Y' {
240
+ weight_y += frac_rank
241
+ }
242
+ //rank_frequencies.push(RankFreq {
243
+ // rank: frac_rank,
244
+ // freq: num_repeats as usize,
245
+ //});
246
+ rank_frequencies.push(num_repeats);
247
+ }
248
+ } else if repeat_iter < num_repeats {
249
+ // Repeat case
250
+ ranks.push(frac_rank);
251
+ repeat_iter += 1.0;
252
+ if group_char == 'X' {
253
+ weight_x += frac_rank;
254
+ } else if group_char == 'Y' {
255
+ weight_y += frac_rank
256
+ }
257
+ if repeat_iter == num_repeats {
258
+ is_repeat = false;
259
+ }
260
+ } else {
261
+ //println!("i:{}", i);
262
+ ranks.push(i as f64 + 1.0);
263
+ repeat_iter = 1.0;
264
+ num_repeats = 1.0;
265
+ if group_char == 'X' {
266
+ weight_x += i as f64 + 1.0;
267
+ } else if group_char == 'Y' {
268
+ weight_y += i as f64 + 1.0;
269
+ }
270
+ }
271
+ }
272
+ //println!("rank_frequencies:{:?}", rank_frequencies);
273
+ //println!("xy:{:?}", xy);
274
+ //println!("ranks:{:?}", ranks);
275
+ //println!("weight_x:{}", weight_x);
276
+ //println!("weight_y:{}", weight_y);
277
+
278
+ //u_dash (calculated below) calculates the "W Statistic" in wilcox.test function in R
279
+
280
+ let u_y = weight_y - (group2.len() as f64 * (group2.len() as f64 + 1.0) / 2.0) as f64;
281
+ let u_dash_y = (u_y - (group1.len() * group2.len()) as f64).abs();
282
+ //println!("u_dash_y:{}", u_dash_y);
283
+
284
+ let u_x = weight_x - (group1.len() as f64 * (group1.len() as f64 + 1.0) / 2.0) as f64;
285
+ let _u_dash_x = (u_x - (group1.len() * group2.len()) as f64).abs();
286
+ //println!("u_dash_x:{}", u_dash_x);
287
+
288
+ // Calculate test_statistic
289
+
290
+ //let t1 = weight_x - ((group1.len() as f64) * (group1.len() as f64 + 1.0)) / 2.0;
291
+ //let t2 = weight_y - ((group2.len() as f64) * (group2.len() as f64 + 1.0)) / 2.0;
292
+ //
293
+ //let mut test_statistic = t1;
294
+ //if t2 < t1 {
295
+ // test_statistic = t2;
296
+ //}
297
+
298
+ //println!("test_statistic:{}", test_statistic);
299
+
300
+ if group1.len() < threshold && group2.len() < threshold && repeat_present == false {
301
+ // Compute exact p-values
302
+
303
+ // Calculate conditional probability for weight_y
304
+
305
+ if alternative == 'g' {
306
+ // Alternative "greater"
307
+ //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
308
+ // iterate_exact_p_values(ranks, weight_y, group2.len())
309
+ //} else {
310
+ calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
311
+ //}
312
+ } else if alternative == 'l' {
313
+ // Alternative "lesser"
314
+ //if group1.len() <= low_cutoff && group2.len() <= low_cutoff {
315
+ // iterate_exact_p_values(ranks, weight_x, group1.len())
316
+ //} else {
317
+ calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
318
+ //}
319
+ } else {
320
+ // Two-sided distribution
321
+ calculate_exact_probability(u_dash_y, group1.len(), group2.len(), alternative)
322
+ }
323
+ } else {
324
+ // Compute p-values from a normal distribution
325
+ //println!("group1 length:{}", group1.len());
326
+ //println!("group2 length:{}", group2.len());
327
+
328
+ let mut z = u_dash_y - ((group1.len() * group2.len()) as f64) / 2.0;
329
+ //println!("z_original:{}", z);
330
+ let mut nties_sum: f64 = 0.0;
331
+ for i in 0..rank_frequencies.len() {
332
+ nties_sum += rank_frequencies[i] * rank_frequencies[i] * rank_frequencies[i]
333
+ - rank_frequencies[i];
334
+ }
335
+
336
+ let sigma = (((group1.len() * group2.len()) as f64) / 12.0
337
+ * ((group1.len() + group2.len() + 1) as f64
338
+ - nties_sum
339
+ / (((group1.len() + group2.len()) as f64)
340
+ * ((group1.len() + group2.len() - 1) as f64))))
341
+ .sqrt();
342
+ //println!("sigma:{}", sigma);
343
+ let mut correction: f64 = 0.0;
344
+ if correct == true {
345
+ if alternative == 'g' {
346
+ // Alternative "greater"
347
+ correction = 0.5;
348
+ } else if alternative == 'g' {
349
+ // Alternative "lesser"
350
+ correction = -0.5;
351
+ } else {
352
+ // Alternative "two-sided"
353
+ if z > 0.0 {
354
+ correction = 0.5;
355
+ } else if z < 0.0 {
356
+ correction = -0.5;
357
+ } else {
358
+ // z=0
359
+ correction = 0.0;
360
+ }
361
+ }
362
+ }
363
+ z = (z - correction) / sigma;
364
+ //println!("z:{}", z);
365
+ if alternative == 'g' {
366
+ // Alternative "greater"
367
+ //println!("greater:{}", n.cdf(weight_y));
368
+ //1.0 - n.cdf(z) // Applying continuity correction
369
+ r_stats::normal_cdf(z, 0.0, 1.0, false, false)
370
+ } else if alternative == 'l' {
371
+ // Alternative "lesser"
372
+ //println!("lesser:{}", n.cdf(weight_x));
373
+ //n.cdf(z) // Applying continuity coorection
374
+ r_stats::normal_cdf(z, 0.0, 1.0, true, false)
375
+ } else {
376
+ // Alternative "two-sided"
377
+ let p_g = r_stats::normal_cdf(z, 0.0, 1.0, false, false); // Applying continuity correction
378
+ let p_l = r_stats::normal_cdf(z, 0.0, 1.0, true, false); // Applying continuity correction
379
+ let mut p_value;
380
+ if p_g < p_l {
381
+ p_value = 2.0 * p_g;
382
+ } else {
383
+ p_value = 2.0 * p_l;
384
+ }
385
+ //println!("p_value:{}", p_value);
386
+ if p_value > 1.0 {
387
+ p_value = 1.0;
388
+ }
389
+ p_value
390
+ }
391
+ }
392
+ }
393
+
394
+ // To be used only when there are no ties in the input data
395
+ fn calculate_exact_probability(weight: f64, x: usize, y: usize, alternative: char) -> f64 {
396
+ //println!("Using Wilcoxon CDF");
397
+ let mut p_value;
398
+ if alternative == 't' {
399
+ if weight > ((x * y) as f64) / 2.0 {
400
+ p_value = 2.0 * r_stats::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
401
+ } else {
402
+ p_value = 2.0 * r_stats::wilcox_cdf(weight, x as f64, y as f64, true, false);
403
+ }
404
+ if p_value > 1.0 {
405
+ p_value = 1.0;
406
+ }
407
+ } else if alternative == 'g' {
408
+ p_value = r_stats::wilcox_cdf(weight - 1.0, x as f64, y as f64, false, false);
409
+ } else if alternative == 'l' {
410
+ p_value = r_stats::wilcox_cdf(weight, x as f64, y as f64, true, false);
411
+ } else {
412
+ // Should not happen
413
+ panic!("Unknown alternative option given, please check!");
414
+ }
415
+ //println!("p_value:{}", p_value);
416
+ p_value
417
+ }
418
+
419
/// Average (fractional) rank assigned to a run of `num_repeats` tied values
/// whose first occurrence would have received plain rank `current_rank`.
fn calculate_frac_rank(current_rank: f64, num_repeats: f64) -> f64 {
    // Sum the consecutive ranks occupied by the tie run, then average them.
    let total: f64 = (0..num_repeats as usize)
        .map(|offset| current_rank + offset as f64)
        .sum();
    total / num_repeats
}