@sjcrh/proteinpaint-rust 2.129.2 → 2.129.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,9 @@
1
+ #![allow(non_snake_case)]
1
2
  use fishers_exact::fishers_exact;
2
3
  //use r_mathlib;
4
+ use r_mathlib::chi_squared_cdf;
3
5
  use statrs::distribution::{ChiSquared, ContinuousCDF};
6
+ use std::collections::HashSet;
4
7
  use std::panic;
5
8
 
6
9
  #[allow(dead_code)]
@@ -99,24 +102,19 @@ fn chi_square_test(
99
102
  {
100
103
  0.05 // Arbitarily put a very high number when there are only forward or reverse reads for alternate/reference
101
104
  } else {
102
- let total: f64 = (alternate_forward_count
103
- + alternate_reverse_count
104
- + reference_forward_count
105
- + reference_reverse_count) as f64;
106
- let expected_alternate_forward_count: f64 = (alternate_forward_count
107
- + alternate_reverse_count) as f64
105
+ let total: f64 =
106
+ (alternate_forward_count + alternate_reverse_count + reference_forward_count + reference_reverse_count)
107
+ as f64;
108
+ let expected_alternate_forward_count: f64 = (alternate_forward_count + alternate_reverse_count) as f64
108
109
  * (alternate_forward_count + reference_forward_count) as f64
109
110
  / total;
110
- let expected_alternate_reverse_count: f64 = (alternate_forward_count
111
- + alternate_reverse_count) as f64
111
+ let expected_alternate_reverse_count: f64 = (alternate_forward_count + alternate_reverse_count) as f64
112
112
  * (alternate_reverse_count + reference_reverse_count) as f64
113
113
  / total;
114
- let expected_reference_forward_count: f64 = (alternate_forward_count
115
- + reference_forward_count) as f64
114
+ let expected_reference_forward_count: f64 = (alternate_forward_count + reference_forward_count) as f64
116
115
  * (reference_forward_count + reference_reverse_count) as f64
117
116
  / total;
118
- let expected_reference_reverse_count: f64 = (reference_forward_count
119
- + reference_reverse_count) as f64
117
+ let expected_reference_reverse_count: f64 = (reference_forward_count + reference_reverse_count) as f64
120
118
  * (alternate_reverse_count + reference_reverse_count) as f64
121
119
  / total;
122
120
 
@@ -315,15 +313,12 @@ pub fn wilcoxon_rank_sum_test(
315
313
  //println!("z_original:{}", z);
316
314
  let mut nties_sum: f64 = 0.0;
317
315
  for i in 0..rank_frequencies.len() {
318
- nties_sum += rank_frequencies[i] * rank_frequencies[i] * rank_frequencies[i]
319
- - rank_frequencies[i];
316
+ nties_sum += rank_frequencies[i] * rank_frequencies[i] * rank_frequencies[i] - rank_frequencies[i];
320
317
  }
321
318
 
322
319
  let sigma = (((group1.len() * group2.len()) as f64) / 12.0
323
320
  * ((group1.len() + group2.len() + 1) as f64
324
- - nties_sum
325
- / (((group1.len() + group2.len()) as f64)
326
- * ((group1.len() + group2.len() - 1) as f64))))
321
+ - nties_sum / (((group1.len() + group2.len()) as f64) * ((group1.len() + group2.len() - 1) as f64))))
327
322
  .sqrt();
328
323
  //println!("sigma:{}", sigma);
329
324
  let mut correction: f64 = 0.0;
@@ -412,3 +407,82 @@ pub fn calculate_frac_rank(current_rank: f64, num_repeats: f64) -> f64 {
412
407
  }
413
408
  sum / num_repeats
414
409
  }
410
+
411
/// A single gene entry in a fold-change-ordered gene list, as consumed by [`cerno`].
#[derive(Debug, Clone, PartialEq, PartialOrd)]
#[allow(non_camel_case_types, non_snake_case)]
pub struct gene_order {
    /// Name of the gene; `cerno` matches this against the names in the pathway set.
    pub gene_name: String,
    /// Fold change of the gene (the value the gene lists are sorted by).
    pub fold_change: f32,
    /// Rank of the gene within its sorted list, or `None` while no rank has been assigned yet.
    /// `cerno` requires a rank on every gene that belongs to the tested pathway.
    pub rank: Option<usize>,
}
419
+
420
+ #[allow(dead_code)]
421
+ pub fn cerno(
422
+ genes_descending: &Vec<gene_order>,
423
+ genes_ascending: &Vec<gene_order>,
424
+ genes_in_pathway: HashSet<String>,
425
+ ) -> (f32, f32, f32, f32, String, f32) {
426
+ // Ensure sample_coding_genes is sorted in decreasing order of fold_change
427
+ // Filter the genes_descending vector to only include those whose gene_names are in the HashSet genes_in_pathway
428
+ let gene_intersections_descending: Vec<&gene_order> = genes_descending
429
+ .iter()
430
+ .filter(|genes_descending| genes_in_pathway.contains(&genes_descending.gene_name)) // Check if name is in the HashSet genes_in_pathway
431
+ .collect(); // Collect the results into a new vector
432
+
433
+ let N1 = gene_intersections_descending.len() as f32;
434
+ let N = genes_descending.len() as f32;
435
+ let mut gene_set_hits: String = "".to_string();
436
+ for gene in &gene_intersections_descending {
437
+ gene_set_hits += &(gene.gene_name.to_string() + &",");
438
+ }
439
+ if gene_intersections_descending.len() > 0 {
440
+ // Remove the last "," in string
441
+ gene_set_hits.pop();
442
+ }
443
+
444
+ let ranks_descending: Vec<usize> = gene_intersections_descending // x <- l %in% mset$gs2gv[[m]] ; ranks <- c(1:N)[x]
445
+ .iter()
446
+ .map(|x| x.rank.unwrap())
447
+ .collect::<Vec<usize>>();
448
+
449
+ let cerno: f32 = ranks_descending // -2 * sum( log(ranks/N) )
450
+ .iter()
451
+ .map(|x| ((*x as f32) / N).ln())
452
+ .collect::<Vec<f32>>()
453
+ .iter()
454
+ .sum::<f32>()
455
+ * (-2.0);
456
+
457
+ let cES;
458
+ let N2 = N - N1; // N2 = N - N1
459
+ let R1 = ranks_descending.iter().sum::<usize>() as f32; // R1 <- sum(ranks)
460
+ let U = N1 * N2 + N1 * (N1 + 1.0) / 2.0 - R1; // U <- N1*N2+N1*(N1+1)/2-R1
461
+ let AUC = U / (N1 * N2); // AUC <- U/(N1*N2)
462
+ let p_value;
463
+ if AUC >= 0.5 {
464
+ // Upregulated geneset
465
+ cES = cerno / (2.0 * (N1 as f32)); // cES <- cerno/(2*N1)
466
+ p_value = chi_squared_cdf(cerno as f64, (2.0 * N1) as f64, false, false);
467
+ // pchisq(ret$cerno, 2*N1, lower.tail=FALSE)
468
+ } else {
469
+ let gene_intersections_ascending: Vec<&gene_order> = genes_ascending
470
+ .iter()
471
+ .filter(|genes_ascending| genes_in_pathway.contains(&genes_ascending.gene_name)) // Check if name is in the HashSet genes_in_pathway
472
+ .collect(); // Collect the results into a new vector
473
+ let ranks_ascending: Vec<usize> = gene_intersections_ascending // x <- l %in% mset$gs2gv[[m]] ; ranks <- c(1:N)[x]
474
+ .iter()
475
+ .map(|x| x.rank.unwrap())
476
+ .collect::<Vec<usize>>();
477
+ let cerno_ascending: f32 = ranks_ascending // -2 * sum( log(ranks/N) )
478
+ .iter()
479
+ .map(|x| ((*x as f32) / N).ln())
480
+ .collect::<Vec<f32>>()
481
+ .iter()
482
+ .sum::<f32>()
483
+ * (-2.0);
484
+ cES = cerno_ascending / (2.0 * (N1 as f32)); // cES <- cerno/(2*N1)
485
+ p_value = chi_squared_cdf(cerno_ascending as f64, (2.0 * N1) as f64, false, false);
486
+ }
487
+ (p_value as f32, AUC, cES, N1, gene_set_hits, cerno)
488
+ }
@@ -0,0 +1,131 @@
1
// For capturing output from a test, run: cd .. && cargo test -- --nocapture
// Empty entry point; this file only exists to host the tests below.
#[allow(dead_code)]
fn main() {}

#[cfg(test)]
mod tests {
    use crate::stats_functions::{cerno, gene_order};
    use flate2::read::GzDecoder;
    use json::JsonValue;
    use std::cmp::Ordering;
    use std::collections::HashSet;
    use std::fs::File;
    use std::io::{BufReader, Read};

    const P_VALUE_CUTOFF: f32 = 0.01; // Threshold difference between calculated and expected p-value for the test to pass
    const AUC_CUTOFF: f32 = 0.001; // Threshold difference between calculated and expected AUC for the test to pass
    const ES_CUTOFF: f32 = 0.01; // Threshold difference between calculated and expected ES for the test to pass
    const CERNO_CUTOFF: f32 = 1.0; // Threshold difference between calculated and expected CERNO value for the test to pass

    /// Gzip-compressed JSON fixture holding the input genes, the gene sets and the expected results.
    const FIXTURE_PATH: &str = "test/cerno_test.json.gz";

    /// Reads the JSON array stored under `field` as a vector of `f32`.
    /// Panics when an element is not a number.
    fn f32_field(fixture: &JsonValue, field: &str) -> Vec<f32> {
        fixture[field]
            .members()
            .map(|value| {
                value
                    .as_f32()
                    .unwrap_or_else(|| panic!("`{}` must contain only numbers", field))
            })
            .collect()
    }

    /// Fails the test with a descriptive message unless `actual` is within `cutoff` of `expected`.
    fn assert_close(quantity: &str, module: &str, actual: f32, expected: f32, cutoff: f32) {
        assert!(
            (actual - expected).abs() < cutoff,
            "{} for gene set {}: got {}, expected {} (tolerance {})",
            quantity,
            module,
            actual,
            expected,
            cutoff
        );
    }

    /// Runs `cerno` on every gene set of the fixture and compares p-value, AUC, ES and CERNO
    /// statistic against the values produced by the original tmod package in R.
    #[test]
    fn cerno_test() {
        // Decompress the whole fixture into memory.
        let file = File::open(FIXTURE_PATH).unwrap();
        let mut gz_decoder = GzDecoder::new(BufReader::new(file));
        let mut raw_json = String::new();
        gz_decoder.read_to_string(&mut raw_json).unwrap();

        // A fixture that cannot be parsed must fail the test instead of letting it pass silently.
        let fixture = json::parse(&raw_json).unwrap_or_else(|error| panic!("Incorrect json:{}", error));

        let gene_names: Vec<&str> = fixture["input_genes"]
            .members()
            .map(|gene| gene.as_str().expect("`input_genes` must contain only strings"))
            .collect();
        let fold_changes = f32_field(&fixture, "input_fold_change");
        assert!(
            fold_changes.len() >= gene_names.len(),
            "`input_fold_change` has fewer entries than `input_genes`"
        );

        // Ranks are filled in once the genes have been sorted.
        let mut genes_descending: Vec<gene_order> = gene_names
            .iter()
            .zip(&fold_changes)
            .map(|(name, &fold_change)| gene_order {
                gene_name: name.to_string(),
                fold_change,
                rank: None,
            })
            .collect();

        // Descending order of fold change.
        genes_descending.sort_by(|a, b| b.fold_change.partial_cmp(&a.fold_change).unwrap_or(Ordering::Equal));
        // The ascending list is derived from the descending one (stable sort), which keeps the
        // same tie order as sorting a single vector twice.
        let mut genes_ascending = genes_descending.clone();
        genes_ascending.sort_by(|a, b| a.fold_change.partial_cmp(&b.fold_change).unwrap_or(Ordering::Equal));

        // Assign ranks to each gene.
        // NOTE(review): ranks start at 0 here, while the R reference quoted in `cerno` uses
        // `c(1:N)` (1-based) - confirm which convention is intended.
        for (rank, gene) in genes_descending.iter_mut().enumerate() {
            gene.rank = Some(rank);
        }
        for (rank, gene) in genes_ascending.iter_mut().enumerate() {
            gene.rank = Some(rank);
        }

        // The expected values come from the original tmod package in R.
        let expected_p_values = f32_field(&fixture, "expected_p_values");
        let expected_auc = f32_field(&fixture, "expected_auc");
        let expected_es = f32_field(&fixture, "expected_es");
        let expected_cerno = f32_field(&fixture, "expected_cerno");

        let mut modules_tested = 0;
        for (index, (module_name, module_genes)) in fixture["MODULES2GENES"].entries().enumerate() {
            let geneset: HashSet<String> = module_genes.members().map(|gene| gene.to_string()).collect();
            let (p_value, auc, es, _matches, _gene_set_hits, cerno_output) =
                cerno(&genes_descending, &genes_ascending, geneset);
            println!("Geneset name:{}", module_name);
            println!("p_value:{}", p_value);
            println!("auc:{}", auc);
            println!("es:{}", es);
            println!("cerno:{}", cerno_output);

            assert_close("p-value", module_name, p_value, expected_p_values[index], P_VALUE_CUTOFF);
            assert_close("AUC", module_name, auc, expected_auc[index], AUC_CUTOFF);
            assert_close("ES", module_name, es, expected_es[index], ES_CUTOFF);
            assert_close("CERNO", module_name, cerno_output, expected_cerno[index], CERNO_CUTOFF);
            modules_tested += 1;
        }
        // Guard against a fixture without gene sets, which would otherwise verify nothing.
        assert!(modules_tested > 0, "`MODULES2GENES` contains no gene sets to test");
    }
}
package/src/wilcoxon.rs CHANGED
@@ -48,6 +48,7 @@ use serde::{Deserialize, Serialize};
48
48
  use std::io;
49
49
 
50
50
  mod stats_functions; // Import Wilcoxon function
51
+ #[cfg(test)]
51
52
  mod test_examples; // Contains examples to test the wilcoxon rank sum test
52
53
 
53
54
  #[derive(Debug, Serialize, Deserialize)]
@@ -101,14 +102,8 @@ fn main() {
101
102
  if vec1.len() == 0 || vec2.len() == 0 {
102
103
  // If one of the vectors has a length of zero, wilcoxon test is not performed and a pvalue of NULL is given.
103
104
  output_string += &serde_json::to_string(&OutputJson {
104
- group1_id: json_string[i]["group1_id"]
105
- .as_str()
106
- .unwrap()
107
- .to_string(),
108
- group2_id: json_string[i]["group2_id"]
109
- .as_str()
110
- .unwrap()
111
- .to_string(),
105
+ group1_id: json_string[i]["group1_id"].as_str().unwrap().to_string(),
106
+ group2_id: json_string[i]["group2_id"].as_str().unwrap().to_string(),
112
107
  group1_values: vec1,
113
108
  group2_values: vec2,
114
109
  pvalue: None,
@@ -129,14 +124,8 @@ fn main() {
129
124
  //}
130
125
  //println!("pvalue:{}", pvalue);
131
126
  output_string += &serde_json::to_string(&OutputJson {
132
- group1_id: json_string[i]["group1_id"]
133
- .as_str()
134
- .unwrap()
135
- .to_string(),
136
- group2_id: json_string[i]["group2_id"]
137
- .as_str()
138
- .unwrap()
139
- .to_string(),
127
+ group1_id: json_string[i]["group1_id"].as_str().unwrap().to_string(),
128
+ group2_id: json_string[i]["group2_id"].as_str().unwrap().to_string(),
140
129
  group1_values: vec1,
141
130
  group2_values: vec2,
142
131
  pvalue: Some(pvalue),