@sjcrh/proteinpaint-rust 2.78.0 → 2.84.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.78.0",
2
+ "version": "2.84.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
@@ -38,5 +38,5 @@
38
38
  "devDependencies": {
39
39
  "tape": "^5.2.2"
40
40
  },
41
- "pp_release_tag": "v2.78.0"
41
+ "pp_release_tag": "v2.84.0"
42
42
  }
package/src/DEanalysis.rs CHANGED
@@ -1,6 +1,9 @@
1
- // cd .. && cargo build --release && json='{"min_count":10,"min_total_count":15,"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
1
+ // cd .. && cargo build --release && json='{"min_count":10,"min_total_count":15,"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","data_type":"do_DE","storage_type":"text","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
2
+ // cd .. && cargo build --release && json='{"data_type":"get_samples","input_file":"/Users/rpaul1/pp_data/files/hg38/ALL-pharmacotyping/rnaseq/counts.h5"}' && time echo $json | target/release/DEanalysis
2
3
  // cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
3
4
  #![allow(non_snake_case)]
5
+ use hdf5::types::VarLenAscii;
6
+ use hdf5::File as HDF5File;
4
7
  use json;
5
8
  use nalgebra::base::dimension::Const;
6
9
  use nalgebra::base::dimension::Dyn;
@@ -8,6 +11,9 @@ use nalgebra::base::Matrix;
8
11
  use nalgebra::base::VecStorage;
9
12
  use nalgebra::DMatrix;
10
13
  use nalgebra::ViewStorage;
14
+ use ndarray::Array1;
15
+ use ndarray::Array2;
16
+ use ndarray::Dim;
11
17
  use serde::{Deserialize, Serialize};
12
18
  use serde_json;
13
19
  use statrs::statistics::Data;
@@ -55,7 +61,156 @@ fn binary_search(input: &Vec<usize>, y: usize) -> i64 {
55
61
  index
56
62
  }
57
63
 
58
- fn input_data(
64
+ fn input_data_from_HDF5(
65
+ hdf5_filename: &String,
66
+ case_list: &Vec<&str>,
67
+ control_list: &Vec<&str>,
68
+ ) -> (
69
+ Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
70
+ Vec<usize>,
71
+ Vec<usize>,
72
+ Vec<String>,
73
+ Vec<String>,
74
+ ) {
75
+ let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
76
+ let ds_dim = file.dataset("dims").unwrap(); // open the dataset
77
+ let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
78
+ let mut case_indexes: Vec<usize> = Vec::with_capacity(case_list.len());
79
+ let mut control_indexes: Vec<usize> = Vec::with_capacity(control_list.len());
80
+ // Check the data type and read the dataset accordingly
81
+ let data_dim: Array1<_> = ds_dim.read::<usize, Dim<[usize; 1]>>().unwrap();
82
+ let num_samples = data_dim[0]; // Number of total columns in the dataset
83
+ let num_genes = data_dim[1]; // Number of total rows in the dataset
84
+ println!("num_samples bulk:{}", num_samples);
85
+ println!("num_genes bulk:{}", num_genes);
86
+
87
+ let now_gene_names = Instant::now();
88
+ let ds_gene_names = file.dataset("gene_names").unwrap();
89
+ println!("ds_gene_names:{:?}", ds_gene_names);
90
+ let gene_names = ds_gene_names
91
+ .read::<VarLenAscii, Dim<[usize; 1]>>()
92
+ .unwrap();
93
+ println!("\tgene_names = {:?}", gene_names);
94
+ println!("\tgene_names.shape() = {:?}", gene_names.shape());
95
+ println!("\tgene_names.strides() = {:?}", gene_names.strides());
96
+ println!("\tgene_names.ndim() = {:?}", gene_names.ndim());
97
+ println!("Time for parsing gene names:{:?}", now_gene_names.elapsed());
98
+
99
+ let now_gene_symbols = Instant::now();
100
+ let ds_gene_symbols = file.dataset("gene_symbols").unwrap();
101
+ println!("ds_gene_symbols:{:?}", ds_gene_symbols);
102
+ let gene_symbols = ds_gene_symbols
103
+ .read::<VarLenAscii, Dim<[usize; 1]>>()
104
+ .unwrap();
105
+ println!("\tgene_symbols = {:?}", gene_symbols);
106
+ println!("\tgene_symbols.shape() = {:?}", gene_symbols.shape());
107
+ println!("\tgene_symbols.strides() = {:?}", gene_symbols.strides());
108
+ println!("\tgene_symbols.ndim() = {:?}", gene_symbols.ndim());
109
+ println!(
110
+ "Time for parsing gene symbols:{:?}",
111
+ now_gene_symbols.elapsed()
112
+ );
113
+
114
+ let mut gene_names_string: Vec<String> = Vec::with_capacity(gene_names.len());
115
+ let mut gene_symbols_string: Vec<String> = Vec::with_capacity(gene_symbols.len());
116
+ for i in 0..gene_names.len() {
117
+ gene_names_string.push(gene_names[i].to_string());
118
+ gene_symbols_string.push(gene_symbols[i].to_string());
119
+ }
120
+
121
+ let now_samples = Instant::now();
122
+ let ds_samples = file.dataset("samples").unwrap();
123
+ let samples = ds_samples.read::<VarLenAscii, Dim<[usize; 1]>>().unwrap();
124
+ println!("\tsamples = {:?}", samples);
125
+ println!("\tsamples.shape() = {:?}", samples.shape());
126
+ println!("\tsamples.strides() = {:?}", samples.strides());
127
+ println!("\tsamples.ndim() = {:?}", samples.ndim());
128
+ println!("Time for parsing samples:{:?}", now_samples.elapsed());
129
+
130
+ //Find all columns values that are populated for the given gene
131
+ let now_counts = Instant::now();
132
+ let ds_counts = file.dataset("counts").unwrap(); // open the dataset
133
+
134
+ let mut global_sample_index = 0;
135
+ for sample_name in case_list {
136
+ let sample_index;
137
+ match samples
138
+ .iter()
139
+ .position(|x| x.to_string() == *sample_name.to_string())
140
+ {
141
+ Some(index) => {
142
+ //println!(
143
+ // "The index of '{}' is {} in 0-based format (add 1 to compare with R output)",
144
+ // sample_name, index
145
+ //);
146
+ sample_index = index;
147
+ }
148
+ None => panic!(
149
+ "Sample '{}' not found in the HDF5 file '{}'",
150
+ sample_name, &hdf5_filename
151
+ ),
152
+ }
153
+
154
+ let sample_array: Array2<f64> = ds_counts
155
+ .read_slice_2d((0..gene_names.len(), sample_index..sample_index + 1))
156
+ .unwrap();
157
+ //println!("Length of gene array:{:?}", sample_array.len()); // Please check the result
158
+ input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
159
+ case_indexes.push(global_sample_index);
160
+ global_sample_index += 1;
161
+ }
162
+
163
+ for sample_name in control_list {
164
+ let sample_index;
165
+ match samples
166
+ .iter()
167
+ .position(|x| x.to_string() == *sample_name.to_string())
168
+ {
169
+ Some(index) => {
170
+ //println!(
171
+ // "The index of '{}' is {} in 0-based format (add 1 to compare with R output)",
172
+ // sample_name, index
173
+ //);
174
+ sample_index = index;
175
+ }
176
+ None => panic!(
177
+ "Sample '{}' not found in the HDF5 file '{}'",
178
+ sample_name, &hdf5_filename
179
+ ),
180
+ }
181
+ //let data_counts: Array1<_> = ds_counts.read::<f64, Dim<[usize; 1]>>().unwrap();
182
+ //println!("Data_counts: {:?}", data_counts);
183
+ let sample_array: Array2<f64> = ds_counts
184
+ .read_slice_2d((0..gene_names.len(), sample_index..sample_index + 1))
185
+ .unwrap();
186
+ //println!("Length of gene array:{:?}", sample_array.len()); // Please check the result
187
+ input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
188
+ control_indexes.push(global_sample_index);
189
+ global_sample_index += 1;
190
+ }
191
+
192
+ println!("Time for parsing HDF5 data:{:?}", now_counts.elapsed());
193
+ //println!(
194
+ // "case + control length:{}",
195
+ // case_list.len() + control_list.len()
196
+ //);
197
+ //println!("gene_names length:{}", gene_names.len());
198
+ //println!("input_vector length:{}", input_vector.len());
199
+ let dm = DMatrix::from_row_slice(
200
+ case_list.len() + control_list.len(),
201
+ gene_names.len(),
202
+ &input_vector,
203
+ );
204
+ (
205
+ dm.transpose(), // Transposing the matrix
206
+ case_indexes,
207
+ control_indexes,
208
+ gene_names_string,
209
+ gene_symbols_string,
210
+ )
211
+ }
212
+
213
+ fn input_data_from_text(
59
214
  filename: &String,
60
215
  case_list: &Vec<&str>,
61
216
  control_list: &Vec<&str>,
@@ -67,7 +222,6 @@ fn input_data(
67
222
  Vec<String>,
68
223
  ) {
69
224
  let input_time = Instant::now();
70
- //let mut rdr = csv::Reader::from_path(path).unwrap();
71
225
  let mut file = File::open(filename).unwrap();
72
226
  let mut num_lines: usize = 0;
73
227
  let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
@@ -81,7 +235,8 @@ fn input_data(
81
235
  // Check headers for samples
82
236
  let lines: Vec<&str> = buffer.split('\n').collect::<Vec<&str>>();
83
237
  let total_lines = lines.len();
84
- let headers: Vec<&str> = lines[0].split('\t').collect::<Vec<&str>>();
238
+ let header_binding = lines[0].replace("\r", "");
239
+ let headers: Vec<&str> = header_binding.split('\t').collect::<Vec<&str>>();
85
240
  //println!("headers:{:?}", headers);
86
241
  let mut case_indexes_original: Vec<usize> = Vec::with_capacity(case_list.len());
87
242
  let mut control_indexes_original: Vec<usize> = Vec::with_capacity(control_list.len());
@@ -128,7 +283,7 @@ fn input_data(
128
283
  let lines_slice = &lines[..];
129
284
  for line_iter in 1..lines_slice.len() - 1 {
130
285
  // Subtracting 1 from total length of lines_slice because the last one will be empty
131
- let line = lines_slice[line_iter];
286
+ let line = lines_slice[line_iter].replace("\r", "");
132
287
  let mut index = 0;
133
288
  for field in line.split('\t').collect::<Vec<&str>>() {
134
289
  if index == gene_name_index.unwrap() {
@@ -223,7 +378,7 @@ fn input_data(
223
378
  if remainder == thread_num {
224
379
  //println!("buffer:{}", buffer);
225
380
  // Thread analyzing a particular line must have the same remainder as the thread_num, this avoids multiple threads from parsing the same line
226
- let line = lines[line_iter];
381
+ let line = lines[line_iter].replace("\r", "");
227
382
  let mut index = 0;
228
383
  for field in line.split('\t').collect::<Vec<&str>>() {
229
384
  if index == gene_name_index.unwrap() {
@@ -355,6 +510,43 @@ struct PValueIndexes {
355
510
  p_value: f64,
356
511
  }
357
512
 
513
+ // Used to get the sample names from HDF5 file at PP server startup
514
+ fn get_DE_samples(hdf5_filename: &String) {
515
+ let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
516
+ let now_samples = Instant::now();
517
+ let ds_samples = file.dataset("samples").unwrap();
518
+ let samples = ds_samples.read::<VarLenAscii, Dim<[usize; 1]>>().unwrap();
519
+ println!("\tsamples = {:?}", samples);
520
+ println!("\tsamples.shape() = {:?}", samples.shape());
521
+ println!("\tsamples.strides() = {:?}", samples.strides());
522
+ println!("\tsamples.ndim() = {:?}", samples.ndim());
523
+ println!("Time for parsing samples:{:?}", now_samples.elapsed());
524
+
525
+ let mut output_string = "".to_string();
526
+ for i in 0..samples.len() {
527
+ //let item_json = "{\"".to_string()
528
+ // + &samples[i].to_string()
529
+ // + &"\","
530
+ // + &gene_array[i].to_string()
531
+ // + &"}";
532
+
533
+ //let item_json = format!("{{\"{}\"}}", samples[i].to_string());
534
+
535
+ output_string += &format!("{}", samples[i].to_string());
536
+ //println!("item_json:{}", item_json);
537
+
538
+ //let item_json = format!(
539
+ // r##"{{"{}",{}}}"##,
540
+ // samples[i].to_string().replace("\\", ""),
541
+ // gene_array[i].to_string()
542
+ //);
543
+ if i != samples.len() - 1 {
544
+ output_string += &",";
545
+ }
546
+ }
547
+ println!("output_string:{}", output_string);
548
+ }
549
+
358
550
  fn main() {
359
551
  //env::set_var("RUST_BACKTRACE", "full");
360
552
  let mut input = String::new();
@@ -368,28 +560,6 @@ fn main() {
368
560
  match input_json {
369
561
  Ok(json_string) => {
370
562
  let now = Instant::now();
371
- let min_count_option = json_string["min_count"].as_f64().to_owned();
372
- let min_total_count_option = json_string["min_total_count"].as_f64().to_owned();
373
- let min_count;
374
- match min_count_option {
375
- Some(x) => min_count = x,
376
- None => {
377
- panic!("min_count is missing a value")
378
- }
379
- }
380
- let min_total_count;
381
- match min_total_count_option {
382
- Some(x) => min_total_count = x,
383
- None => {
384
- panic!("min_total_count is missing a value")
385
- }
386
- }
387
- let case_string = &json_string["case"].to_owned().as_str().unwrap().to_string();
388
- let control_string = &json_string["control"]
389
- .to_owned()
390
- .as_str()
391
- .unwrap()
392
- .to_string();
393
563
  let file_name = &json_string["input_file"]
394
564
  .to_owned()
395
565
  .as_str()
@@ -397,149 +567,150 @@ fn main() {
397
567
  .to_string()
398
568
  .split(",")
399
569
  .collect();
400
- let case_list: Vec<&str> = case_string.split(",").collect();
401
- let control_list: Vec<&str> = control_string.split(",").collect();
402
- let (input_matrix, case_indexes, control_indexes, gene_names, gene_symbols) =
403
- input_data(file_name, &case_list, &control_list);
404
- let filtering_time = Instant::now();
405
- let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
406
- filter_by_expr(
407
- min_count,
408
- min_total_count,
409
- &input_matrix,
410
- case_indexes.len(),
411
- control_indexes.len(),
412
- gene_names,
413
- gene_symbols,
414
- );
415
- println!("filtering time:{:?}", filtering_time.elapsed());
416
- //println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
417
- //println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
418
- let cpm_normalization_time = Instant::now();
419
- let mut normalized_matrix = cpm(&filtered_matrix);
420
- println!(
421
- "cpm normalization time:{:?}",
422
- cpm_normalization_time.elapsed()
423
- );
424
- let tmm_normalization_time = Instant::now();
425
- let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
426
- println!(
427
- "tmm normalization time:{:?}",
428
- tmm_normalization_time.elapsed()
429
- );
430
- //println!("norm_factors:{:?}", norm_factors);
431
-
432
- for col in 0..normalized_matrix.ncols() {
433
- let norm_factor = norm_factors[col];
434
- for row in 0..normalized_matrix.nrows() {
435
- normalized_matrix[(row, col)] =
436
- normalized_matrix[(row, col)] / norm_factor;
437
- }
438
- }
439
- //println!("normalized_matrix:{:?}", normalized_matrix);
440
- println!("Number of cases:{}", case_list.len());
441
- println!("Number of controls:{}", control_list.len());
442
- println!("Time for pre-processing:{:?}", now.elapsed());
443
- // Using Wilcoxon test for differential gene expression
444
-
445
- let now2 = Instant::now();
446
- let mut p_values: Vec<PValueIndexes> =
447
- Vec::with_capacity(normalized_matrix.nrows());
448
- const THRESHOLD: usize = 50; // This determines whether the Wilcoxon exact test or the normal test will be used based on sample size.
449
-
450
- //println!("case_indexes:{:?}", case_indexes);
451
- //println!("control_indexes:{:?}", control_indexes);
452
- let num_normalized_rows = normalized_matrix.nrows();
453
- if normalized_matrix.nrows() * normalized_matrix.ncols() < PAR_CUTOFF {
454
- for i in 0..normalized_matrix.nrows() {
455
- let row = normalized_matrix.row(i);
456
- //println!("row:{:?}", row);
457
- let mut treated = Vec::<f64>::new();
458
- let mut control = Vec::<f64>::new();
459
- //println!("conditions:{:?}", conditions);
460
- for j in 0..(case_indexes.len() + control_indexes.len()) {
461
- //println!("row[(0, j)]:{}", row[(0, j)]);
462
- if case_indexes.contains(&j) {
463
- treated.push(row[(0, j)]);
464
- //println!("{},{}", input_data_vec.0[i][j], "Diseased");
465
- } else if control_indexes.contains(&j) {
466
- // + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
467
- control.push(row[(0, j)]);
468
- //println!("{},{}", input_data_vec.0[i][j], "Control");
570
+ println!("file_name:{}", file_name);
571
+ let data_type_option = json_string["data_type"].as_str().to_owned();
572
+ match data_type_option {
573
+ Some(x) => {
574
+ if x == "get_samples" {
575
+ get_DE_samples(file_name)
576
+ } else if x == "do_DE" {
577
+ let min_count_option = json_string["min_count"].as_f64().to_owned();
578
+ let min_total_count_option =
579
+ json_string["min_total_count"].as_f64().to_owned();
580
+ let storage_type_option =
581
+ json_string["storage_type"].as_str().to_owned();
582
+ let storage_type;
583
+ match storage_type_option {
584
+ Some(x) => {
585
+ if x == "HDF5" {
586
+ storage_type = "HDF5"
587
+ } else if x == "text" {
588
+ storage_type = "text"
589
+ } else {
590
+ panic!(
591
+ "Unknown storage_type:{}{}",
592
+ x, " Needs to be either HDF5 or text"
593
+ );
594
+ }
595
+ }
596
+ None => panic!("storage_type needs to be HDF5 or text"),
597
+ }
598
+ let min_count;
599
+ match min_count_option {
600
+ Some(x) => min_count = x,
601
+ None => {
602
+ panic!("min_count is missing a value")
603
+ }
604
+ }
605
+ let min_total_count;
606
+ match min_total_count_option {
607
+ Some(x) => min_total_count = x,
608
+ None => {
609
+ panic!("min_total_count is missing a value")
610
+ }
611
+ }
612
+ let case_string =
613
+ &json_string["case"].to_owned().as_str().unwrap().to_string();
614
+ let control_string = &json_string["control"]
615
+ .to_owned()
616
+ .as_str()
617
+ .unwrap()
618
+ .to_string();
619
+ let case_list: Vec<&str> = case_string.split(",").collect();
620
+ let control_list: Vec<&str> = control_string.split(",").collect();
621
+ let (
622
+ input_matrix,
623
+ case_indexes,
624
+ control_indexes,
625
+ gene_names,
626
+ gene_symbols,
627
+ );
628
+ if storage_type == "text" {
629
+ (
630
+ input_matrix,
631
+ case_indexes,
632
+ control_indexes,
633
+ gene_names,
634
+ gene_symbols,
635
+ ) = input_data_from_text(file_name, &case_list, &control_list);
469
636
  } else {
470
- panic!(
471
- "Column {} could not be classified into case/control",
472
- j
473
- );
637
+ // Parsing data from a HDF5 file
638
+ (
639
+ input_matrix,
640
+ case_indexes,
641
+ control_indexes,
642
+ gene_names,
643
+ gene_symbols,
644
+ ) = input_data_from_HDF5(file_name, &case_list, &control_list);
474
645
  }
475
- }
476
- //println!("treated{:?}", treated);
477
- //println!("control{:?}", control);
478
- let p_value = stats_functions::wilcoxon_rank_sum_test(
479
- treated.clone(),
480
- control.clone(),
481
- THRESHOLD,
482
- 't',
483
- true,
484
- ); // Setting continuity correction to true in case of normal approximation
485
- let treated_mean = Data::new(treated).mean();
486
- let control_mean = Data::new(control).mean();
487
- if (treated_mean.unwrap() / control_mean.unwrap())
488
- .log2()
489
- .is_nan()
490
- == false
491
- && (treated_mean.unwrap() / control_mean.unwrap())
492
- .log2()
493
- .is_infinite()
494
- == false
495
- {
496
- p_values.push(PValueIndexes {
497
- index: i,
498
- gene_name: filtered_genes[i].to_owned(),
499
- gene_symbol: filtered_gene_symbols[i].to_owned(),
500
- fold_change: (treated_mean.unwrap() / control_mean.unwrap())
501
- .log2(),
502
- p_value: p_value,
503
- });
504
- }
505
- }
506
- } else {
507
- // Multithreaded implementation of calculating wilcoxon p-values
508
- let normalized_matrix_temp = Arc::new(normalized_matrix);
509
- let filtered_genes_temp = Arc::new(filtered_genes);
510
- let filtered_gene_symbols_temp = Arc::new(filtered_gene_symbols);
511
- let case_indexes_temp = Arc::new(case_indexes);
512
- let control_indexes_temp = Arc::new(control_indexes);
513
- let p_values_temp = Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
514
- let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
515
- for thread_num in 0..max_threads {
516
- let normalized_matrix_temp = Arc::clone(&normalized_matrix_temp);
517
- let case_indexes_temp = Arc::clone(&case_indexes_temp);
518
- let control_indexes_temp = Arc::clone(&control_indexes_temp);
519
- let p_values_temp = Arc::clone(&p_values_temp);
520
- let filtered_genes_temp = Arc::clone(&filtered_genes_temp);
521
- let filtered_gene_symbols_temp =
522
- Arc::clone(&filtered_gene_symbols_temp);
523
- let handle = thread::spawn(move || {
524
- let mut p_values_thread: Vec<PValueIndexes> = Vec::with_capacity(
525
- normalized_matrix_temp.nrows() / max_threads,
646
+ let filtering_time = Instant::now();
647
+ let (
648
+ filtered_matrix,
649
+ lib_sizes,
650
+ filtered_genes,
651
+ filtered_gene_symbols,
652
+ ) = filter_by_expr(
653
+ min_count,
654
+ min_total_count,
655
+ &input_matrix,
656
+ case_indexes.len(),
657
+ control_indexes.len(),
658
+ gene_names,
659
+ gene_symbols,
660
+ );
661
+ println!("filtering time:{:?}", filtering_time.elapsed());
662
+ //println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
663
+ //println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
664
+ let cpm_normalization_time = Instant::now();
665
+ let mut normalized_matrix = cpm(&filtered_matrix);
666
+ println!(
667
+ "cpm normalization time:{:?}",
668
+ cpm_normalization_time.elapsed()
526
669
  );
527
- for i in 0..normalized_matrix_temp.nrows() {
528
- let remainder: usize = i % max_threads; // Calculate remainder of iteration number divided by max_threads to decide which thread parses the row
529
- if remainder == thread_num {
530
- let row = normalized_matrix_temp.row(i);
670
+ let tmm_normalization_time = Instant::now();
671
+ let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
672
+ println!(
673
+ "tmm normalization time:{:?}",
674
+ tmm_normalization_time.elapsed()
675
+ );
676
+ //println!("norm_factors:{:?}", norm_factors);
677
+
678
+ for col in 0..normalized_matrix.ncols() {
679
+ let norm_factor = norm_factors[col];
680
+ for row in 0..normalized_matrix.nrows() {
681
+ normalized_matrix[(row, col)] =
682
+ normalized_matrix[(row, col)] / norm_factor;
683
+ }
684
+ }
685
+ //println!("normalized_matrix:{:?}", normalized_matrix);
686
+ println!("Number of cases:{}", case_list.len());
687
+ println!("Number of controls:{}", control_list.len());
688
+ println!("Time for pre-processing:{:?}", now.elapsed());
689
+ // Using Wilcoxon test for differential gene expression
690
+
691
+ let now2 = Instant::now();
692
+ let mut p_values: Vec<PValueIndexes> =
693
+ Vec::with_capacity(normalized_matrix.nrows());
694
+ const THRESHOLD: usize = 50; // This determines whether the Wilcoxon exact test or the normal test will be used based on sample size.
695
+
696
+ //println!("case_indexes:{:?}", case_indexes);
697
+ //println!("control_indexes:{:?}", control_indexes);
698
+ let num_normalized_rows = normalized_matrix.nrows();
699
+ if normalized_matrix.nrows() * normalized_matrix.ncols()
700
+ < PAR_CUTOFF
701
+ {
702
+ for i in 0..normalized_matrix.nrows() {
703
+ let row = normalized_matrix.row(i);
531
704
  //println!("row:{:?}", row);
532
705
  let mut treated = Vec::<f64>::new();
533
706
  let mut control = Vec::<f64>::new();
534
707
  //println!("conditions:{:?}", conditions);
535
- for j in 0..(case_indexes_temp.len()
536
- + control_indexes_temp.len())
537
- {
708
+ for j in 0..(case_indexes.len() + control_indexes.len()) {
538
709
  //println!("row[(0, j)]:{}", row[(0, j)]);
539
- if case_indexes_temp.contains(&j) {
710
+ if case_indexes.contains(&j) {
540
711
  treated.push(row[(0, j)]);
541
712
  //println!("{},{}", input_data_vec.0[i][j], "Diseased");
542
- } else if control_indexes_temp.contains(&j) {
713
+ } else if control_indexes.contains(&j) {
543
714
  // + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
544
715
  control.push(row[(0, j)]);
545
716
  //println!("{},{}", input_data_vec.0[i][j], "Control");
@@ -570,11 +741,10 @@ fn main() {
570
741
  .is_infinite()
571
742
  == false
572
743
  {
573
- p_values_thread.push(PValueIndexes {
744
+ p_values.push(PValueIndexes {
574
745
  index: i,
575
- gene_name: filtered_genes_temp[i].to_owned(),
576
- gene_symbol: filtered_gene_symbols_temp[i]
577
- .to_owned(),
746
+ gene_name: filtered_genes[i].to_owned(),
747
+ gene_symbol: filtered_gene_symbols[i].to_owned(),
578
748
  fold_change: (treated_mean.unwrap()
579
749
  / control_mean.unwrap())
580
750
  .log2(),
@@ -582,27 +752,126 @@ fn main() {
582
752
  });
583
753
  }
584
754
  }
755
+ } else {
756
+ // Multithreaded implementation of calculating wilcoxon p-values
757
+ let normalized_matrix_temp = Arc::new(normalized_matrix);
758
+ let filtered_genes_temp = Arc::new(filtered_genes);
759
+ let filtered_gene_symbols_temp =
760
+ Arc::new(filtered_gene_symbols);
761
+ let case_indexes_temp = Arc::new(case_indexes);
762
+ let control_indexes_temp = Arc::new(control_indexes);
763
+ let p_values_temp =
764
+ Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
765
+ let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
766
+ for thread_num in 0..max_threads {
767
+ let normalized_matrix_temp =
768
+ Arc::clone(&normalized_matrix_temp);
769
+ let case_indexes_temp = Arc::clone(&case_indexes_temp);
770
+ let control_indexes_temp =
771
+ Arc::clone(&control_indexes_temp);
772
+ let p_values_temp = Arc::clone(&p_values_temp);
773
+ let filtered_genes_temp = Arc::clone(&filtered_genes_temp);
774
+ let filtered_gene_symbols_temp =
775
+ Arc::clone(&filtered_gene_symbols_temp);
776
+ let handle = thread::spawn(move || {
777
+ let mut p_values_thread: Vec<PValueIndexes> =
778
+ Vec::with_capacity(
779
+ normalized_matrix_temp.nrows() / max_threads,
780
+ );
781
+ for i in 0..normalized_matrix_temp.nrows() {
782
+ let remainder: usize = i % max_threads; // Calculate remainder of iteration number divided by max_threads to decide which thread parses the row
783
+ if remainder == thread_num {
784
+ let row = normalized_matrix_temp.row(i);
785
+ //println!("row:{:?}", row);
786
+ let mut treated = Vec::<f64>::new();
787
+ let mut control = Vec::<f64>::new();
788
+ //println!("conditions:{:?}", conditions);
789
+ for j in 0..(case_indexes_temp.len()
790
+ + control_indexes_temp.len())
791
+ {
792
+ //println!("row[(0, j)]:{}", row[(0, j)]);
793
+ if case_indexes_temp.contains(&j) {
794
+ treated.push(row[(0, j)]);
795
+ //println!("{},{}", input_data_vec.0[i][j], "Diseased");
796
+ } else if control_indexes_temp.contains(&j)
797
+ {
798
+ // + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
799
+ control.push(row[(0, j)]);
800
+ //println!("{},{}", input_data_vec.0[i][j], "Control");
801
+ } else {
802
+ panic!(
803
+ "Column {} could not be classified into case/control",
804
+ j
805
+ );
806
+ }
807
+ }
808
+ //println!("treated{:?}", treated);
809
+ //println!("control{:?}", control);
810
+ let p_value =
811
+ stats_functions::wilcoxon_rank_sum_test(
812
+ treated.clone(),
813
+ control.clone(),
814
+ THRESHOLD,
815
+ 't',
816
+ true,
817
+ ); // Setting continuity correction to true in case of normal approximation
818
+ let treated_mean = Data::new(treated).mean();
819
+ let control_mean = Data::new(control).mean();
820
+ if (treated_mean.unwrap()
821
+ / control_mean.unwrap())
822
+ .log2()
823
+ .is_nan()
824
+ == false
825
+ && (treated_mean.unwrap()
826
+ / control_mean.unwrap())
827
+ .log2()
828
+ .is_infinite()
829
+ == false
830
+ {
831
+ p_values_thread.push(PValueIndexes {
832
+ index: i,
833
+ gene_name: filtered_genes_temp[i]
834
+ .to_owned(),
835
+ gene_symbol: filtered_gene_symbols_temp
836
+ [i]
837
+ .to_owned(),
838
+ fold_change: (treated_mean.unwrap()
839
+ / control_mean.unwrap())
840
+ .log2(),
841
+ p_value: p_value,
842
+ });
843
+ }
844
+ }
845
+ }
846
+ p_values_temp
847
+ .lock()
848
+ .unwrap()
849
+ .append(&mut p_values_thread);
850
+ });
851
+ handles.push(handle);
852
+ }
853
+ for handle in handles {
854
+ // Wait for all threads to finish before proceeding further
855
+ handle.join().unwrap();
856
+ }
857
+ p_values.append(&mut *p_values_temp.lock().unwrap());
585
858
  }
586
- p_values_temp.lock().unwrap().append(&mut p_values_thread);
587
- });
588
- handles.push(handle);
859
+ //println!("p_values:{:?}", p_values);
860
+ println!(
861
+ "Time for running {} wilcoxon tests:{:?}",
862
+ num_normalized_rows,
863
+ now2.elapsed()
864
+ );
865
+ let adjusted_p_values = adjust_p_values(p_values);
866
+ println!("adjusted_p_values:{}", adjusted_p_values);
867
+ //let fold_changes =
868
+ // calculate_fold_change(normalized_matrix, case_indexes, control_indexes);
869
+ }
589
870
  }
590
- for handle in handles {
591
- // Wait for all threads to finish before proceeding further
592
- handle.join().unwrap();
871
+ None => {
872
+ panic!("data_type is missing")
593
873
  }
594
- p_values.append(&mut *p_values_temp.lock().unwrap());
595
874
  }
596
- //println!("p_values:{:?}", p_values);
597
- println!(
598
- "Time for running {} wilcoxon tests:{:?}",
599
- num_normalized_rows,
600
- now2.elapsed()
601
- );
602
- let adjusted_p_values = adjust_p_values(p_values);
603
- println!("adjusted_p_values:{}", adjusted_p_values);
604
- //let fold_changes =
605
- // calculate_fold_change(normalized_matrix, case_indexes, control_indexes);
606
875
  }
607
876
  Err(error) => println!("Incorrect json: {}", error),
608
877
  }
@@ -1052,6 +1321,7 @@ fn filter_by_expr(
1052
1321
  positives.push(row);
1053
1322
  }
1054
1323
  }
1324
+ println!("positives length:{}", positives.len());
1055
1325
  //println!("row_sums:{:?}", row_sums);
1056
1326
  //println!("keep_cpm:{:?}", keep_cpm);
1057
1327
  //println!("positive_cpm:{}", positive_cpm);
@@ -1067,12 +1337,17 @@ fn filter_by_expr(
1067
1337
  let mut filtered_genes: Vec<String> = Vec::with_capacity(positives.len());
1068
1338
  let mut filtered_gene_symbols: Vec<String> = Vec::with_capacity(positives.len());
1069
1339
  let mut i = 0;
1340
+ println!("filtered_matrix rows:{}", filtered_matrix.nrows());
1341
+ println!("filtered_matrix cols:{}", filtered_matrix.ncols());
1070
1342
  for index in positives {
1071
1343
  let row = raw_data.row(index);
1072
1344
  filtered_genes.push(gene_names[index].to_owned());
1073
1345
  filtered_gene_symbols.push(gene_symbols[index].to_owned());
1074
1346
  let mut j = 0;
1075
1347
  for item in &row {
1348
+ //println!("index:{}", index);
1349
+ //println!("i:{}", i);
1350
+ //println!("j:{}", j);
1076
1351
  filtered_matrix[(i, j)] = *item;
1077
1352
  j += 1;
1078
1353
  }
package/src/genesetORA.rs CHANGED
@@ -7,6 +7,7 @@ use rusqlite::{Connection, Result};
7
7
  use serde::{Deserialize, Serialize};
8
8
  use serde_json;
9
9
  use std::cmp::Ordering;
10
+ use std::collections::HashSet;
10
11
  use std::io;
11
12
  use std::time::Instant;
12
13
 
@@ -17,15 +18,6 @@ struct GO_pathway {
17
18
  GO_id: String,
18
19
  }
19
20
 
20
- #[allow(non_camel_case_types)]
21
- #[allow(non_snake_case)]
22
- #[derive(Debug)]
23
- struct pathway_genes {
24
- symbol: String,
25
- _ensg: String,
26
- _enstCanonical: String,
27
- }
28
-
29
21
  #[allow(non_camel_case_types)]
30
22
  #[allow(non_snake_case)]
31
23
  #[derive(Debug, Serialize, Deserialize)]
@@ -39,28 +31,27 @@ struct pathway_p_value {
39
31
  }
40
32
 
41
33
  fn calculate_hypergeometric_p_value(
42
- sample_genes: &Vec<&str>,
34
+ sample_genes: &HashSet<String>,
43
35
  num_background_genes: usize,
44
- genes_in_pathway: Vec<pathway_genes>,
36
+ genes_in_pathway: HashSet<String>,
45
37
  ) -> (f64, f64, String) {
46
- let mut matching_sample_genes_counts = 0.0;
47
38
  let mut gene_set_hits: String = "".to_string();
48
- for gene in sample_genes {
49
- for pathway in &genes_in_pathway {
50
- if pathway.symbol == gene.to_string() {
51
- matching_sample_genes_counts += 1.0;
52
- gene_set_hits += &(gene.to_string() + &",");
53
- }
54
- }
39
+
40
+ let gene_intersections: HashSet<String> = genes_in_pathway
41
+ .intersection(sample_genes)
42
+ .cloned()
43
+ .collect();
44
+ for gene in &gene_intersections {
45
+ gene_set_hits += &(gene.to_string() + &",");
55
46
  }
56
47
 
57
- if matching_sample_genes_counts > 0.0 {
48
+ if gene_intersections.len() > 0 {
58
49
  gene_set_hits.pop();
59
50
  }
60
51
 
61
52
  //println!("sample_genes:{:?}", sample_genes);
62
53
  //println!("genes_in_pathway:{:?}", genes_in_pathway);
63
- //println!("k-1:{}", matching_sample_genes_counts - 1.0);
54
+ //println!("k-1:{}", gene_intersection.len() - 1.0);
64
55
  //println!("M:{}", genes_in_pathway.len() as f64);
65
56
  //println!(
66
57
  // "N-M:{}",
@@ -68,7 +59,7 @@ fn calculate_hypergeometric_p_value(
68
59
  //);
69
60
  //println!("n:{}", sample_genes.len() as f64);
70
61
  let p_value = r_mathlib::hypergeometric_cdf(
71
- matching_sample_genes_counts - 1.0,
62
+ gene_intersections.len() as f64 - 1.0,
72
63
  genes_in_pathway.len() as f64,
73
64
  num_background_genes as f64 - genes_in_pathway.len() as f64,
74
65
  sample_genes.len() as f64,
@@ -76,7 +67,7 @@ fn calculate_hypergeometric_p_value(
76
67
  false,
77
68
  );
78
69
  //println!("p_value:{}", p_value);
79
- (p_value, matching_sample_genes_counts, gene_set_hits)
70
+ (p_value, gene_intersections.len() as f64, gene_set_hits)
80
71
  }
81
72
 
82
73
  fn main() -> Result<()> {
@@ -104,8 +95,49 @@ fn main() -> Result<()> {
104
95
  let sample_genes: Vec<&str> =
105
96
  sample_genes_input.as_str().unwrap().split(",").collect();
106
97
  let mut pathway_p_values: Vec<pathway_p_value> = Vec::with_capacity(10000);
98
+
99
+ let genedb_input: &JsonValue = &json_string["genedb"];
100
+ let genedb;
101
+ match genedb_input.as_str() {
102
+ Some(gene_db_string) => genedb = gene_db_string.to_string(),
103
+ None => panic!("genedb file path is missing"),
104
+ }
105
+
106
+ let filter_non_coding_genes_input: &JsonValue =
107
+ &json_string["filter_non_coding_genes"];
108
+ let filter_non_coding_genes: bool =
109
+ filter_non_coding_genes_input.as_bool().unwrap();
110
+
111
+ let genedbconn = Connection::open(genedb)?;
112
+ let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
113
+ let mut num_coding_genes: usize = 0;
114
+ let mut sample_coding_genes: HashSet<String> = HashSet::with_capacity(24000);
115
+ match genedb_result {
116
+ Ok(mut x) => {
117
+ let mut genes = x.query([])?;
118
+ while let Some(coding_gene) = genes.next()? {
119
+ num_coding_genes += 1;
120
+ //println!("coding_gene:{:?}", coding_gene);
121
+ for sample_gene in &sample_genes {
122
+ let code_gene: String = coding_gene.get(0).unwrap();
123
+ if filter_non_coding_genes == true && code_gene == *sample_gene
124
+ {
125
+ sample_coding_genes.insert(code_gene);
126
+ } else if filter_non_coding_genes == false {
127
+ sample_coding_genes.insert(code_gene);
128
+ }
129
+ }
130
+ }
131
+ }
132
+ Err(_) => {}
133
+ }
134
+
135
+ if sample_coding_genes.len() == 0 {
136
+ panic!("All query genes are non-coding");
137
+ }
138
+
107
139
  let background_genes_input: &JsonValue = &json_string["background_genes"];
108
- let mut num_background_genes: usize = 0;
140
+ let num_background_genes;
109
141
  match background_genes_input.as_str() {
110
142
  Some(x) => {
111
143
  let background_genes_str: Vec<&str> = x.split(",").collect(); // Background genes is defined for e.g in case of DE analysis
@@ -114,24 +146,7 @@ fn main() -> Result<()> {
114
146
  None => {
115
147
  // Background genes not present for e.g. in hierarchial clustering
116
148
  // Get background genes from the gene database
117
- let genedb_input: &JsonValue = &json_string["genedb"];
118
- let genedb;
119
- match genedb_input.as_str() {
120
- Some(gene_db_string) => genedb = gene_db_string.to_string(),
121
- None => panic!("genedb file path is missing"),
122
- }
123
-
124
- let genedbconn = Connection::open(genedb)?;
125
- let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
126
- match genedb_result {
127
- Ok(mut x) => {
128
- let mut genes = x.query([])?;
129
- while let Some(_gene) = genes.next()? {
130
- num_background_genes += 1;
131
- }
132
- }
133
- Err(_) => {}
134
- }
149
+ num_background_genes = num_coding_genes;
135
150
  }
136
151
  }
137
152
  //println!("sample_genes:{:?}", sample_genes);
@@ -169,24 +184,17 @@ fn main() -> Result<()> {
169
184
  //println!("gene_stmt:{:?}", gene_stmt);
170
185
 
171
186
  let mut rows = gene_stmt.query([])?;
172
- let mut names = Vec::<pathway_genes>::new();
187
+ let mut names = HashSet::<String>::new();
173
188
  while let Some(row) = rows.next()? {
174
189
  let a: String = row.get(0)?;
175
190
  let input_gene_json = json::parse(&a);
176
191
  match input_gene_json {
177
192
  Ok(json_genes) => {
178
193
  for json_iter in 0..json_genes.len() {
179
- let item = pathway_genes {
180
- symbol: json_genes[json_iter]["symbol"]
181
- .to_string(),
182
- _ensg: json_genes[json_iter]["ensg"]
183
- .to_string(),
184
- _enstCanonical: json_genes[json_iter]
185
- ["enstCanonical"]
194
+ names.insert(
195
+ json_genes[json_iter]["symbol"]
186
196
  .to_string(),
187
- };
188
- //println!("item:{:?}", item);
189
- names.push(item);
197
+ );
190
198
  }
191
199
  }
192
200
  Err(_) => {
@@ -199,7 +207,7 @@ fn main() -> Result<()> {
199
207
  let gene_set_size = names.len();
200
208
  let (p_value, matches, gene_set_hits) =
201
209
  calculate_hypergeometric_p_value(
202
- &sample_genes,
210
+ &sample_coding_genes,
203
211
  num_background_genes,
204
212
  names,
205
213
  );