@sjcrh/proteinpaint-rust 2.148.1 → 2.150.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/DEanalysis.rs CHANGED
@@ -2,15 +2,16 @@
2
2
  // cd .. && cargo build --release && json='{"data_type":"get_samples","input_file":"/Users/rpaul1/pp_data/files/hg38/ALL-pharmacotyping/rnaseq/counts.h5"}' && time echo $json | target/release/DEanalysis
3
3
  // cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
4
4
  #![allow(non_snake_case)]
5
- use hdf5::types::VarLenAscii;
6
5
  use hdf5::File as HDF5File;
6
+ use hdf5::types::VarLenAscii;
7
+ use hdf5::types::VarLenUnicode;
7
8
  use json;
8
- use nalgebra::base::dimension::Const;
9
- use nalgebra::base::dimension::Dyn;
10
- use nalgebra::base::Matrix;
11
- use nalgebra::base::VecStorage;
12
9
  use nalgebra::DMatrix;
13
10
  use nalgebra::ViewStorage;
11
+ use nalgebra::base::Matrix;
12
+ use nalgebra::base::VecStorage;
13
+ use nalgebra::base::dimension::Const;
14
+ use nalgebra::base::dimension::Dyn;
14
15
  //use ndarray::Array1;
15
16
  use ndarray::Array2;
16
17
  use ndarray::Dim;
@@ -70,7 +71,6 @@ fn input_data_from_HDF5(
70
71
  Vec<usize>,
71
72
  Vec<usize>,
72
73
  Vec<String>,
73
- Vec<String>,
74
74
  ) {
75
75
  let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
76
76
 
@@ -86,129 +86,53 @@ fn input_data_from_HDF5(
86
86
  //println!("num_samples bulk:{}", num_samples);
87
87
  //println!("num_genes bulk:{}", num_genes);
88
88
 
89
- //let now_gene_ids = Instant::now();
90
- let ds_gene_ids = file.dataset("gene_ids").unwrap();
91
- //println!("ds_gene_ids:{:?}", ds_gene_ids);
92
- let gene_ids = ds_gene_ids
93
- .read::<VarLenAscii, Dim<[usize; 1]>>()
94
- .unwrap();
95
- //println!("\tgene_ids = {:?}", gene_ids);
96
- //println!("\tgene_ids.shape() = {:?}", gene_ids.shape());
97
- //println!("\tgene_ids.strides() = {:?}", gene_ids.strides());
98
- //println!("\tgene_ids.ndim() = {:?}", gene_ids.ndim());
99
- //println!("Time for parsing gene names:{:?}", now_gene_ids.elapsed());
100
-
101
- //let now_gene_names = Instant::now();
102
- let ds_gene_names = file.dataset("gene_names").unwrap();
103
- //println!("ds_gene_names:{:?}", ds_gene_names);
104
- let gene_names = ds_gene_names
105
- .read::<VarLenAscii, Dim<[usize; 1]>>()
106
- .unwrap();
107
- //println!("\tgene_names = {:?}", gene_names);
108
- //println!("\tgene_names.shape() = {:?}", gene_names.shape());
109
- //println!("\tgene_names.strides() = {:?}", gene_names.strides());
110
- //println!("\tgene_names.ndim() = {:?}", gene_names.ndim());
111
- //println!(
112
- // "Time for parsing gene symbols:{:?}",
113
- // now_gene_names.elapsed()
114
- //);
89
+ // Read the item dataset
90
+ let ds_item = file.dataset("item").unwrap();
91
+ let item = ds_item.read_1d::<VarLenUnicode>().unwrap();
92
+ let gene_names: Vec<String> = item.iter().map(|x| x.to_string()).collect();
115
93
 
116
- let mut gene_ids_string: Vec<String> = Vec::with_capacity(gene_ids.len());
117
- let mut gene_names_string: Vec<String> = Vec::with_capacity(gene_names.len());
118
- for i in 0..gene_ids.len() {
119
- gene_ids_string.push(gene_ids[i].to_string());
120
- gene_names_string.push(gene_names[i].to_string());
121
- }
122
-
123
- //let now_samples = Instant::now();
94
+ // Read the samples dataset
124
95
  let ds_samples = file.dataset("samples").unwrap();
125
- let samples = ds_samples.read::<VarLenAscii, Dim<[usize; 1]>>().unwrap();
126
- //println!("\tsamples = {:?}", samples);
127
- //println!("\tsamples.shape() = {:?}", samples.shape());
128
- //println!("\tsamples.strides() = {:?}", samples.strides());
129
- //println!("\tsamples.ndim() = {:?}", samples.ndim());
130
- //println!("Time for parsing samples:{:?}", now_samples.elapsed());
96
+ let samples = ds_samples.read_1d::<VarLenUnicode>().unwrap();
97
+
98
+ // Read the matrix dataset
99
+ let ds_matrix = file.dataset("matrix").unwrap();
131
100
 
132
- //Find all columns values that are populated for the given gene
133
- //let now_counts = Instant::now();
134
- let ds_counts = file.dataset("counts").unwrap(); // open the dataset
101
+ // Get dimensions from the matrix dataset
102
+ let matrix_shape = ds_matrix.shape();
103
+ let num_genes = matrix_shape[0];
135
104
 
136
105
  let mut global_sample_index = 0;
137
106
  for sample_name in case_list {
138
- let sample_index;
139
- match samples
140
- .iter()
141
- .position(|x| x.to_string() == *sample_name.to_string())
142
- {
143
- Some(index) => {
144
- //println!(
145
- // "The index of '{}' is {} in 0-based format (add 1 to compare with R output)",
146
- // sample_name, index
147
- //);
148
- sample_index = index;
149
- }
150
- None => panic!(
151
- "Sample '{}' not found in the HDF5 file '{}'",
152
- sample_name, &hdf5_filename
153
- ),
107
+ if let Some(sample_index) = samples.iter().position(|x| x.to_string() == *sample_name.to_string()) {
108
+ let sample_array: Array2<f64> = ds_matrix
109
+ .read_slice_2d((0..num_genes, sample_index..sample_index + 1))
110
+ .unwrap();
111
+ input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
112
+ case_indexes.push(global_sample_index);
113
+ global_sample_index += 1;
154
114
  }
155
-
156
- let sample_array: Array2<f64> = ds_counts
157
- .read_slice_2d((0..gene_ids.len(), sample_index..sample_index + 1))
158
- .unwrap();
159
- //println!("Length of gene array:{:?}", sample_array.len()); // Please check the result
160
- input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
161
- case_indexes.push(global_sample_index);
162
- global_sample_index += 1;
115
+ // Skip sample if not found
163
116
  }
164
117
 
165
118
  for sample_name in control_list {
166
- let sample_index;
167
- match samples
168
- .iter()
169
- .position(|x| x.to_string() == *sample_name.to_string())
170
- {
171
- Some(index) => {
172
- //println!(
173
- // "The index of '{}' is {} in 0-based format (add 1 to compare with R output)",
174
- // sample_name, index
175
- //);
176
- sample_index = index;
177
- }
178
- None => panic!(
179
- "Sample '{}' not found in the HDF5 file '{}'",
180
- sample_name, &hdf5_filename
181
- ),
119
+ if let Some(sample_index) = samples.iter().position(|x| x.to_string() == *sample_name.to_string()) {
120
+ let sample_array: Array2<f64> = ds_matrix
121
+ .read_slice_2d((0..num_genes, sample_index..sample_index + 1))
122
+ .unwrap();
123
+ input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
124
+ control_indexes.push(global_sample_index);
125
+ global_sample_index += 1;
182
126
  }
183
- //let data_counts: Array1<_> = ds_counts.read::<f64, Dim<[usize; 1]>>().unwrap();
184
- //println!("Data_counts: {:?}", data_counts);
185
- let sample_array: Array2<f64> = ds_counts
186
- .read_slice_2d((0..gene_ids.len(), sample_index..sample_index + 1))
187
- .unwrap();
188
- //println!("Length of gene array:{:?}", sample_array.len()); // Please check the result
189
- input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
190
- control_indexes.push(global_sample_index);
191
- global_sample_index += 1;
127
+ // Skip sample if not found
192
128
  }
193
129
 
194
- //println!("Time for parsing HDF5 data:{:?}", now_counts.elapsed());
195
- //println!(
196
- // "case + control length:{}",
197
- // case_list.len() + control_list.len()
198
- //);
199
- //println!("gene_ids length:{}", gene_ids.len());
200
- //println!("input_vector length:{}", input_vector.len());
201
- let dm = DMatrix::from_row_slice(
202
- case_list.len() + control_list.len(),
203
- gene_ids.len(),
204
- &input_vector,
205
- );
130
+ let dm = DMatrix::from_row_slice(case_indexes.len() + control_indexes.len(), num_genes, &input_vector);
206
131
  (
207
132
  dm.transpose(), // Transposing the matrix
208
133
  case_indexes,
209
134
  control_indexes,
210
- gene_ids_string,
211
- gene_names_string,
135
+ gene_names,
212
136
  )
213
137
  }
214
138
 
@@ -221,7 +145,6 @@ fn input_data_from_text(
221
145
  Vec<usize>,
222
146
  Vec<usize>,
223
147
  Vec<String>,
224
- Vec<String>,
225
148
  ) {
226
149
  //let input_time = Instant::now();
227
150
  let mut file = File::open(filename).unwrap();
@@ -344,15 +267,14 @@ fn input_data_from_text(
344
267
  let control_indexes_original = Arc::new(control_indexes_original);
345
268
  let buffer = Arc::new(buffer);
346
269
  let case_indexes_temp = Arc::new(Mutex::new(Vec::<usize>::with_capacity(case_list.len())));
347
- let control_indexes_temp =
348
- Arc::new(Mutex::new(Vec::<usize>::with_capacity(control_list.len())));
270
+ let control_indexes_temp = Arc::new(Mutex::new(Vec::<usize>::with_capacity(control_list.len())));
349
271
  let num_lines_temp = Arc::new(Mutex::<usize>::new(0));
350
272
  let num_columns_temp = Arc::new(Mutex::<usize>::new(0));
351
273
  let genes_names_temp = Arc::new(Mutex::new(Vec::<String>::new()));
352
274
  let genes_symbols_temp = Arc::new(Mutex::new(Vec::<String>::new()));
353
275
  let input_vector_temp = Arc::new(Mutex::new(Vec::<f64>::new()));
354
276
  let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
355
- //println!("Number of threads used:{}", max_threads);
277
+ //println!("Number of threads used:{}", max_threads);
356
278
  for thread_num in 0..max_threads {
357
279
  let case_indexes_original = Arc::clone(&case_indexes_original);
358
280
  let control_indexes_original = Arc::clone(&control_indexes_original);
@@ -400,11 +322,11 @@ fn input_data_from_text(
400
322
  }
401
323
  Err(_n) => {
402
324
  panic!(
403
- "Number {} in line {} and column {} is not a decimal number",
404
- field,
405
- num_lines_thread + 1,
406
- index + 1
407
- );
325
+ "Number {} in line {} and column {} is not a decimal number",
326
+ field,
327
+ num_lines_thread + 1,
328
+ index + 1
329
+ );
408
330
  }
409
331
  }
410
332
  } else if binary_search(&control_indexes_original, index) != -1 {
@@ -420,11 +342,11 @@ fn input_data_from_text(
420
342
  }
421
343
  Err(_n) => {
422
344
  panic!(
423
- "Number {} in line {} and column {} is not a decimal number",
424
- field,
425
- num_lines_thread + 1,
426
- index + 1
427
- );
345
+ "Number {} in line {} and column {} is not a decimal number",
346
+ field,
347
+ num_lines_thread + 1,
348
+ index + 1
349
+ );
428
350
  }
429
351
  }
430
352
  }
@@ -433,26 +355,11 @@ fn input_data_from_text(
433
355
  num_lines_thread += 1;
434
356
  }
435
357
  }
436
- input_vector_temp
437
- .lock()
438
- .unwrap()
439
- .append(&mut input_vector_thread);
440
- case_indexes_temp
441
- .lock()
442
- .unwrap()
443
- .append(&mut case_indexes_thread);
444
- control_indexes_temp
445
- .lock()
446
- .unwrap()
447
- .append(&mut control_indexes_thread);
448
- genes_names_temp
449
- .lock()
450
- .unwrap()
451
- .append(&mut genes_names_thread);
452
- genes_symbols_temp
453
- .lock()
454
- .unwrap()
455
- .append(&mut genes_symbols_thread);
358
+ input_vector_temp.lock().unwrap().append(&mut input_vector_thread);
359
+ case_indexes_temp.lock().unwrap().append(&mut case_indexes_thread);
360
+ control_indexes_temp.lock().unwrap().append(&mut control_indexes_thread);
361
+ genes_names_temp.lock().unwrap().append(&mut genes_names_thread);
362
+ genes_symbols_temp.lock().unwrap().append(&mut genes_symbols_thread);
456
363
  *num_lines_temp.lock().unwrap() += num_lines_thread;
457
364
  if num_columns_thread > 0 {
458
365
  *num_columns_temp.lock().unwrap() += num_columns_thread;
@@ -490,14 +397,13 @@ fn input_data_from_text(
490
397
  //println!("Time for inputting data:{:?}", input_time.elapsed());
491
398
  let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
492
399
  //println!("dm:{:?}", dm);
493
- (dm, case_indexes, control_indexes, gene_ids, gene_names)
400
+ (dm, case_indexes, control_indexes, gene_names)
494
401
  }
495
402
 
496
403
  #[allow(dead_code)]
497
404
  #[derive(Debug, Serialize, Deserialize)]
498
405
  struct AdjustedPValueIndexes {
499
406
  index: usize,
500
- gene_id: String,
501
407
  gene_name: String,
502
408
  fold_change: f64,
503
409
  original_p_value: f64,
@@ -506,7 +412,6 @@ struct AdjustedPValueIndexes {
506
412
 
507
413
  struct PValueIndexes {
508
414
  index: usize,
509
- gene_id: String,
510
415
  gene_name: String,
511
416
  fold_change: f64,
512
417
  p_value: f64,
@@ -578,10 +483,8 @@ fn main() {
578
483
  get_DE_samples(file_name)
579
484
  } else if x == "do_DE" {
580
485
  let min_count_option = json_string["min_count"].as_f64().to_owned();
581
- let min_total_count_option =
582
- json_string["min_total_count"].as_f64().to_owned();
583
- let storage_type_option =
584
- json_string["storage_type"].as_str().to_owned();
486
+ let min_total_count_option = json_string["min_total_count"].as_f64().to_owned();
487
+ let storage_type_option = json_string["storage_type"].as_str().to_owned();
585
488
  let storage_type;
586
489
  match storage_type_option {
587
490
  Some(x) => {
@@ -590,10 +493,7 @@ fn main() {
590
493
  } else if x == "text" {
591
494
  storage_type = "text"
592
495
  } else {
593
- panic!(
594
- "Unknown storage_type:{}{}",
595
- x, " Needs to be either HDF5 or text"
596
- );
496
+ panic!("Unknown storage_type:{}{}", x, " Needs to be either HDF5 or text");
597
497
  }
598
498
  }
599
499
  None => panic!("storage_type needs to be HDF5 or text"),
@@ -612,53 +512,26 @@ fn main() {
612
512
  panic!("min_total_count is missing a value")
613
513
  }
614
514
  }
615
- let case_string =
616
- &json_string["case"].to_owned().as_str().unwrap().to_string();
617
- let control_string = &json_string["control"]
618
- .to_owned()
619
- .as_str()
620
- .unwrap()
621
- .to_string();
515
+ let case_string = &json_string["case"].to_owned().as_str().unwrap().to_string();
516
+ let control_string = &json_string["control"].to_owned().as_str().unwrap().to_string();
622
517
  let case_list: Vec<&str> = case_string.split(",").collect();
623
518
  let control_list: Vec<&str> = control_string.split(",").collect();
624
- let (
625
- input_matrix,
626
- case_indexes,
627
- control_indexes,
628
- gene_ids,
629
- gene_names,
630
- );
519
+ let (input_matrix, case_indexes, control_indexes, gene_names);
631
520
  if storage_type == "text" {
632
- (
633
- input_matrix,
634
- case_indexes,
635
- control_indexes,
636
- gene_ids,
637
- gene_names,
638
- ) = input_data_from_text(file_name, &case_list, &control_list);
521
+ (input_matrix, case_indexes, control_indexes, gene_names) =
522
+ input_data_from_text(file_name, &case_list, &control_list);
639
523
  } else {
640
524
  // Parsing data from a HDF5 file
641
- (
642
- input_matrix,
643
- case_indexes,
644
- control_indexes,
645
- gene_ids,
646
- gene_names,
647
- ) = input_data_from_HDF5(file_name, &case_list, &control_list);
525
+ (input_matrix, case_indexes, control_indexes, gene_names) =
526
+ input_data_from_HDF5(file_name, &case_list, &control_list);
648
527
  }
649
528
  //let filtering_time = Instant::now();
650
- let (
651
- filtered_matrix,
652
- lib_sizes,
653
- filtered_genes,
654
- filtered_gene_names,
655
- ) = filter_by_expr(
529
+ let (filtered_matrix, lib_sizes, filtered_gene_names) = filter_by_expr(
656
530
  min_count,
657
531
  min_total_count,
658
532
  &input_matrix,
659
533
  case_indexes.len(),
660
534
  control_indexes.len(),
661
- gene_ids,
662
535
  gene_names,
663
536
  );
664
537
  //println!("filtering time:{:?}", filtering_time.elapsed());
@@ -689,8 +562,7 @@ fn main() {
689
562
  for col in 0..normalized_matrix.ncols() {
690
563
  let norm_factor = norm_factors[col];
691
564
  for row in 0..normalized_matrix.nrows() {
692
- normalized_matrix[(row, col)] =
693
- normalized_matrix[(row, col)] / norm_factor;
565
+ normalized_matrix[(row, col)] = normalized_matrix[(row, col)] / norm_factor;
694
566
  }
695
567
  }
696
568
  //println!("normalized_matrix:{:?}", normalized_matrix);
@@ -700,16 +572,13 @@ fn main() {
700
572
  // Using Wilcoxon test for differential gene expression
701
573
 
702
574
  //let now2 = Instant::now();
703
- let mut p_values: Vec<PValueIndexes> =
704
- Vec::with_capacity(normalized_matrix.nrows());
575
+ let mut p_values: Vec<PValueIndexes> = Vec::with_capacity(normalized_matrix.nrows());
705
576
  const THRESHOLD: usize = 50; // This determines whether the Wilcoxon exact test or the normal test will be used based on sample size.
706
577
 
707
578
  //println!("case_indexes:{:?}", case_indexes);
708
579
  //println!("control_indexes:{:?}", control_indexes);
709
580
  //let num_normalized_rows = normalized_matrix.nrows();
710
- if normalized_matrix.nrows() * normalized_matrix.ncols()
711
- < PAR_CUTOFF
712
- {
581
+ if normalized_matrix.nrows() * normalized_matrix.ncols() < PAR_CUTOFF {
713
582
  for i in 0..normalized_matrix.nrows() {
714
583
  let row = normalized_matrix.row(i);
715
584
  //println!("row:{:?}", row);
@@ -726,10 +595,7 @@ fn main() {
726
595
  control.push(row[(0, j)]);
727
596
  //println!("{},{}", input_data_vec.0[i][j], "Control");
728
597
  } else {
729
- panic!(
730
- "Column {} could not be classified into case/control",
731
- j
732
- );
598
+ panic!("Column {} could not be classified into case/control", j);
733
599
  }
734
600
  }
735
601
  //println!("treated{:?}", treated);
@@ -743,22 +609,14 @@ fn main() {
743
609
  ); // Setting continuity correction to true in case of normal approximation
744
610
  let treated_mean = Data::new(treated).mean();
745
611
  let control_mean = Data::new(control).mean();
746
- if (treated_mean.unwrap() / control_mean.unwrap())
747
- .log2()
748
- .is_nan()
749
- == false
750
- && (treated_mean.unwrap() / control_mean.unwrap())
751
- .log2()
752
- .is_infinite()
612
+ if (treated_mean.unwrap() / control_mean.unwrap()).log2().is_nan() == false
613
+ && (treated_mean.unwrap() / control_mean.unwrap()).log2().is_infinite()
753
614
  == false
754
615
  {
755
616
  p_values.push(PValueIndexes {
756
617
  index: i,
757
- gene_id: filtered_genes[i].to_owned(),
758
618
  gene_name: filtered_gene_names[i].to_owned(),
759
- fold_change: (treated_mean.unwrap()
760
- / control_mean.unwrap())
761
- .log2(),
619
+ fold_change: (treated_mean.unwrap() / control_mean.unwrap()).log2(),
762
620
  p_value: p_value,
763
621
  });
764
622
  }
@@ -766,29 +624,20 @@ fn main() {
766
624
  } else {
767
625
  // Multithreaded implementation of calculating wilcoxon p-values
768
626
  let normalized_matrix_temp = Arc::new(normalized_matrix);
769
- let filtered_genes_temp = Arc::new(filtered_genes);
770
- let filtered_gene_names_temp =
771
- Arc::new(filtered_gene_names);
627
+ let filtered_gene_names_temp = Arc::new(filtered_gene_names);
772
628
  let case_indexes_temp = Arc::new(case_indexes);
773
629
  let control_indexes_temp = Arc::new(control_indexes);
774
- let p_values_temp =
775
- Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
630
+ let p_values_temp = Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
776
631
  let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
777
632
  for thread_num in 0..max_threads {
778
- let normalized_matrix_temp =
779
- Arc::clone(&normalized_matrix_temp);
633
+ let normalized_matrix_temp = Arc::clone(&normalized_matrix_temp);
780
634
  let case_indexes_temp = Arc::clone(&case_indexes_temp);
781
- let control_indexes_temp =
782
- Arc::clone(&control_indexes_temp);
635
+ let control_indexes_temp = Arc::clone(&control_indexes_temp);
783
636
  let p_values_temp = Arc::clone(&p_values_temp);
784
- let filtered_genes_temp = Arc::clone(&filtered_genes_temp);
785
- let filtered_gene_names_temp =
786
- Arc::clone(&filtered_gene_names_temp);
637
+ let filtered_gene_names_temp = Arc::clone(&filtered_gene_names_temp);
787
638
  let handle = thread::spawn(move || {
788
639
  let mut p_values_thread: Vec<PValueIndexes> =
789
- Vec::with_capacity(
790
- normalized_matrix_temp.nrows() / max_threads,
791
- );
640
+ Vec::with_capacity(normalized_matrix_temp.nrows() / max_threads);
792
641
  for i in 0..normalized_matrix_temp.nrows() {
793
642
  let remainder: usize = i % max_threads; // Calculate remainder of iteration number divided by max_threads to decide which thread parses the row
794
643
  if remainder == thread_num {
@@ -797,55 +646,43 @@ fn main() {
797
646
  let mut treated = Vec::<f64>::new();
798
647
  let mut control = Vec::<f64>::new();
799
648
  //println!("conditions:{:?}", conditions);
800
- for j in 0..(case_indexes_temp.len()
801
- + control_indexes_temp.len())
802
- {
649
+ for j in 0..(case_indexes_temp.len() + control_indexes_temp.len()) {
803
650
  //println!("row[(0, j)]:{}", row[(0, j)]);
804
651
  if case_indexes_temp.contains(&j) {
805
652
  treated.push(row[(0, j)]);
806
653
  //println!("{},{}", input_data_vec.0[i][j], "Diseased");
807
- } else if control_indexes_temp.contains(&j)
808
- {
654
+ } else if control_indexes_temp.contains(&j) {
809
655
  // + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
810
656
  control.push(row[(0, j)]);
811
657
  //println!("{},{}", input_data_vec.0[i][j], "Control");
812
658
  } else {
813
659
  panic!(
814
- "Column {} could not be classified into case/control",
815
- j
816
- );
660
+ "Column {} could not be classified into case/control",
661
+ j
662
+ );
817
663
  }
818
664
  }
819
665
  //println!("treated{:?}", treated);
820
666
  //println!("control{:?}", control);
821
- let p_value =
822
- stats_functions::wilcoxon_rank_sum_test(
823
- treated.clone(),
824
- control.clone(),
825
- THRESHOLD,
826
- 't',
827
- true,
828
- ); // Setting continuity correction to true in case of normal approximation
667
+ let p_value = stats_functions::wilcoxon_rank_sum_test(
668
+ treated.clone(),
669
+ control.clone(),
670
+ THRESHOLD,
671
+ 't',
672
+ true,
673
+ ); // Setting continuity correction to true in case of normal approximation
829
674
  let treated_mean = Data::new(treated).mean();
830
675
  let control_mean = Data::new(control).mean();
831
- if (treated_mean.unwrap()
832
- / control_mean.unwrap())
833
- .log2()
834
- .is_nan()
676
+ if (treated_mean.unwrap() / control_mean.unwrap()).log2().is_nan()
835
677
  == false
836
- && (treated_mean.unwrap()
837
- / control_mean.unwrap())
838
- .log2()
839
- .is_infinite()
678
+ && (treated_mean.unwrap() / control_mean.unwrap())
679
+ .log2()
680
+ .is_infinite()
840
681
  == false
841
682
  {
842
683
  p_values_thread.push(PValueIndexes {
843
684
  index: i,
844
- gene_id: filtered_genes_temp[i]
845
- .to_owned(),
846
- gene_name: filtered_gene_names_temp
847
- [i]
848
- .to_owned(),
685
+ gene_name: filtered_gene_names_temp[i].to_owned(),
849
686
  fold_change: (treated_mean.unwrap()
850
687
  / control_mean.unwrap())
851
688
  .log2(),
@@ -854,10 +691,7 @@ fn main() {
854
691
  }
855
692
  }
856
693
  }
857
- p_values_temp
858
- .lock()
859
- .unwrap()
860
- .append(&mut p_values_thread);
694
+ p_values_temp.lock().unwrap().append(&mut p_values_thread);
861
695
  });
862
696
  handles.push(handle);
863
697
  }
@@ -893,22 +727,18 @@ fn main() {
893
727
 
894
728
  fn adjust_p_values(mut original_p_values: Vec<PValueIndexes>) -> String {
895
729
  // Sorting p-values in ascending order
896
- original_p_values.as_mut_slice().sort_by(|a, b| {
897
- (a.p_value)
898
- .partial_cmp(&b.p_value)
899
- .unwrap_or(Ordering::Equal)
900
- });
730
+ original_p_values
731
+ .as_mut_slice()
732
+ .sort_by(|a, b| (a.p_value).partial_cmp(&b.p_value).unwrap_or(Ordering::Equal));
901
733
 
902
- let mut adjusted_p_values: Vec<AdjustedPValueIndexes> =
903
- Vec::with_capacity(original_p_values.len());
734
+ let mut adjusted_p_values: Vec<AdjustedPValueIndexes> = Vec::with_capacity(original_p_values.len());
904
735
  let mut old_p_value: f64 = 0.0;
905
736
  let mut rank: f64 = original_p_values.len() as f64;
906
737
  for j in 0..original_p_values.len() {
907
738
  let i = original_p_values.len() - j - 1;
908
739
 
909
740
  //println!("p_val:{}", p_val);
910
- let mut adjusted_p_val: f64 =
911
- original_p_values[i].p_value * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
741
+ let mut adjusted_p_val: f64 = original_p_values[i].p_value * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
912
742
  if adjusted_p_val > 1.0 {
913
743
  // p_value should NEVER be greater than 1
914
744
  adjusted_p_val = 1.0;
@@ -927,7 +757,6 @@ fn adjust_p_values(mut original_p_values: Vec<PValueIndexes>) -> String {
927
757
  adjusted_p_values.push(AdjustedPValueIndexes {
928
758
  index: original_p_values[i].index,
929
759
  fold_change: original_p_values[i].fold_change,
930
- gene_id: original_p_values[i].gene_id.to_owned(),
931
760
  gene_name: original_p_values[i].gene_name.to_owned(),
932
761
  original_p_value: original_p_values[i].p_value,
933
762
  adjusted_p_value: adjusted_p_val,
@@ -948,18 +777,15 @@ fn adjust_p_values(mut original_p_values: Vec<PValueIndexes>) -> String {
948
777
 
949
778
  #[allow(dead_code)]
950
779
  fn adjust_p_values_bonferroni(original_p_values: Vec<PValueIndexes>) -> Vec<AdjustedPValueIndexes> {
951
- let mut adjusted_p_values: Vec<AdjustedPValueIndexes> =
952
- Vec::with_capacity(original_p_values.len());
780
+ let mut adjusted_p_values: Vec<AdjustedPValueIndexes> = Vec::with_capacity(original_p_values.len());
953
781
  for i in 0..original_p_values.len() {
954
- let mut adjusted_p_value: f64 =
955
- original_p_values[i].p_value * original_p_values.len() as f64; // In bonferroni correction, multiplying p_value by number of tests (excluding those with low sample sizes)
782
+ let mut adjusted_p_value: f64 = original_p_values[i].p_value * original_p_values.len() as f64; // In bonferroni correction, multiplying p_value by number of tests (excluding those with low sample sizes)
956
783
  if adjusted_p_value > 1.0 {
957
784
  // p_value should NEVER be greater than 1
958
785
  adjusted_p_value = 1.0;
959
786
  }
960
787
  adjusted_p_values.push(AdjustedPValueIndexes {
961
788
  index: original_p_values[i].index,
962
- gene_id: original_p_values[i].gene_id.to_owned(),
963
789
  gene_name: original_p_values[i].gene_name.to_owned(),
964
790
  fold_change: original_p_values[i].fold_change,
965
791
  original_p_value: original_p_values[i].p_value,
@@ -970,10 +796,7 @@ fn adjust_p_values_bonferroni(original_p_values: Vec<PValueIndexes>) -> Vec<Adju
970
796
  }
971
797
 
972
798
  // Original TMM normalization source code in edgeR: https://rdrr.io/bioc/edgeR/src/R/calcNormFactors.R
973
- fn tmm_normalization(
974
- input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
975
- lib_sizes: &Vec<f64>,
976
- ) -> Vec<f64> {
799
+ fn tmm_normalization(input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, lib_sizes: &Vec<f64>) -> Vec<f64> {
977
800
  //println!("Unnormalized matrix:{:?}", input_matrix);
978
801
  let f75 = calc_factor_quantile(&input_matrix, lib_sizes);
979
802
  //println!("f75:{:?}", f75);
@@ -1010,12 +833,7 @@ fn tmm_normalization(
1010
833
  for col in 0..input_matrix.ncols() {
1011
834
  let obs_data = input_matrix.column(col);
1012
835
  let obs_lib_size = lib_sizes[col];
1013
- f.push(calc_factor_tmm(
1014
- obs_data,
1015
- &ref_data,
1016
- ref_lib_size,
1017
- obs_lib_size,
1018
- ));
836
+ f.push(calc_factor_tmm(obs_data, &ref_data, ref_lib_size, obs_lib_size));
1019
837
  }
1020
838
  } else {
1021
839
  // Multithreaded implementation of TMM normalization
@@ -1028,8 +846,7 @@ fn tmm_normalization(
1028
846
  let lib_sizes_temp = Arc::clone(&lib_sizes_temp);
1029
847
  let input_matrix_temp = Arc::clone(&input_matrix_temp);
1030
848
  let handle = thread::spawn(move || {
1031
- let mut f_thread: Vec<f_index> =
1032
- Vec::with_capacity(input_matrix_temp.ncols() / max_threads);
849
+ let mut f_thread: Vec<f_index> = Vec::with_capacity(input_matrix_temp.ncols() / max_threads);
1033
850
  let ref_data = input_matrix_temp.column(ref_column);
1034
851
  let ref_lib_size = lib_sizes_temp[ref_column];
1035
852
  for col in 0..input_matrix_temp.ncols() {
@@ -1120,11 +937,7 @@ fn calc_factor_tmm(
1120
937
  let mut num: f64 = 0.0;
1121
938
  let mut den: f64 = 0.0;
1122
939
  for i in 0..log_r.len() {
1123
- if log_r_log[i] >= lo_l
1124
- && log_r_log[i] <= hi_l
1125
- && abs_e_log[i] >= lo_s
1126
- && abs_e_log[i] <= hi_s
1127
- {
940
+ if log_r_log[i] >= lo_l && log_r_log[i] <= hi_l && abs_e_log[i] >= lo_s && abs_e_log[i] <= hi_s {
1128
941
  num += log_r[i] / v[i];
1129
942
  den += 1.0 / v[i];
1130
943
  }
@@ -1252,14 +1065,8 @@ fn filter_by_expr(
1252
1065
  raw_data: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
1253
1066
  num_diseased: usize,
1254
1067
  num_control: usize,
1255
- gene_ids: Vec<String>,
1256
1068
  gene_names: Vec<String>,
1257
- ) -> (
1258
- Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
1259
- Vec<f64>,
1260
- Vec<String>,
1261
- Vec<String>,
1262
- ) {
1069
+ ) -> (Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<f64>, Vec<String>) {
1263
1070
  // Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>
1264
1071
  //const min_count: f64 = 10.0; // Value of constant from R implementation
1265
1072
  //const min_total_count: f64 = 15.0; // Value of constant from R implementation
@@ -1344,14 +1151,12 @@ fn filter_by_expr(
1344
1151
  blank.push(0.0);
1345
1152
  }
1346
1153
  let mut filtered_matrix = DMatrix::from_vec(positives.len(), num_diseased + num_control, blank);
1347
- let mut filtered_genes: Vec<String> = Vec::with_capacity(positives.len());
1348
1154
  let mut filtered_gene_names: Vec<String> = Vec::with_capacity(positives.len());
1349
1155
  let mut i = 0;
1350
1156
  //println!("filtered_matrix rows:{}", filtered_matrix.nrows());
1351
1157
  //println!("filtered_matrix cols:{}", filtered_matrix.ncols());
1352
1158
  for index in positives {
1353
1159
  let row = raw_data.row(index);
1354
- filtered_genes.push(gene_ids[index].to_owned());
1355
1160
  filtered_gene_names.push(gene_names[index].to_owned());
1356
1161
  let mut j = 0;
1357
1162
  for item in &row {
@@ -1372,12 +1177,7 @@ fn filter_by_expr(
1372
1177
  modified_lib_sizes.push(modified_lib_sizes_vector[(0, i)].into());
1373
1178
  }
1374
1179
  //println!("filtered_matrix:{:?}", filtered_matrix);
1375
- (
1376
- filtered_matrix,
1377
- modified_lib_sizes,
1378
- filtered_genes,
1379
- filtered_gene_names,
1380
- )
1180
+ (filtered_matrix, modified_lib_sizes, filtered_gene_names)
1381
1181
  }
1382
1182
 
1383
1183
  fn cpm(
@@ -1393,8 +1193,7 @@ fn cpm(
1393
1193
  for col in 0..input_matrix.ncols() {
1394
1194
  let norm_factor = column_sums[(0, col)];
1395
1195
  for row in 0..input_matrix.nrows() {
1396
- output_matrix[(row, col)] =
1397
- (input_matrix[(row, col)] as f64 * 1000000.0) / norm_factor as f64;
1196
+ output_matrix[(row, col)] = (input_matrix[(row, col)] as f64 * 1000000.0) / norm_factor as f64;
1398
1197
  }
1399
1198
  }
1400
1199
  //println!("output_matrix:{:?}", output_matrix);