@sjcrh/proteinpaint-rust 2.148.1 → 2.150.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +1 -1
- package/README.md +5 -0
- package/package.json +1 -1
- package/src/DEanalysis.rs +110 -311
- package/src/aichatbot.rs +770 -136
- package/src/ollama.rs +1108 -0
- package/src/sjprovider.rs +52 -11
- package/src/test_ai.rs +168 -0
package/src/DEanalysis.rs
CHANGED
|
@@ -2,15 +2,16 @@
|
|
|
2
2
|
// cd .. && cargo build --release && json='{"data_type":"get_samples","input_file":"/Users/rpaul1/pp_data/files/hg38/ALL-pharmacotyping/rnaseq/counts.h5"}' && time echo $json | target/release/DEanalysis
|
|
3
3
|
// cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
|
|
4
4
|
#![allow(non_snake_case)]
|
|
5
|
-
use hdf5::types::VarLenAscii;
|
|
6
5
|
use hdf5::File as HDF5File;
|
|
6
|
+
use hdf5::types::VarLenAscii;
|
|
7
|
+
use hdf5::types::VarLenUnicode;
|
|
7
8
|
use json;
|
|
8
|
-
use nalgebra::base::dimension::Const;
|
|
9
|
-
use nalgebra::base::dimension::Dyn;
|
|
10
|
-
use nalgebra::base::Matrix;
|
|
11
|
-
use nalgebra::base::VecStorage;
|
|
12
9
|
use nalgebra::DMatrix;
|
|
13
10
|
use nalgebra::ViewStorage;
|
|
11
|
+
use nalgebra::base::Matrix;
|
|
12
|
+
use nalgebra::base::VecStorage;
|
|
13
|
+
use nalgebra::base::dimension::Const;
|
|
14
|
+
use nalgebra::base::dimension::Dyn;
|
|
14
15
|
//use ndarray::Array1;
|
|
15
16
|
use ndarray::Array2;
|
|
16
17
|
use ndarray::Dim;
|
|
@@ -70,7 +71,6 @@ fn input_data_from_HDF5(
|
|
|
70
71
|
Vec<usize>,
|
|
71
72
|
Vec<usize>,
|
|
72
73
|
Vec<String>,
|
|
73
|
-
Vec<String>,
|
|
74
74
|
) {
|
|
75
75
|
let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
|
|
76
76
|
|
|
@@ -86,129 +86,53 @@ fn input_data_from_HDF5(
|
|
|
86
86
|
//println!("num_samples bulk:{}", num_samples);
|
|
87
87
|
//println!("num_genes bulk:{}", num_genes);
|
|
88
88
|
|
|
89
|
-
//
|
|
90
|
-
let
|
|
91
|
-
|
|
92
|
-
let
|
|
93
|
-
.read::<VarLenAscii, Dim<[usize; 1]>>()
|
|
94
|
-
.unwrap();
|
|
95
|
-
//println!("\tgene_ids = {:?}", gene_ids);
|
|
96
|
-
//println!("\tgene_ids.shape() = {:?}", gene_ids.shape());
|
|
97
|
-
//println!("\tgene_ids.strides() = {:?}", gene_ids.strides());
|
|
98
|
-
//println!("\tgene_ids.ndim() = {:?}", gene_ids.ndim());
|
|
99
|
-
//println!("Time for parsing gene names:{:?}", now_gene_ids.elapsed());
|
|
100
|
-
|
|
101
|
-
//let now_gene_names = Instant::now();
|
|
102
|
-
let ds_gene_names = file.dataset("gene_names").unwrap();
|
|
103
|
-
//println!("ds_gene_names:{:?}", ds_gene_names);
|
|
104
|
-
let gene_names = ds_gene_names
|
|
105
|
-
.read::<VarLenAscii, Dim<[usize; 1]>>()
|
|
106
|
-
.unwrap();
|
|
107
|
-
//println!("\tgene_names = {:?}", gene_names);
|
|
108
|
-
//println!("\tgene_names.shape() = {:?}", gene_names.shape());
|
|
109
|
-
//println!("\tgene_names.strides() = {:?}", gene_names.strides());
|
|
110
|
-
//println!("\tgene_names.ndim() = {:?}", gene_names.ndim());
|
|
111
|
-
//println!(
|
|
112
|
-
// "Time for parsing gene symbols:{:?}",
|
|
113
|
-
// now_gene_names.elapsed()
|
|
114
|
-
//);
|
|
89
|
+
// Read the item dataset
|
|
90
|
+
let ds_item = file.dataset("item").unwrap();
|
|
91
|
+
let item = ds_item.read_1d::<VarLenUnicode>().unwrap();
|
|
92
|
+
let gene_names: Vec<String> = item.iter().map(|x| x.to_string()).collect();
|
|
115
93
|
|
|
116
|
-
|
|
117
|
-
let mut gene_names_string: Vec<String> = Vec::with_capacity(gene_names.len());
|
|
118
|
-
for i in 0..gene_ids.len() {
|
|
119
|
-
gene_ids_string.push(gene_ids[i].to_string());
|
|
120
|
-
gene_names_string.push(gene_names[i].to_string());
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
//let now_samples = Instant::now();
|
|
94
|
+
// Read the samples dataset
|
|
124
95
|
let ds_samples = file.dataset("samples").unwrap();
|
|
125
|
-
let samples = ds_samples.
|
|
126
|
-
|
|
127
|
-
//
|
|
128
|
-
|
|
129
|
-
//println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
130
|
-
//println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
96
|
+
let samples = ds_samples.read_1d::<VarLenUnicode>().unwrap();
|
|
97
|
+
|
|
98
|
+
// Read the matrix dataset
|
|
99
|
+
let ds_matrix = file.dataset("matrix").unwrap();
|
|
131
100
|
|
|
132
|
-
//
|
|
133
|
-
|
|
134
|
-
let
|
|
101
|
+
// Get dimensions from the matrix dataset
|
|
102
|
+
let matrix_shape = ds_matrix.shape();
|
|
103
|
+
let num_genes = matrix_shape[0];
|
|
135
104
|
|
|
136
105
|
let mut global_sample_index = 0;
|
|
137
106
|
for sample_name in case_list {
|
|
138
|
-
let sample_index
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
// "The index of '{}' is {} in 0-based format (add 1 to compare with R output)",
|
|
146
|
-
// sample_name, index
|
|
147
|
-
//);
|
|
148
|
-
sample_index = index;
|
|
149
|
-
}
|
|
150
|
-
None => panic!(
|
|
151
|
-
"Sample '{}' not found in the HDF5 file '{}'",
|
|
152
|
-
sample_name, &hdf5_filename
|
|
153
|
-
),
|
|
107
|
+
if let Some(sample_index) = samples.iter().position(|x| x.to_string() == *sample_name.to_string()) {
|
|
108
|
+
let sample_array: Array2<f64> = ds_matrix
|
|
109
|
+
.read_slice_2d((0..num_genes, sample_index..sample_index + 1))
|
|
110
|
+
.unwrap();
|
|
111
|
+
input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
|
|
112
|
+
case_indexes.push(global_sample_index);
|
|
113
|
+
global_sample_index += 1;
|
|
154
114
|
}
|
|
155
|
-
|
|
156
|
-
let sample_array: Array2<f64> = ds_counts
|
|
157
|
-
.read_slice_2d((0..gene_ids.len(), sample_index..sample_index + 1))
|
|
158
|
-
.unwrap();
|
|
159
|
-
//println!("Length of gene array:{:?}", sample_array.len()); // Please check the result
|
|
160
|
-
input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
|
|
161
|
-
case_indexes.push(global_sample_index);
|
|
162
|
-
global_sample_index += 1;
|
|
115
|
+
// Skip sample if not found
|
|
163
116
|
}
|
|
164
117
|
|
|
165
118
|
for sample_name in control_list {
|
|
166
|
-
let sample_index
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
// "The index of '{}' is {} in 0-based format (add 1 to compare with R output)",
|
|
174
|
-
// sample_name, index
|
|
175
|
-
//);
|
|
176
|
-
sample_index = index;
|
|
177
|
-
}
|
|
178
|
-
None => panic!(
|
|
179
|
-
"Sample '{}' not found in the HDF5 file '{}'",
|
|
180
|
-
sample_name, &hdf5_filename
|
|
181
|
-
),
|
|
119
|
+
if let Some(sample_index) = samples.iter().position(|x| x.to_string() == *sample_name.to_string()) {
|
|
120
|
+
let sample_array: Array2<f64> = ds_matrix
|
|
121
|
+
.read_slice_2d((0..num_genes, sample_index..sample_index + 1))
|
|
122
|
+
.unwrap();
|
|
123
|
+
input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
|
|
124
|
+
control_indexes.push(global_sample_index);
|
|
125
|
+
global_sample_index += 1;
|
|
182
126
|
}
|
|
183
|
-
//
|
|
184
|
-
//println!("Data_counts: {:?}", data_counts);
|
|
185
|
-
let sample_array: Array2<f64> = ds_counts
|
|
186
|
-
.read_slice_2d((0..gene_ids.len(), sample_index..sample_index + 1))
|
|
187
|
-
.unwrap();
|
|
188
|
-
//println!("Length of gene array:{:?}", sample_array.len()); // Please check the result
|
|
189
|
-
input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
|
|
190
|
-
control_indexes.push(global_sample_index);
|
|
191
|
-
global_sample_index += 1;
|
|
127
|
+
// Skip sample if not found
|
|
192
128
|
}
|
|
193
129
|
|
|
194
|
-
|
|
195
|
-
//println!(
|
|
196
|
-
// "case + control length:{}",
|
|
197
|
-
// case_list.len() + control_list.len()
|
|
198
|
-
//);
|
|
199
|
-
//println!("gene_ids length:{}", gene_ids.len());
|
|
200
|
-
//println!("input_vector length:{}", input_vector.len());
|
|
201
|
-
let dm = DMatrix::from_row_slice(
|
|
202
|
-
case_list.len() + control_list.len(),
|
|
203
|
-
gene_ids.len(),
|
|
204
|
-
&input_vector,
|
|
205
|
-
);
|
|
130
|
+
let dm = DMatrix::from_row_slice(case_indexes.len() + control_indexes.len(), num_genes, &input_vector);
|
|
206
131
|
(
|
|
207
132
|
dm.transpose(), // Transposing the matrix
|
|
208
133
|
case_indexes,
|
|
209
134
|
control_indexes,
|
|
210
|
-
|
|
211
|
-
gene_names_string,
|
|
135
|
+
gene_names,
|
|
212
136
|
)
|
|
213
137
|
}
|
|
214
138
|
|
|
@@ -221,7 +145,6 @@ fn input_data_from_text(
|
|
|
221
145
|
Vec<usize>,
|
|
222
146
|
Vec<usize>,
|
|
223
147
|
Vec<String>,
|
|
224
|
-
Vec<String>,
|
|
225
148
|
) {
|
|
226
149
|
//let input_time = Instant::now();
|
|
227
150
|
let mut file = File::open(filename).unwrap();
|
|
@@ -344,15 +267,14 @@ fn input_data_from_text(
|
|
|
344
267
|
let control_indexes_original = Arc::new(control_indexes_original);
|
|
345
268
|
let buffer = Arc::new(buffer);
|
|
346
269
|
let case_indexes_temp = Arc::new(Mutex::new(Vec::<usize>::with_capacity(case_list.len())));
|
|
347
|
-
let control_indexes_temp =
|
|
348
|
-
Arc::new(Mutex::new(Vec::<usize>::with_capacity(control_list.len())));
|
|
270
|
+
let control_indexes_temp = Arc::new(Mutex::new(Vec::<usize>::with_capacity(control_list.len())));
|
|
349
271
|
let num_lines_temp = Arc::new(Mutex::<usize>::new(0));
|
|
350
272
|
let num_columns_temp = Arc::new(Mutex::<usize>::new(0));
|
|
351
273
|
let genes_names_temp = Arc::new(Mutex::new(Vec::<String>::new()));
|
|
352
274
|
let genes_symbols_temp = Arc::new(Mutex::new(Vec::<String>::new()));
|
|
353
275
|
let input_vector_temp = Arc::new(Mutex::new(Vec::<f64>::new()));
|
|
354
276
|
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
355
|
-
|
|
277
|
+
//println!("Number of threads used:{}", max_threads);
|
|
356
278
|
for thread_num in 0..max_threads {
|
|
357
279
|
let case_indexes_original = Arc::clone(&case_indexes_original);
|
|
358
280
|
let control_indexes_original = Arc::clone(&control_indexes_original);
|
|
@@ -400,11 +322,11 @@ fn input_data_from_text(
|
|
|
400
322
|
}
|
|
401
323
|
Err(_n) => {
|
|
402
324
|
panic!(
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
325
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
326
|
+
field,
|
|
327
|
+
num_lines_thread + 1,
|
|
328
|
+
index + 1
|
|
329
|
+
);
|
|
408
330
|
}
|
|
409
331
|
}
|
|
410
332
|
} else if binary_search(&control_indexes_original, index) != -1 {
|
|
@@ -420,11 +342,11 @@ fn input_data_from_text(
|
|
|
420
342
|
}
|
|
421
343
|
Err(_n) => {
|
|
422
344
|
panic!(
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
345
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
346
|
+
field,
|
|
347
|
+
num_lines_thread + 1,
|
|
348
|
+
index + 1
|
|
349
|
+
);
|
|
428
350
|
}
|
|
429
351
|
}
|
|
430
352
|
}
|
|
@@ -433,26 +355,11 @@ fn input_data_from_text(
|
|
|
433
355
|
num_lines_thread += 1;
|
|
434
356
|
}
|
|
435
357
|
}
|
|
436
|
-
input_vector_temp
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
.lock()
|
|
442
|
-
.unwrap()
|
|
443
|
-
.append(&mut case_indexes_thread);
|
|
444
|
-
control_indexes_temp
|
|
445
|
-
.lock()
|
|
446
|
-
.unwrap()
|
|
447
|
-
.append(&mut control_indexes_thread);
|
|
448
|
-
genes_names_temp
|
|
449
|
-
.lock()
|
|
450
|
-
.unwrap()
|
|
451
|
-
.append(&mut genes_names_thread);
|
|
452
|
-
genes_symbols_temp
|
|
453
|
-
.lock()
|
|
454
|
-
.unwrap()
|
|
455
|
-
.append(&mut genes_symbols_thread);
|
|
358
|
+
input_vector_temp.lock().unwrap().append(&mut input_vector_thread);
|
|
359
|
+
case_indexes_temp.lock().unwrap().append(&mut case_indexes_thread);
|
|
360
|
+
control_indexes_temp.lock().unwrap().append(&mut control_indexes_thread);
|
|
361
|
+
genes_names_temp.lock().unwrap().append(&mut genes_names_thread);
|
|
362
|
+
genes_symbols_temp.lock().unwrap().append(&mut genes_symbols_thread);
|
|
456
363
|
*num_lines_temp.lock().unwrap() += num_lines_thread;
|
|
457
364
|
if num_columns_thread > 0 {
|
|
458
365
|
*num_columns_temp.lock().unwrap() += num_columns_thread;
|
|
@@ -490,14 +397,13 @@ fn input_data_from_text(
|
|
|
490
397
|
//println!("Time for inputting data:{:?}", input_time.elapsed());
|
|
491
398
|
let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
|
|
492
399
|
//println!("dm:{:?}", dm);
|
|
493
|
-
(dm, case_indexes, control_indexes,
|
|
400
|
+
(dm, case_indexes, control_indexes, gene_names)
|
|
494
401
|
}
|
|
495
402
|
|
|
496
403
|
#[allow(dead_code)]
|
|
497
404
|
#[derive(Debug, Serialize, Deserialize)]
|
|
498
405
|
struct AdjustedPValueIndexes {
|
|
499
406
|
index: usize,
|
|
500
|
-
gene_id: String,
|
|
501
407
|
gene_name: String,
|
|
502
408
|
fold_change: f64,
|
|
503
409
|
original_p_value: f64,
|
|
@@ -506,7 +412,6 @@ struct AdjustedPValueIndexes {
|
|
|
506
412
|
|
|
507
413
|
struct PValueIndexes {
|
|
508
414
|
index: usize,
|
|
509
|
-
gene_id: String,
|
|
510
415
|
gene_name: String,
|
|
511
416
|
fold_change: f64,
|
|
512
417
|
p_value: f64,
|
|
@@ -578,10 +483,8 @@ fn main() {
|
|
|
578
483
|
get_DE_samples(file_name)
|
|
579
484
|
} else if x == "do_DE" {
|
|
580
485
|
let min_count_option = json_string["min_count"].as_f64().to_owned();
|
|
581
|
-
let min_total_count_option =
|
|
582
|
-
|
|
583
|
-
let storage_type_option =
|
|
584
|
-
json_string["storage_type"].as_str().to_owned();
|
|
486
|
+
let min_total_count_option = json_string["min_total_count"].as_f64().to_owned();
|
|
487
|
+
let storage_type_option = json_string["storage_type"].as_str().to_owned();
|
|
585
488
|
let storage_type;
|
|
586
489
|
match storage_type_option {
|
|
587
490
|
Some(x) => {
|
|
@@ -590,10 +493,7 @@ fn main() {
|
|
|
590
493
|
} else if x == "text" {
|
|
591
494
|
storage_type = "text"
|
|
592
495
|
} else {
|
|
593
|
-
panic!(
|
|
594
|
-
"Unknown storage_type:{}{}",
|
|
595
|
-
x, " Needs to be either HDF5 or text"
|
|
596
|
-
);
|
|
496
|
+
panic!("Unknown storage_type:{}{}", x, " Needs to be either HDF5 or text");
|
|
597
497
|
}
|
|
598
498
|
}
|
|
599
499
|
None => panic!("storage_type needs to be HDF5 or text"),
|
|
@@ -612,53 +512,26 @@ fn main() {
|
|
|
612
512
|
panic!("min_total_count is missing a value")
|
|
613
513
|
}
|
|
614
514
|
}
|
|
615
|
-
let case_string =
|
|
616
|
-
|
|
617
|
-
let control_string = &json_string["control"]
|
|
618
|
-
.to_owned()
|
|
619
|
-
.as_str()
|
|
620
|
-
.unwrap()
|
|
621
|
-
.to_string();
|
|
515
|
+
let case_string = &json_string["case"].to_owned().as_str().unwrap().to_string();
|
|
516
|
+
let control_string = &json_string["control"].to_owned().as_str().unwrap().to_string();
|
|
622
517
|
let case_list: Vec<&str> = case_string.split(",").collect();
|
|
623
518
|
let control_list: Vec<&str> = control_string.split(",").collect();
|
|
624
|
-
let (
|
|
625
|
-
input_matrix,
|
|
626
|
-
case_indexes,
|
|
627
|
-
control_indexes,
|
|
628
|
-
gene_ids,
|
|
629
|
-
gene_names,
|
|
630
|
-
);
|
|
519
|
+
let (input_matrix, case_indexes, control_indexes, gene_names);
|
|
631
520
|
if storage_type == "text" {
|
|
632
|
-
(
|
|
633
|
-
|
|
634
|
-
case_indexes,
|
|
635
|
-
control_indexes,
|
|
636
|
-
gene_ids,
|
|
637
|
-
gene_names,
|
|
638
|
-
) = input_data_from_text(file_name, &case_list, &control_list);
|
|
521
|
+
(input_matrix, case_indexes, control_indexes, gene_names) =
|
|
522
|
+
input_data_from_text(file_name, &case_list, &control_list);
|
|
639
523
|
} else {
|
|
640
524
|
// Parsing data from a HDF5 file
|
|
641
|
-
(
|
|
642
|
-
|
|
643
|
-
case_indexes,
|
|
644
|
-
control_indexes,
|
|
645
|
-
gene_ids,
|
|
646
|
-
gene_names,
|
|
647
|
-
) = input_data_from_HDF5(file_name, &case_list, &control_list);
|
|
525
|
+
(input_matrix, case_indexes, control_indexes, gene_names) =
|
|
526
|
+
input_data_from_HDF5(file_name, &case_list, &control_list);
|
|
648
527
|
}
|
|
649
528
|
//let filtering_time = Instant::now();
|
|
650
|
-
let (
|
|
651
|
-
filtered_matrix,
|
|
652
|
-
lib_sizes,
|
|
653
|
-
filtered_genes,
|
|
654
|
-
filtered_gene_names,
|
|
655
|
-
) = filter_by_expr(
|
|
529
|
+
let (filtered_matrix, lib_sizes, filtered_gene_names) = filter_by_expr(
|
|
656
530
|
min_count,
|
|
657
531
|
min_total_count,
|
|
658
532
|
&input_matrix,
|
|
659
533
|
case_indexes.len(),
|
|
660
534
|
control_indexes.len(),
|
|
661
|
-
gene_ids,
|
|
662
535
|
gene_names,
|
|
663
536
|
);
|
|
664
537
|
//println!("filtering time:{:?}", filtering_time.elapsed());
|
|
@@ -689,8 +562,7 @@ fn main() {
|
|
|
689
562
|
for col in 0..normalized_matrix.ncols() {
|
|
690
563
|
let norm_factor = norm_factors[col];
|
|
691
564
|
for row in 0..normalized_matrix.nrows() {
|
|
692
|
-
normalized_matrix[(row, col)] =
|
|
693
|
-
normalized_matrix[(row, col)] / norm_factor;
|
|
565
|
+
normalized_matrix[(row, col)] = normalized_matrix[(row, col)] / norm_factor;
|
|
694
566
|
}
|
|
695
567
|
}
|
|
696
568
|
//println!("normalized_matrix:{:?}", normalized_matrix);
|
|
@@ -700,16 +572,13 @@ fn main() {
|
|
|
700
572
|
// Using Wilcoxon test for differential gene expression
|
|
701
573
|
|
|
702
574
|
//let now2 = Instant::now();
|
|
703
|
-
let mut p_values: Vec<PValueIndexes> =
|
|
704
|
-
Vec::with_capacity(normalized_matrix.nrows());
|
|
575
|
+
let mut p_values: Vec<PValueIndexes> = Vec::with_capacity(normalized_matrix.nrows());
|
|
705
576
|
const THRESHOLD: usize = 50; // This determines whether the Wilcoxon exact test or the normal test will be used based on sample size.
|
|
706
577
|
|
|
707
578
|
//println!("case_indexes:{:?}", case_indexes);
|
|
708
579
|
//println!("control_indexes:{:?}", control_indexes);
|
|
709
580
|
//let num_normalized_rows = normalized_matrix.nrows();
|
|
710
|
-
if normalized_matrix.nrows() * normalized_matrix.ncols()
|
|
711
|
-
< PAR_CUTOFF
|
|
712
|
-
{
|
|
581
|
+
if normalized_matrix.nrows() * normalized_matrix.ncols() < PAR_CUTOFF {
|
|
713
582
|
for i in 0..normalized_matrix.nrows() {
|
|
714
583
|
let row = normalized_matrix.row(i);
|
|
715
584
|
//println!("row:{:?}", row);
|
|
@@ -726,10 +595,7 @@ fn main() {
|
|
|
726
595
|
control.push(row[(0, j)]);
|
|
727
596
|
//println!("{},{}", input_data_vec.0[i][j], "Control");
|
|
728
597
|
} else {
|
|
729
|
-
panic!(
|
|
730
|
-
"Column {} could not be classified into case/control",
|
|
731
|
-
j
|
|
732
|
-
);
|
|
598
|
+
panic!("Column {} could not be classified into case/control", j);
|
|
733
599
|
}
|
|
734
600
|
}
|
|
735
601
|
//println!("treated{:?}", treated);
|
|
@@ -743,22 +609,14 @@ fn main() {
|
|
|
743
609
|
); // Setting continuity correction to true in case of normal approximation
|
|
744
610
|
let treated_mean = Data::new(treated).mean();
|
|
745
611
|
let control_mean = Data::new(control).mean();
|
|
746
|
-
if (treated_mean.unwrap() / control_mean.unwrap())
|
|
747
|
-
.log2()
|
|
748
|
-
.is_nan()
|
|
749
|
-
== false
|
|
750
|
-
&& (treated_mean.unwrap() / control_mean.unwrap())
|
|
751
|
-
.log2()
|
|
752
|
-
.is_infinite()
|
|
612
|
+
if (treated_mean.unwrap() / control_mean.unwrap()).log2().is_nan() == false
|
|
613
|
+
&& (treated_mean.unwrap() / control_mean.unwrap()).log2().is_infinite()
|
|
753
614
|
== false
|
|
754
615
|
{
|
|
755
616
|
p_values.push(PValueIndexes {
|
|
756
617
|
index: i,
|
|
757
|
-
gene_id: filtered_genes[i].to_owned(),
|
|
758
618
|
gene_name: filtered_gene_names[i].to_owned(),
|
|
759
|
-
fold_change: (treated_mean.unwrap()
|
|
760
|
-
/ control_mean.unwrap())
|
|
761
|
-
.log2(),
|
|
619
|
+
fold_change: (treated_mean.unwrap() / control_mean.unwrap()).log2(),
|
|
762
620
|
p_value: p_value,
|
|
763
621
|
});
|
|
764
622
|
}
|
|
@@ -766,29 +624,20 @@ fn main() {
|
|
|
766
624
|
} else {
|
|
767
625
|
// Multithreaded implementation of calculating wilcoxon p-values
|
|
768
626
|
let normalized_matrix_temp = Arc::new(normalized_matrix);
|
|
769
|
-
let
|
|
770
|
-
let filtered_gene_names_temp =
|
|
771
|
-
Arc::new(filtered_gene_names);
|
|
627
|
+
let filtered_gene_names_temp = Arc::new(filtered_gene_names);
|
|
772
628
|
let case_indexes_temp = Arc::new(case_indexes);
|
|
773
629
|
let control_indexes_temp = Arc::new(control_indexes);
|
|
774
|
-
let p_values_temp =
|
|
775
|
-
Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
|
|
630
|
+
let p_values_temp = Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
|
|
776
631
|
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
777
632
|
for thread_num in 0..max_threads {
|
|
778
|
-
let normalized_matrix_temp =
|
|
779
|
-
Arc::clone(&normalized_matrix_temp);
|
|
633
|
+
let normalized_matrix_temp = Arc::clone(&normalized_matrix_temp);
|
|
780
634
|
let case_indexes_temp = Arc::clone(&case_indexes_temp);
|
|
781
|
-
let control_indexes_temp =
|
|
782
|
-
Arc::clone(&control_indexes_temp);
|
|
635
|
+
let control_indexes_temp = Arc::clone(&control_indexes_temp);
|
|
783
636
|
let p_values_temp = Arc::clone(&p_values_temp);
|
|
784
|
-
let
|
|
785
|
-
let filtered_gene_names_temp =
|
|
786
|
-
Arc::clone(&filtered_gene_names_temp);
|
|
637
|
+
let filtered_gene_names_temp = Arc::clone(&filtered_gene_names_temp);
|
|
787
638
|
let handle = thread::spawn(move || {
|
|
788
639
|
let mut p_values_thread: Vec<PValueIndexes> =
|
|
789
|
-
Vec::with_capacity(
|
|
790
|
-
normalized_matrix_temp.nrows() / max_threads,
|
|
791
|
-
);
|
|
640
|
+
Vec::with_capacity(normalized_matrix_temp.nrows() / max_threads);
|
|
792
641
|
for i in 0..normalized_matrix_temp.nrows() {
|
|
793
642
|
let remainder: usize = i % max_threads; // Calculate remainder of iteration number divided by max_threads to decide which thread parses the row
|
|
794
643
|
if remainder == thread_num {
|
|
@@ -797,55 +646,43 @@ fn main() {
|
|
|
797
646
|
let mut treated = Vec::<f64>::new();
|
|
798
647
|
let mut control = Vec::<f64>::new();
|
|
799
648
|
//println!("conditions:{:?}", conditions);
|
|
800
|
-
for j in 0..(case_indexes_temp.len()
|
|
801
|
-
+ control_indexes_temp.len())
|
|
802
|
-
{
|
|
649
|
+
for j in 0..(case_indexes_temp.len() + control_indexes_temp.len()) {
|
|
803
650
|
//println!("row[(0, j)]:{}", row[(0, j)]);
|
|
804
651
|
if case_indexes_temp.contains(&j) {
|
|
805
652
|
treated.push(row[(0, j)]);
|
|
806
653
|
//println!("{},{}", input_data_vec.0[i][j], "Diseased");
|
|
807
|
-
} else if control_indexes_temp.contains(&j)
|
|
808
|
-
{
|
|
654
|
+
} else if control_indexes_temp.contains(&j) {
|
|
809
655
|
// + 1 was added because in the input file the first column of the first row is blank as the first column consists of gene names
|
|
810
656
|
control.push(row[(0, j)]);
|
|
811
657
|
//println!("{},{}", input_data_vec.0[i][j], "Control");
|
|
812
658
|
} else {
|
|
813
659
|
panic!(
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
660
|
+
"Column {} could not be classified into case/control",
|
|
661
|
+
j
|
|
662
|
+
);
|
|
817
663
|
}
|
|
818
664
|
}
|
|
819
665
|
//println!("treated{:?}", treated);
|
|
820
666
|
//println!("control{:?}", control);
|
|
821
|
-
let p_value =
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
); // Setting continuity correction to true in case of normal approximation
|
|
667
|
+
let p_value = stats_functions::wilcoxon_rank_sum_test(
|
|
668
|
+
treated.clone(),
|
|
669
|
+
control.clone(),
|
|
670
|
+
THRESHOLD,
|
|
671
|
+
't',
|
|
672
|
+
true,
|
|
673
|
+
); // Setting continuity correction to true in case of normal approximation
|
|
829
674
|
let treated_mean = Data::new(treated).mean();
|
|
830
675
|
let control_mean = Data::new(control).mean();
|
|
831
|
-
if (treated_mean.unwrap()
|
|
832
|
-
/ control_mean.unwrap())
|
|
833
|
-
.log2()
|
|
834
|
-
.is_nan()
|
|
676
|
+
if (treated_mean.unwrap() / control_mean.unwrap()).log2().is_nan()
|
|
835
677
|
== false
|
|
836
|
-
&& (treated_mean.unwrap()
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
.is_infinite()
|
|
678
|
+
&& (treated_mean.unwrap() / control_mean.unwrap())
|
|
679
|
+
.log2()
|
|
680
|
+
.is_infinite()
|
|
840
681
|
== false
|
|
841
682
|
{
|
|
842
683
|
p_values_thread.push(PValueIndexes {
|
|
843
684
|
index: i,
|
|
844
|
-
|
|
845
|
-
.to_owned(),
|
|
846
|
-
gene_name: filtered_gene_names_temp
|
|
847
|
-
[i]
|
|
848
|
-
.to_owned(),
|
|
685
|
+
gene_name: filtered_gene_names_temp[i].to_owned(),
|
|
849
686
|
fold_change: (treated_mean.unwrap()
|
|
850
687
|
/ control_mean.unwrap())
|
|
851
688
|
.log2(),
|
|
@@ -854,10 +691,7 @@ fn main() {
|
|
|
854
691
|
}
|
|
855
692
|
}
|
|
856
693
|
}
|
|
857
|
-
p_values_temp
|
|
858
|
-
.lock()
|
|
859
|
-
.unwrap()
|
|
860
|
-
.append(&mut p_values_thread);
|
|
694
|
+
p_values_temp.lock().unwrap().append(&mut p_values_thread);
|
|
861
695
|
});
|
|
862
696
|
handles.push(handle);
|
|
863
697
|
}
|
|
@@ -893,22 +727,18 @@ fn main() {
|
|
|
893
727
|
|
|
894
728
|
fn adjust_p_values(mut original_p_values: Vec<PValueIndexes>) -> String {
|
|
895
729
|
// Sorting p-values in ascending order
|
|
896
|
-
original_p_values
|
|
897
|
-
(
|
|
898
|
-
|
|
899
|
-
.unwrap_or(Ordering::Equal)
|
|
900
|
-
});
|
|
730
|
+
original_p_values
|
|
731
|
+
.as_mut_slice()
|
|
732
|
+
.sort_by(|a, b| (a.p_value).partial_cmp(&b.p_value).unwrap_or(Ordering::Equal));
|
|
901
733
|
|
|
902
|
-
let mut adjusted_p_values: Vec<AdjustedPValueIndexes> =
|
|
903
|
-
Vec::with_capacity(original_p_values.len());
|
|
734
|
+
let mut adjusted_p_values: Vec<AdjustedPValueIndexes> = Vec::with_capacity(original_p_values.len());
|
|
904
735
|
let mut old_p_value: f64 = 0.0;
|
|
905
736
|
let mut rank: f64 = original_p_values.len() as f64;
|
|
906
737
|
for j in 0..original_p_values.len() {
|
|
907
738
|
let i = original_p_values.len() - j - 1;
|
|
908
739
|
|
|
909
740
|
//println!("p_val:{}", p_val);
|
|
910
|
-
let mut adjusted_p_val: f64 =
|
|
911
|
-
original_p_values[i].p_value * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
|
|
741
|
+
let mut adjusted_p_val: f64 = original_p_values[i].p_value * (original_p_values.len() as f64 / rank); // adjusted p-value = original_p_value * (N/rank)
|
|
912
742
|
if adjusted_p_val > 1.0 {
|
|
913
743
|
// p_value should NEVER be greater than 1
|
|
914
744
|
adjusted_p_val = 1.0;
|
|
@@ -927,7 +757,6 @@ fn adjust_p_values(mut original_p_values: Vec<PValueIndexes>) -> String {
|
|
|
927
757
|
adjusted_p_values.push(AdjustedPValueIndexes {
|
|
928
758
|
index: original_p_values[i].index,
|
|
929
759
|
fold_change: original_p_values[i].fold_change,
|
|
930
|
-
gene_id: original_p_values[i].gene_id.to_owned(),
|
|
931
760
|
gene_name: original_p_values[i].gene_name.to_owned(),
|
|
932
761
|
original_p_value: original_p_values[i].p_value,
|
|
933
762
|
adjusted_p_value: adjusted_p_val,
|
|
@@ -948,18 +777,15 @@ fn adjust_p_values(mut original_p_values: Vec<PValueIndexes>) -> String {
|
|
|
948
777
|
|
|
949
778
|
#[allow(dead_code)]
|
|
950
779
|
fn adjust_p_values_bonferroni(original_p_values: Vec<PValueIndexes>) -> Vec<AdjustedPValueIndexes> {
|
|
951
|
-
let mut adjusted_p_values: Vec<AdjustedPValueIndexes> =
|
|
952
|
-
Vec::with_capacity(original_p_values.len());
|
|
780
|
+
let mut adjusted_p_values: Vec<AdjustedPValueIndexes> = Vec::with_capacity(original_p_values.len());
|
|
953
781
|
for i in 0..original_p_values.len() {
|
|
954
|
-
let mut adjusted_p_value: f64 =
|
|
955
|
-
original_p_values[i].p_value * original_p_values.len() as f64; // In bonferroni correction, multiplying p_value by number of tests (excluding those with low sample sizes)
|
|
782
|
+
let mut adjusted_p_value: f64 = original_p_values[i].p_value * original_p_values.len() as f64; // In bonferroni correction, multiplying p_value by number of tests (excluding those with low sample sizes)
|
|
956
783
|
if adjusted_p_value > 1.0 {
|
|
957
784
|
// p_value should NEVER be greater than 1
|
|
958
785
|
adjusted_p_value = 1.0;
|
|
959
786
|
}
|
|
960
787
|
adjusted_p_values.push(AdjustedPValueIndexes {
|
|
961
788
|
index: original_p_values[i].index,
|
|
962
|
-
gene_id: original_p_values[i].gene_id.to_owned(),
|
|
963
789
|
gene_name: original_p_values[i].gene_name.to_owned(),
|
|
964
790
|
fold_change: original_p_values[i].fold_change,
|
|
965
791
|
original_p_value: original_p_values[i].p_value,
|
|
@@ -970,10 +796,7 @@ fn adjust_p_values_bonferroni(original_p_values: Vec<PValueIndexes>) -> Vec<Adju
|
|
|
970
796
|
}
|
|
971
797
|
|
|
972
798
|
// Original TMM normalization source code in edgeR: https://rdrr.io/bioc/edgeR/src/R/calcNormFactors.R
|
|
973
|
-
fn tmm_normalization(
|
|
974
|
-
input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
975
|
-
lib_sizes: &Vec<f64>,
|
|
976
|
-
) -> Vec<f64> {
|
|
799
|
+
fn tmm_normalization(input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, lib_sizes: &Vec<f64>) -> Vec<f64> {
|
|
977
800
|
//println!("Unnormalized matrix:{:?}", input_matrix);
|
|
978
801
|
let f75 = calc_factor_quantile(&input_matrix, lib_sizes);
|
|
979
802
|
//println!("f75:{:?}", f75);
|
|
@@ -1010,12 +833,7 @@ fn tmm_normalization(
|
|
|
1010
833
|
for col in 0..input_matrix.ncols() {
|
|
1011
834
|
let obs_data = input_matrix.column(col);
|
|
1012
835
|
let obs_lib_size = lib_sizes[col];
|
|
1013
|
-
f.push(calc_factor_tmm(
|
|
1014
|
-
obs_data,
|
|
1015
|
-
&ref_data,
|
|
1016
|
-
ref_lib_size,
|
|
1017
|
-
obs_lib_size,
|
|
1018
|
-
));
|
|
836
|
+
f.push(calc_factor_tmm(obs_data, &ref_data, ref_lib_size, obs_lib_size));
|
|
1019
837
|
}
|
|
1020
838
|
} else {
|
|
1021
839
|
// Multithreaded implementation of TMM normalization
|
|
@@ -1028,8 +846,7 @@ fn tmm_normalization(
|
|
|
1028
846
|
let lib_sizes_temp = Arc::clone(&lib_sizes_temp);
|
|
1029
847
|
let input_matrix_temp = Arc::clone(&input_matrix_temp);
|
|
1030
848
|
let handle = thread::spawn(move || {
|
|
1031
|
-
let mut f_thread: Vec<f_index> =
|
|
1032
|
-
Vec::with_capacity(input_matrix_temp.ncols() / max_threads);
|
|
849
|
+
let mut f_thread: Vec<f_index> = Vec::with_capacity(input_matrix_temp.ncols() / max_threads);
|
|
1033
850
|
let ref_data = input_matrix_temp.column(ref_column);
|
|
1034
851
|
let ref_lib_size = lib_sizes_temp[ref_column];
|
|
1035
852
|
for col in 0..input_matrix_temp.ncols() {
|
|
@@ -1120,11 +937,7 @@ fn calc_factor_tmm(
|
|
|
1120
937
|
let mut num: f64 = 0.0;
|
|
1121
938
|
let mut den: f64 = 0.0;
|
|
1122
939
|
for i in 0..log_r.len() {
|
|
1123
|
-
if log_r_log[i] >= lo_l
|
|
1124
|
-
&& log_r_log[i] <= hi_l
|
|
1125
|
-
&& abs_e_log[i] >= lo_s
|
|
1126
|
-
&& abs_e_log[i] <= hi_s
|
|
1127
|
-
{
|
|
940
|
+
if log_r_log[i] >= lo_l && log_r_log[i] <= hi_l && abs_e_log[i] >= lo_s && abs_e_log[i] <= hi_s {
|
|
1128
941
|
num += log_r[i] / v[i];
|
|
1129
942
|
den += 1.0 / v[i];
|
|
1130
943
|
}
|
|
@@ -1252,14 +1065,8 @@ fn filter_by_expr(
|
|
|
1252
1065
|
raw_data: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
1253
1066
|
num_diseased: usize,
|
|
1254
1067
|
num_control: usize,
|
|
1255
|
-
gene_ids: Vec<String>,
|
|
1256
1068
|
gene_names: Vec<String>,
|
|
1257
|
-
) -> (
|
|
1258
|
-
Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
1259
|
-
Vec<f64>,
|
|
1260
|
-
Vec<String>,
|
|
1261
|
-
Vec<String>,
|
|
1262
|
-
) {
|
|
1069
|
+
) -> (Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<f64>, Vec<String>) {
|
|
1263
1070
|
// Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>
|
|
1264
1071
|
//const min_count: f64 = 10.0; // Value of constant from R implementation
|
|
1265
1072
|
//const min_total_count: f64 = 15.0; // Value of constant from R implementation
|
|
@@ -1344,14 +1151,12 @@ fn filter_by_expr(
|
|
|
1344
1151
|
blank.push(0.0);
|
|
1345
1152
|
}
|
|
1346
1153
|
let mut filtered_matrix = DMatrix::from_vec(positives.len(), num_diseased + num_control, blank);
|
|
1347
|
-
let mut filtered_genes: Vec<String> = Vec::with_capacity(positives.len());
|
|
1348
1154
|
let mut filtered_gene_names: Vec<String> = Vec::with_capacity(positives.len());
|
|
1349
1155
|
let mut i = 0;
|
|
1350
1156
|
//println!("filtered_matrix rows:{}", filtered_matrix.nrows());
|
|
1351
1157
|
//println!("filtered_matrix cols:{}", filtered_matrix.ncols());
|
|
1352
1158
|
for index in positives {
|
|
1353
1159
|
let row = raw_data.row(index);
|
|
1354
|
-
filtered_genes.push(gene_ids[index].to_owned());
|
|
1355
1160
|
filtered_gene_names.push(gene_names[index].to_owned());
|
|
1356
1161
|
let mut j = 0;
|
|
1357
1162
|
for item in &row {
|
|
@@ -1372,12 +1177,7 @@ fn filter_by_expr(
|
|
|
1372
1177
|
modified_lib_sizes.push(modified_lib_sizes_vector[(0, i)].into());
|
|
1373
1178
|
}
|
|
1374
1179
|
//println!("filtered_matrix:{:?}", filtered_matrix);
|
|
1375
|
-
(
|
|
1376
|
-
filtered_matrix,
|
|
1377
|
-
modified_lib_sizes,
|
|
1378
|
-
filtered_genes,
|
|
1379
|
-
filtered_gene_names,
|
|
1380
|
-
)
|
|
1180
|
+
(filtered_matrix, modified_lib_sizes, filtered_gene_names)
|
|
1381
1181
|
}
|
|
1382
1182
|
|
|
1383
1183
|
fn cpm(
|
|
@@ -1393,8 +1193,7 @@ fn cpm(
|
|
|
1393
1193
|
for col in 0..input_matrix.ncols() {
|
|
1394
1194
|
let norm_factor = column_sums[(0, col)];
|
|
1395
1195
|
for row in 0..input_matrix.nrows() {
|
|
1396
|
-
output_matrix[(row, col)] =
|
|
1397
|
-
(input_matrix[(row, col)] as f64 * 1000000.0) / norm_factor as f64;
|
|
1196
|
+
output_matrix[(row, col)] = (input_matrix[(row, col)] as f64 * 1000000.0) / norm_factor as f64;
|
|
1398
1197
|
}
|
|
1399
1198
|
}
|
|
1400
1199
|
//println!("output_matrix:{:?}", output_matrix);
|