@sjcrh/proteinpaint-rust 2.78.0 → 2.81.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/DEanalysis.rs +453 -179
- package/src/genesetORA.rs +62 -54
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.78.0",
|
|
2
|
+
"version": "2.81.5",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"description": "Rust-based utilities for proteinpaint",
|
|
5
5
|
"main": "index.js",
|
|
@@ -38,5 +38,5 @@
|
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"tape": "^5.2.2"
|
|
40
40
|
},
|
|
41
|
-
"pp_release_tag": "v2.78.0"
|
|
41
|
+
"pp_release_tag": "v2.81.5"
|
|
42
42
|
}
|
package/src/DEanalysis.rs
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
|
-
// cd .. && cargo build --release && json='{"min_count":10,"min_total_count":15,"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
|
|
1
|
+
// cd .. && cargo build --release && json='{"min_count":10,"min_total_count":15,"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","data_type":"do_DE","storage_type":"text","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
|
|
2
|
+
// cd .. && cargo build --release && json='{"data_type":"get_samples","input_file":"/Users/rpaul1/pp_data/files/hg38/ALL-pharmacotyping/rnaseq/counts.h5"}' && time echo $json | target/release/DEanalysis
|
|
2
3
|
// cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
|
|
3
4
|
#![allow(non_snake_case)]
|
|
5
|
+
use hdf5::types::VarLenAscii;
|
|
6
|
+
use hdf5::File as HDF5File;
|
|
4
7
|
use json;
|
|
5
8
|
use nalgebra::base::dimension::Const;
|
|
6
9
|
use nalgebra::base::dimension::Dyn;
|
|
@@ -8,6 +11,9 @@ use nalgebra::base::Matrix;
|
|
|
8
11
|
use nalgebra::base::VecStorage;
|
|
9
12
|
use nalgebra::DMatrix;
|
|
10
13
|
use nalgebra::ViewStorage;
|
|
14
|
+
use ndarray::Array1;
|
|
15
|
+
use ndarray::Array2;
|
|
16
|
+
use ndarray::Dim;
|
|
11
17
|
use serde::{Deserialize, Serialize};
|
|
12
18
|
use serde_json;
|
|
13
19
|
use statrs::statistics::Data;
|
|
@@ -55,7 +61,156 @@ fn binary_search(input: &Vec<usize>, y: usize) -> i64 {
|
|
|
55
61
|
index
|
|
56
62
|
}
|
|
57
63
|
|
|
58
|
-
fn input_data(
|
|
64
|
+
fn input_data_from_HDF5(
|
|
65
|
+
hdf5_filename: &String,
|
|
66
|
+
case_list: &Vec<&str>,
|
|
67
|
+
control_list: &Vec<&str>,
|
|
68
|
+
) -> (
|
|
69
|
+
Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
70
|
+
Vec<usize>,
|
|
71
|
+
Vec<usize>,
|
|
72
|
+
Vec<String>,
|
|
73
|
+
Vec<String>,
|
|
74
|
+
) {
|
|
75
|
+
let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
|
|
76
|
+
let ds_dim = file.dataset("dims").unwrap(); // open the dataset
|
|
77
|
+
let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
|
|
78
|
+
let mut case_indexes: Vec<usize> = Vec::with_capacity(case_list.len());
|
|
79
|
+
let mut control_indexes: Vec<usize> = Vec::with_capacity(control_list.len());
|
|
80
|
+
// Check the data type and read the dataset accordingly
|
|
81
|
+
let data_dim: Array1<_> = ds_dim.read::<usize, Dim<[usize; 1]>>().unwrap();
|
|
82
|
+
let num_samples = data_dim[0]; // Number of total columns in the dataset
|
|
83
|
+
let num_genes = data_dim[1]; // Number of total rows in the dataset
|
|
84
|
+
println!("num_samples bulk:{}", num_samples);
|
|
85
|
+
println!("num_genes bulk:{}", num_genes);
|
|
86
|
+
|
|
87
|
+
let now_gene_names = Instant::now();
|
|
88
|
+
let ds_gene_names = file.dataset("gene_names").unwrap();
|
|
89
|
+
println!("ds_gene_names:{:?}", ds_gene_names);
|
|
90
|
+
let gene_names = ds_gene_names
|
|
91
|
+
.read::<VarLenAscii, Dim<[usize; 1]>>()
|
|
92
|
+
.unwrap();
|
|
93
|
+
println!("\tgene_names = {:?}", gene_names);
|
|
94
|
+
println!("\tgene_names.shape() = {:?}", gene_names.shape());
|
|
95
|
+
println!("\tgene_names.strides() = {:?}", gene_names.strides());
|
|
96
|
+
println!("\tgene_names.ndim() = {:?}", gene_names.ndim());
|
|
97
|
+
println!("Time for parsing gene names:{:?}", now_gene_names.elapsed());
|
|
98
|
+
|
|
99
|
+
let now_gene_symbols = Instant::now();
|
|
100
|
+
let ds_gene_symbols = file.dataset("gene_symbols").unwrap();
|
|
101
|
+
println!("ds_gene_symbols:{:?}", ds_gene_symbols);
|
|
102
|
+
let gene_symbols = ds_gene_symbols
|
|
103
|
+
.read::<VarLenAscii, Dim<[usize; 1]>>()
|
|
104
|
+
.unwrap();
|
|
105
|
+
println!("\tgene_symbols = {:?}", gene_symbols);
|
|
106
|
+
println!("\tgene_symbols.shape() = {:?}", gene_symbols.shape());
|
|
107
|
+
println!("\tgene_symbols.strides() = {:?}", gene_symbols.strides());
|
|
108
|
+
println!("\tgene_symbols.ndim() = {:?}", gene_symbols.ndim());
|
|
109
|
+
println!(
|
|
110
|
+
"Time for parsing gene symbols:{:?}",
|
|
111
|
+
now_gene_symbols.elapsed()
|
|
112
|
+
);
|
|
113
|
+
|
|
114
|
+
let mut gene_names_string: Vec<String> = Vec::with_capacity(gene_names.len());
|
|
115
|
+
let mut gene_symbols_string: Vec<String> = Vec::with_capacity(gene_symbols.len());
|
|
116
|
+
for i in 0..gene_names.len() {
|
|
117
|
+
gene_names_string.push(gene_names[i].to_string());
|
|
118
|
+
gene_symbols_string.push(gene_symbols[i].to_string());
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
let now_samples = Instant::now();
|
|
122
|
+
let ds_samples = file.dataset("samples").unwrap();
|
|
123
|
+
let samples = ds_samples.read::<VarLenAscii, Dim<[usize; 1]>>().unwrap();
|
|
124
|
+
println!("\tsamples = {:?}", samples);
|
|
125
|
+
println!("\tsamples.shape() = {:?}", samples.shape());
|
|
126
|
+
println!("\tsamples.strides() = {:?}", samples.strides());
|
|
127
|
+
println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
128
|
+
println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
129
|
+
|
|
130
|
+
//Find all columns values that are populated for the given gene
|
|
131
|
+
let now_counts = Instant::now();
|
|
132
|
+
let ds_counts = file.dataset("counts").unwrap(); // open the dataset
|
|
133
|
+
|
|
134
|
+
let mut global_sample_index = 0;
|
|
135
|
+
for sample_name in case_list {
|
|
136
|
+
let sample_index;
|
|
137
|
+
match samples
|
|
138
|
+
.iter()
|
|
139
|
+
.position(|x| x.to_string() == *sample_name.to_string())
|
|
140
|
+
{
|
|
141
|
+
Some(index) => {
|
|
142
|
+
//println!(
|
|
143
|
+
// "The index of '{}' is {} in 0-based format (add 1 to compare with R output)",
|
|
144
|
+
// sample_name, index
|
|
145
|
+
//);
|
|
146
|
+
sample_index = index;
|
|
147
|
+
}
|
|
148
|
+
None => panic!(
|
|
149
|
+
"Sample '{}' not found in the HDF5 file '{}'",
|
|
150
|
+
sample_name, &hdf5_filename
|
|
151
|
+
),
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
let sample_array: Array2<f64> = ds_counts
|
|
155
|
+
.read_slice_2d((0..gene_names.len(), sample_index..sample_index + 1))
|
|
156
|
+
.unwrap();
|
|
157
|
+
//println!("Length of gene array:{:?}", sample_array.len()); // Please check the result
|
|
158
|
+
input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
|
|
159
|
+
case_indexes.push(global_sample_index);
|
|
160
|
+
global_sample_index += 1;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
for sample_name in control_list {
|
|
164
|
+
let sample_index;
|
|
165
|
+
match samples
|
|
166
|
+
.iter()
|
|
167
|
+
.position(|x| x.to_string() == *sample_name.to_string())
|
|
168
|
+
{
|
|
169
|
+
Some(index) => {
|
|
170
|
+
//println!(
|
|
171
|
+
// "The index of '{}' is {} in 0-based format (add 1 to compare with R output)",
|
|
172
|
+
// sample_name, index
|
|
173
|
+
//);
|
|
174
|
+
sample_index = index;
|
|
175
|
+
}
|
|
176
|
+
None => panic!(
|
|
177
|
+
"Sample '{}' not found in the HDF5 file '{}'",
|
|
178
|
+
sample_name, &hdf5_filename
|
|
179
|
+
),
|
|
180
|
+
}
|
|
181
|
+
//let data_counts: Array1<_> = ds_counts.read::<f64, Dim<[usize; 1]>>().unwrap();
|
|
182
|
+
//println!("Data_counts: {:?}", data_counts);
|
|
183
|
+
let sample_array: Array2<f64> = ds_counts
|
|
184
|
+
.read_slice_2d((0..gene_names.len(), sample_index..sample_index + 1))
|
|
185
|
+
.unwrap();
|
|
186
|
+
//println!("Length of gene array:{:?}", sample_array.len()); // Please check the result
|
|
187
|
+
input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
|
|
188
|
+
control_indexes.push(global_sample_index);
|
|
189
|
+
global_sample_index += 1;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
println!("Time for parsing HDF5 data:{:?}", now_counts.elapsed());
|
|
193
|
+
//println!(
|
|
194
|
+
// "case + control length:{}",
|
|
195
|
+
// case_list.len() + control_list.len()
|
|
196
|
+
//);
|
|
197
|
+
//println!("gene_names length:{}", gene_names.len());
|
|
198
|
+
//println!("input_vector length:{}", input_vector.len());
|
|
199
|
+
let dm = DMatrix::from_row_slice(
|
|
200
|
+
case_list.len() + control_list.len(),
|
|
201
|
+
gene_names.len(),
|
|
202
|
+
&input_vector,
|
|
203
|
+
);
|
|
204
|
+
(
|
|
205
|
+
dm.transpose(), // Transposing the matrix
|
|
206
|
+
case_indexes,
|
|
207
|
+
control_indexes,
|
|
208
|
+
gene_names_string,
|
|
209
|
+
gene_symbols_string,
|
|
210
|
+
)
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
fn input_data_from_text(
|
|
59
214
|
filename: &String,
|
|
60
215
|
case_list: &Vec<&str>,
|
|
61
216
|
control_list: &Vec<&str>,
|
|
@@ -67,7 +222,6 @@ fn input_data(
|
|
|
67
222
|
Vec<String>,
|
|
68
223
|
) {
|
|
69
224
|
let input_time = Instant::now();
|
|
70
|
-
//let mut rdr = csv::Reader::from_path(path).unwrap();
|
|
71
225
|
let mut file = File::open(filename).unwrap();
|
|
72
226
|
let mut num_lines: usize = 0;
|
|
73
227
|
let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
|
|
@@ -355,6 +509,43 @@ struct PValueIndexes {
|
|
|
355
509
|
p_value: f64,
|
|
356
510
|
}
|
|
357
511
|
|
|
512
|
+
// Used to get the sample names from HDF5 file at PP server startup
|
|
513
|
+
fn get_DE_samples(hdf5_filename: &String) {
|
|
514
|
+
let file = HDF5File::open(&hdf5_filename).unwrap(); // open for reading
|
|
515
|
+
let now_samples = Instant::now();
|
|
516
|
+
let ds_samples = file.dataset("samples").unwrap();
|
|
517
|
+
let samples = ds_samples.read::<VarLenAscii, Dim<[usize; 1]>>().unwrap();
|
|
518
|
+
println!("\tsamples = {:?}", samples);
|
|
519
|
+
println!("\tsamples.shape() = {:?}", samples.shape());
|
|
520
|
+
println!("\tsamples.strides() = {:?}", samples.strides());
|
|
521
|
+
println!("\tsamples.ndim() = {:?}", samples.ndim());
|
|
522
|
+
println!("Time for parsing samples:{:?}", now_samples.elapsed());
|
|
523
|
+
|
|
524
|
+
let mut output_string = "".to_string();
|
|
525
|
+
for i in 0..samples.len() {
|
|
526
|
+
//let item_json = "{\"".to_string()
|
|
527
|
+
// + &samples[i].to_string()
|
|
528
|
+
// + &"\","
|
|
529
|
+
// + &gene_array[i].to_string()
|
|
530
|
+
// + &"}";
|
|
531
|
+
|
|
532
|
+
//let item_json = format!("{{\"{}\"}}", samples[i].to_string());
|
|
533
|
+
|
|
534
|
+
output_string += &format!("{}", samples[i].to_string());
|
|
535
|
+
//println!("item_json:{}", item_json);
|
|
536
|
+
|
|
537
|
+
//let item_json = format!(
|
|
538
|
+
// r##"{{"{}",{}}}"##,
|
|
539
|
+
// samples[i].to_string().replace("\\", ""),
|
|
540
|
+
// gene_array[i].to_string()
|
|
541
|
+
//);
|
|
542
|
+
if i != samples.len() - 1 {
|
|
543
|
+
output_string += &",";
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
println!("output_string:{}", output_string);
|
|
547
|
+
}
|
|
548
|
+
|
|
358
549
|
fn main() {
|
|
359
550
|
//env::set_var("RUST_BACKTRACE", "full");
|
|
360
551
|
let mut input = String::new();
|
|
@@ -368,28 +559,6 @@ fn main() {
|
|
|
368
559
|
match input_json {
|
|
369
560
|
Ok(json_string) => {
|
|
370
561
|
let now = Instant::now();
|
|
371
|
-
let min_count_option = json_string["min_count"].as_f64().to_owned();
|
|
372
|
-
let min_total_count_option = json_string["min_total_count"].as_f64().to_owned();
|
|
373
|
-
let min_count;
|
|
374
|
-
match min_count_option {
|
|
375
|
-
Some(x) => min_count = x,
|
|
376
|
-
None => {
|
|
377
|
-
panic!("min_count is missing a value")
|
|
378
|
-
}
|
|
379
|
-
}
|
|
380
|
-
let min_total_count;
|
|
381
|
-
match min_total_count_option {
|
|
382
|
-
Some(x) => min_total_count = x,
|
|
383
|
-
None => {
|
|
384
|
-
panic!("min_total_count is missing a value")
|
|
385
|
-
}
|
|
386
|
-
}
|
|
387
|
-
let case_string = &json_string["case"].to_owned().as_str().unwrap().to_string();
|
|
388
|
-
let control_string = &json_string["control"]
|
|
389
|
-
.to_owned()
|
|
390
|
-
.as_str()
|
|
391
|
-
.unwrap()
|
|
392
|
-
.to_string();
|
|
393
562
|
let file_name = &json_string["input_file"]
|
|
394
563
|
.to_owned()
|
|
395
564
|
.as_str()
|
|
@@ -397,149 +566,150 @@ fn main() {
|
|
|
397
566
|
.to_string()
|
|
398
567
|
.split(",")
|
|
399
568
|
.collect();
|
|
400
|
-
|
|
401
|
-
let
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
// + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
|
|
467
|
-
control.push(row[(0, j)]);
|
|
468
|
-
//println!("{},{}", input_data_vec.0[i][j], "Control");
|
|
569
|
+
println!("file_name:{}", file_name);
|
|
570
|
+
let data_type_option = json_string["data_type"].as_str().to_owned();
|
|
571
|
+
match data_type_option {
|
|
572
|
+
Some(x) => {
|
|
573
|
+
if x == "get_samples" {
|
|
574
|
+
get_DE_samples(file_name)
|
|
575
|
+
} else if x == "do_DE" {
|
|
576
|
+
let min_count_option = json_string["min_count"].as_f64().to_owned();
|
|
577
|
+
let min_total_count_option =
|
|
578
|
+
json_string["min_total_count"].as_f64().to_owned();
|
|
579
|
+
let storage_type_option =
|
|
580
|
+
json_string["storage_type"].as_str().to_owned();
|
|
581
|
+
let storage_type;
|
|
582
|
+
match storage_type_option {
|
|
583
|
+
Some(x) => {
|
|
584
|
+
if x == "HDF5" {
|
|
585
|
+
storage_type = "HDF5"
|
|
586
|
+
} else if x == "text" {
|
|
587
|
+
storage_type = "text"
|
|
588
|
+
} else {
|
|
589
|
+
panic!(
|
|
590
|
+
"Unknown storage_type:{}{}",
|
|
591
|
+
x, " Needs to be either HDF5 or text"
|
|
592
|
+
);
|
|
593
|
+
}
|
|
594
|
+
}
|
|
595
|
+
None => panic!("storage_type needs to be HDF5 or text"),
|
|
596
|
+
}
|
|
597
|
+
let min_count;
|
|
598
|
+
match min_count_option {
|
|
599
|
+
Some(x) => min_count = x,
|
|
600
|
+
None => {
|
|
601
|
+
panic!("min_count is missing a value")
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
let min_total_count;
|
|
605
|
+
match min_total_count_option {
|
|
606
|
+
Some(x) => min_total_count = x,
|
|
607
|
+
None => {
|
|
608
|
+
panic!("min_total_count is missing a value")
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
let case_string =
|
|
612
|
+
&json_string["case"].to_owned().as_str().unwrap().to_string();
|
|
613
|
+
let control_string = &json_string["control"]
|
|
614
|
+
.to_owned()
|
|
615
|
+
.as_str()
|
|
616
|
+
.unwrap()
|
|
617
|
+
.to_string();
|
|
618
|
+
let case_list: Vec<&str> = case_string.split(",").collect();
|
|
619
|
+
let control_list: Vec<&str> = control_string.split(",").collect();
|
|
620
|
+
let (
|
|
621
|
+
input_matrix,
|
|
622
|
+
case_indexes,
|
|
623
|
+
control_indexes,
|
|
624
|
+
gene_names,
|
|
625
|
+
gene_symbols,
|
|
626
|
+
);
|
|
627
|
+
if storage_type == "text" {
|
|
628
|
+
(
|
|
629
|
+
input_matrix,
|
|
630
|
+
case_indexes,
|
|
631
|
+
control_indexes,
|
|
632
|
+
gene_names,
|
|
633
|
+
gene_symbols,
|
|
634
|
+
) = input_data_from_text(file_name, &case_list, &control_list);
|
|
469
635
|
} else {
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
636
|
+
// Parsing data from a HDF5 file
|
|
637
|
+
(
|
|
638
|
+
input_matrix,
|
|
639
|
+
case_indexes,
|
|
640
|
+
control_indexes,
|
|
641
|
+
gene_names,
|
|
642
|
+
gene_symbols,
|
|
643
|
+
) = input_data_from_HDF5(file_name, &case_list, &control_list);
|
|
474
644
|
}
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
gene_name: filtered_genes[i].to_owned(),
|
|
499
|
-
gene_symbol: filtered_gene_symbols[i].to_owned(),
|
|
500
|
-
fold_change: (treated_mean.unwrap() / control_mean.unwrap())
|
|
501
|
-
.log2(),
|
|
502
|
-
p_value: p_value,
|
|
503
|
-
});
|
|
504
|
-
}
|
|
505
|
-
}
|
|
506
|
-
} else {
|
|
507
|
-
// Multithreaded implementation of calculating wilcoxon p-values
|
|
508
|
-
let normalized_matrix_temp = Arc::new(normalized_matrix);
|
|
509
|
-
let filtered_genes_temp = Arc::new(filtered_genes);
|
|
510
|
-
let filtered_gene_symbols_temp = Arc::new(filtered_gene_symbols);
|
|
511
|
-
let case_indexes_temp = Arc::new(case_indexes);
|
|
512
|
-
let control_indexes_temp = Arc::new(control_indexes);
|
|
513
|
-
let p_values_temp = Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
|
|
514
|
-
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
515
|
-
for thread_num in 0..max_threads {
|
|
516
|
-
let normalized_matrix_temp = Arc::clone(&normalized_matrix_temp);
|
|
517
|
-
let case_indexes_temp = Arc::clone(&case_indexes_temp);
|
|
518
|
-
let control_indexes_temp = Arc::clone(&control_indexes_temp);
|
|
519
|
-
let p_values_temp = Arc::clone(&p_values_temp);
|
|
520
|
-
let filtered_genes_temp = Arc::clone(&filtered_genes_temp);
|
|
521
|
-
let filtered_gene_symbols_temp =
|
|
522
|
-
Arc::clone(&filtered_gene_symbols_temp);
|
|
523
|
-
let handle = thread::spawn(move || {
|
|
524
|
-
let mut p_values_thread: Vec<PValueIndexes> = Vec::with_capacity(
|
|
525
|
-
normalized_matrix_temp.nrows() / max_threads,
|
|
645
|
+
let filtering_time = Instant::now();
|
|
646
|
+
let (
|
|
647
|
+
filtered_matrix,
|
|
648
|
+
lib_sizes,
|
|
649
|
+
filtered_genes,
|
|
650
|
+
filtered_gene_symbols,
|
|
651
|
+
) = filter_by_expr(
|
|
652
|
+
min_count,
|
|
653
|
+
min_total_count,
|
|
654
|
+
&input_matrix,
|
|
655
|
+
case_indexes.len(),
|
|
656
|
+
control_indexes.len(),
|
|
657
|
+
gene_names,
|
|
658
|
+
gene_symbols,
|
|
659
|
+
);
|
|
660
|
+
println!("filtering time:{:?}", filtering_time.elapsed());
|
|
661
|
+
//println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
|
|
662
|
+
//println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
|
|
663
|
+
let cpm_normalization_time = Instant::now();
|
|
664
|
+
let mut normalized_matrix = cpm(&filtered_matrix);
|
|
665
|
+
println!(
|
|
666
|
+
"cpm normalization time:{:?}",
|
|
667
|
+
cpm_normalization_time.elapsed()
|
|
526
668
|
);
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
669
|
+
let tmm_normalization_time = Instant::now();
|
|
670
|
+
let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
|
|
671
|
+
println!(
|
|
672
|
+
"tmm normalization time:{:?}",
|
|
673
|
+
tmm_normalization_time.elapsed()
|
|
674
|
+
);
|
|
675
|
+
//println!("norm_factors:{:?}", norm_factors);
|
|
676
|
+
|
|
677
|
+
for col in 0..normalized_matrix.ncols() {
|
|
678
|
+
let norm_factor = norm_factors[col];
|
|
679
|
+
for row in 0..normalized_matrix.nrows() {
|
|
680
|
+
normalized_matrix[(row, col)] =
|
|
681
|
+
normalized_matrix[(row, col)] / norm_factor;
|
|
682
|
+
}
|
|
683
|
+
}
|
|
684
|
+
//println!("normalized_matrix:{:?}", normalized_matrix);
|
|
685
|
+
println!("Number of cases:{}", case_list.len());
|
|
686
|
+
println!("Number of controls:{}", control_list.len());
|
|
687
|
+
println!("Time for pre-processing:{:?}", now.elapsed());
|
|
688
|
+
// Using Wilcoxon test for differential gene expression
|
|
689
|
+
|
|
690
|
+
let now2 = Instant::now();
|
|
691
|
+
let mut p_values: Vec<PValueIndexes> =
|
|
692
|
+
Vec::with_capacity(normalized_matrix.nrows());
|
|
693
|
+
const THRESHOLD: usize = 50; // This determines whether the Wilcoxon exact test or the normal test will be used based on sample size.
|
|
694
|
+
|
|
695
|
+
//println!("case_indexes:{:?}", case_indexes);
|
|
696
|
+
//println!("control_indexes:{:?}", control_indexes);
|
|
697
|
+
let num_normalized_rows = normalized_matrix.nrows();
|
|
698
|
+
if normalized_matrix.nrows() * normalized_matrix.ncols()
|
|
699
|
+
< PAR_CUTOFF
|
|
700
|
+
{
|
|
701
|
+
for i in 0..normalized_matrix.nrows() {
|
|
702
|
+
let row = normalized_matrix.row(i);
|
|
531
703
|
//println!("row:{:?}", row);
|
|
532
704
|
let mut treated = Vec::<f64>::new();
|
|
533
705
|
let mut control = Vec::<f64>::new();
|
|
534
706
|
//println!("conditions:{:?}", conditions);
|
|
535
|
-
for j in 0..(case_indexes_temp.len()
|
|
536
|
-
+ control_indexes_temp.len())
|
|
537
|
-
{
|
|
707
|
+
for j in 0..(case_indexes.len() + control_indexes.len()) {
|
|
538
708
|
//println!("row[(0, j)]:{}", row[(0, j)]);
|
|
539
|
-
if case_indexes_temp.contains(&j) {
|
|
709
|
+
if case_indexes.contains(&j) {
|
|
540
710
|
treated.push(row[(0, j)]);
|
|
541
711
|
//println!("{},{}", input_data_vec.0[i][j], "Diseased");
|
|
542
|
-
} else if control_indexes_temp.contains(&j) {
|
|
712
|
+
} else if control_indexes.contains(&j) {
|
|
543
713
|
// + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
|
|
544
714
|
control.push(row[(0, j)]);
|
|
545
715
|
//println!("{},{}", input_data_vec.0[i][j], "Control");
|
|
@@ -570,11 +740,10 @@ fn main() {
|
|
|
570
740
|
.is_infinite()
|
|
571
741
|
== false
|
|
572
742
|
{
|
|
573
|
-
|
|
743
|
+
p_values.push(PValueIndexes {
|
|
574
744
|
index: i,
|
|
575
|
-
gene_name:
|
|
576
|
-
gene_symbol:
|
|
577
|
-
.to_owned(),
|
|
745
|
+
gene_name: filtered_genes[i].to_owned(),
|
|
746
|
+
gene_symbol: filtered_gene_symbols[i].to_owned(),
|
|
578
747
|
fold_change: (treated_mean.unwrap()
|
|
579
748
|
/ control_mean.unwrap())
|
|
580
749
|
.log2(),
|
|
@@ -582,27 +751,126 @@ fn main() {
|
|
|
582
751
|
});
|
|
583
752
|
}
|
|
584
753
|
}
|
|
754
|
+
} else {
|
|
755
|
+
// Multithreaded implementation of calculating wilcoxon p-values
|
|
756
|
+
let normalized_matrix_temp = Arc::new(normalized_matrix);
|
|
757
|
+
let filtered_genes_temp = Arc::new(filtered_genes);
|
|
758
|
+
let filtered_gene_symbols_temp =
|
|
759
|
+
Arc::new(filtered_gene_symbols);
|
|
760
|
+
let case_indexes_temp = Arc::new(case_indexes);
|
|
761
|
+
let control_indexes_temp = Arc::new(control_indexes);
|
|
762
|
+
let p_values_temp =
|
|
763
|
+
Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
|
|
764
|
+
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
765
|
+
for thread_num in 0..max_threads {
|
|
766
|
+
let normalized_matrix_temp =
|
|
767
|
+
Arc::clone(&normalized_matrix_temp);
|
|
768
|
+
let case_indexes_temp = Arc::clone(&case_indexes_temp);
|
|
769
|
+
let control_indexes_temp =
|
|
770
|
+
Arc::clone(&control_indexes_temp);
|
|
771
|
+
let p_values_temp = Arc::clone(&p_values_temp);
|
|
772
|
+
let filtered_genes_temp = Arc::clone(&filtered_genes_temp);
|
|
773
|
+
let filtered_gene_symbols_temp =
|
|
774
|
+
Arc::clone(&filtered_gene_symbols_temp);
|
|
775
|
+
let handle = thread::spawn(move || {
|
|
776
|
+
let mut p_values_thread: Vec<PValueIndexes> =
|
|
777
|
+
Vec::with_capacity(
|
|
778
|
+
normalized_matrix_temp.nrows() / max_threads,
|
|
779
|
+
);
|
|
780
|
+
for i in 0..normalized_matrix_temp.nrows() {
|
|
781
|
+
let remainder: usize = i % max_threads; // Calculate remainder of iteration number divided by max_threads to decide which thread parses the row
|
|
782
|
+
if remainder == thread_num {
|
|
783
|
+
let row = normalized_matrix_temp.row(i);
|
|
784
|
+
//println!("row:{:?}", row);
|
|
785
|
+
let mut treated = Vec::<f64>::new();
|
|
786
|
+
let mut control = Vec::<f64>::new();
|
|
787
|
+
//println!("conditions:{:?}", conditions);
|
|
788
|
+
for j in 0..(case_indexes_temp.len()
|
|
789
|
+
+ control_indexes_temp.len())
|
|
790
|
+
{
|
|
791
|
+
//println!("row[(0, j)]:{}", row[(0, j)]);
|
|
792
|
+
if case_indexes_temp.contains(&j) {
|
|
793
|
+
treated.push(row[(0, j)]);
|
|
794
|
+
//println!("{},{}", input_data_vec.0[i][j], "Diseased");
|
|
795
|
+
} else if control_indexes_temp.contains(&j)
|
|
796
|
+
{
|
|
797
|
+
// + 1 was added because in the input file the first column of thw first row is blank as the first column consists of gene names
|
|
798
|
+
control.push(row[(0, j)]);
|
|
799
|
+
//println!("{},{}", input_data_vec.0[i][j], "Control");
|
|
800
|
+
} else {
|
|
801
|
+
panic!(
|
|
802
|
+
"Column {} could not be classified into case/control",
|
|
803
|
+
j
|
|
804
|
+
);
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
//println!("treated{:?}", treated);
|
|
808
|
+
//println!("control{:?}", control);
|
|
809
|
+
let p_value =
|
|
810
|
+
stats_functions::wilcoxon_rank_sum_test(
|
|
811
|
+
treated.clone(),
|
|
812
|
+
control.clone(),
|
|
813
|
+
THRESHOLD,
|
|
814
|
+
't',
|
|
815
|
+
true,
|
|
816
|
+
); // Setting continuity correction to true in case of normal approximation
|
|
817
|
+
let treated_mean = Data::new(treated).mean();
|
|
818
|
+
let control_mean = Data::new(control).mean();
|
|
819
|
+
if (treated_mean.unwrap()
|
|
820
|
+
/ control_mean.unwrap())
|
|
821
|
+
.log2()
|
|
822
|
+
.is_nan()
|
|
823
|
+
== false
|
|
824
|
+
&& (treated_mean.unwrap()
|
|
825
|
+
/ control_mean.unwrap())
|
|
826
|
+
.log2()
|
|
827
|
+
.is_infinite()
|
|
828
|
+
== false
|
|
829
|
+
{
|
|
830
|
+
p_values_thread.push(PValueIndexes {
|
|
831
|
+
index: i,
|
|
832
|
+
gene_name: filtered_genes_temp[i]
|
|
833
|
+
.to_owned(),
|
|
834
|
+
gene_symbol: filtered_gene_symbols_temp
|
|
835
|
+
[i]
|
|
836
|
+
.to_owned(),
|
|
837
|
+
fold_change: (treated_mean.unwrap()
|
|
838
|
+
/ control_mean.unwrap())
|
|
839
|
+
.log2(),
|
|
840
|
+
p_value: p_value,
|
|
841
|
+
});
|
|
842
|
+
}
|
|
843
|
+
}
|
|
844
|
+
}
|
|
845
|
+
p_values_temp
|
|
846
|
+
.lock()
|
|
847
|
+
.unwrap()
|
|
848
|
+
.append(&mut p_values_thread);
|
|
849
|
+
});
|
|
850
|
+
handles.push(handle);
|
|
851
|
+
}
|
|
852
|
+
for handle in handles {
|
|
853
|
+
// Wait for all threads to finish before proceeding further
|
|
854
|
+
handle.join().unwrap();
|
|
855
|
+
}
|
|
856
|
+
p_values.append(&mut *p_values_temp.lock().unwrap());
|
|
585
857
|
}
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
858
|
+
//println!("p_values:{:?}", p_values);
|
|
859
|
+
println!(
|
|
860
|
+
"Time for running {} wilcoxon tests:{:?}",
|
|
861
|
+
num_normalized_rows,
|
|
862
|
+
now2.elapsed()
|
|
863
|
+
);
|
|
864
|
+
let adjusted_p_values = adjust_p_values(p_values);
|
|
865
|
+
println!("adjusted_p_values:{}", adjusted_p_values);
|
|
866
|
+
//let fold_changes =
|
|
867
|
+
// calculate_fold_change(normalized_matrix, case_indexes, control_indexes);
|
|
868
|
+
}
|
|
589
869
|
}
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
handle.join().unwrap();
|
|
870
|
+
None => {
|
|
871
|
+
panic!("data_type is missing")
|
|
593
872
|
}
|
|
594
|
-
p_values.append(&mut *p_values_temp.lock().unwrap());
|
|
595
873
|
}
|
|
596
|
-
//println!("p_values:{:?}", p_values);
|
|
597
|
-
println!(
|
|
598
|
-
"Time for running {} wilcoxon tests:{:?}",
|
|
599
|
-
num_normalized_rows,
|
|
600
|
-
now2.elapsed()
|
|
601
|
-
);
|
|
602
|
-
let adjusted_p_values = adjust_p_values(p_values);
|
|
603
|
-
println!("adjusted_p_values:{}", adjusted_p_values);
|
|
604
|
-
//let fold_changes =
|
|
605
|
-
// calculate_fold_change(normalized_matrix, case_indexes, control_indexes);
|
|
606
874
|
}
|
|
607
875
|
Err(error) => println!("Incorrect json: {}", error),
|
|
608
876
|
}
|
|
@@ -1052,6 +1320,7 @@ fn filter_by_expr(
|
|
|
1052
1320
|
positives.push(row);
|
|
1053
1321
|
}
|
|
1054
1322
|
}
|
|
1323
|
+
println!("positives length:{}", positives.len());
|
|
1055
1324
|
//println!("row_sums:{:?}", row_sums);
|
|
1056
1325
|
//println!("keep_cpm:{:?}", keep_cpm);
|
|
1057
1326
|
//println!("positive_cpm:{}", positive_cpm);
|
|
@@ -1067,12 +1336,17 @@ fn filter_by_expr(
|
|
|
1067
1336
|
let mut filtered_genes: Vec<String> = Vec::with_capacity(positives.len());
|
|
1068
1337
|
let mut filtered_gene_symbols: Vec<String> = Vec::with_capacity(positives.len());
|
|
1069
1338
|
let mut i = 0;
|
|
1339
|
+
println!("filtered_matrix rows:{}", filtered_matrix.nrows());
|
|
1340
|
+
println!("filtered_matrix cols:{}", filtered_matrix.ncols());
|
|
1070
1341
|
for index in positives {
|
|
1071
1342
|
let row = raw_data.row(index);
|
|
1072
1343
|
filtered_genes.push(gene_names[index].to_owned());
|
|
1073
1344
|
filtered_gene_symbols.push(gene_symbols[index].to_owned());
|
|
1074
1345
|
let mut j = 0;
|
|
1075
1346
|
for item in &row {
|
|
1347
|
+
//println!("index:{}", index);
|
|
1348
|
+
//println!("i:{}", i);
|
|
1349
|
+
//println!("j:{}", j);
|
|
1076
1350
|
filtered_matrix[(i, j)] = *item;
|
|
1077
1351
|
j += 1;
|
|
1078
1352
|
}
|
package/src/genesetORA.rs
CHANGED
|
@@ -7,6 +7,7 @@ use rusqlite::{Connection, Result};
|
|
|
7
7
|
use serde::{Deserialize, Serialize};
|
|
8
8
|
use serde_json;
|
|
9
9
|
use std::cmp::Ordering;
|
|
10
|
+
use std::collections::HashSet;
|
|
10
11
|
use std::io;
|
|
11
12
|
use std::time::Instant;
|
|
12
13
|
|
|
@@ -17,15 +18,6 @@ struct GO_pathway {
|
|
|
17
18
|
GO_id: String,
|
|
18
19
|
}
|
|
19
20
|
|
|
20
|
-
#[allow(non_camel_case_types)]
|
|
21
|
-
#[allow(non_snake_case)]
|
|
22
|
-
#[derive(Debug)]
|
|
23
|
-
struct pathway_genes {
|
|
24
|
-
symbol: String,
|
|
25
|
-
_ensg: String,
|
|
26
|
-
_enstCanonical: String,
|
|
27
|
-
}
|
|
28
|
-
|
|
29
21
|
#[allow(non_camel_case_types)]
|
|
30
22
|
#[allow(non_snake_case)]
|
|
31
23
|
#[derive(Debug, Serialize, Deserialize)]
|
|
@@ -39,28 +31,27 @@ struct pathway_p_value {
|
|
|
39
31
|
}
|
|
40
32
|
|
|
41
33
|
fn calculate_hypergeometric_p_value(
|
|
42
|
-
sample_genes: &
|
|
34
|
+
sample_genes: &HashSet<String>,
|
|
43
35
|
num_background_genes: usize,
|
|
44
|
-
genes_in_pathway:
|
|
36
|
+
genes_in_pathway: HashSet<String>,
|
|
45
37
|
) -> (f64, f64, String) {
|
|
46
|
-
let mut matching_sample_genes_counts = 0.0;
|
|
47
38
|
let mut gene_set_hits: String = "".to_string();
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
39
|
+
|
|
40
|
+
let gene_intersections: HashSet<String> = genes_in_pathway
|
|
41
|
+
.intersection(sample_genes)
|
|
42
|
+
.cloned()
|
|
43
|
+
.collect();
|
|
44
|
+
for gene in &gene_intersections {
|
|
45
|
+
gene_set_hits += &(gene.to_string() + &",");
|
|
55
46
|
}
|
|
56
47
|
|
|
57
|
-
if
|
|
48
|
+
if gene_intersections.len() > 0 {
|
|
58
49
|
gene_set_hits.pop();
|
|
59
50
|
}
|
|
60
51
|
|
|
61
52
|
//println!("sample_genes:{:?}", sample_genes);
|
|
62
53
|
//println!("genes_in_pathway:{:?}", genes_in_pathway);
|
|
63
|
-
//println!("k-1:{}",
|
|
54
|
+
//println!("k-1:{}", gene_intersection.len() - 1.0);
|
|
64
55
|
//println!("M:{}", genes_in_pathway.len() as f64);
|
|
65
56
|
//println!(
|
|
66
57
|
// "N-M:{}",
|
|
@@ -68,7 +59,7 @@ fn calculate_hypergeometric_p_value(
|
|
|
68
59
|
//);
|
|
69
60
|
//println!("n:{}", sample_genes.len() as f64);
|
|
70
61
|
let p_value = r_mathlib::hypergeometric_cdf(
|
|
71
|
-
|
|
62
|
+
gene_intersections.len() as f64 - 1.0,
|
|
72
63
|
genes_in_pathway.len() as f64,
|
|
73
64
|
num_background_genes as f64 - genes_in_pathway.len() as f64,
|
|
74
65
|
sample_genes.len() as f64,
|
|
@@ -76,7 +67,7 @@ fn calculate_hypergeometric_p_value(
|
|
|
76
67
|
false,
|
|
77
68
|
);
|
|
78
69
|
//println!("p_value:{}", p_value);
|
|
79
|
-
(p_value,
|
|
70
|
+
(p_value, gene_intersections.len() as f64, gene_set_hits)
|
|
80
71
|
}
|
|
81
72
|
|
|
82
73
|
fn main() -> Result<()> {
|
|
@@ -104,8 +95,49 @@ fn main() -> Result<()> {
|
|
|
104
95
|
let sample_genes: Vec<&str> =
|
|
105
96
|
sample_genes_input.as_str().unwrap().split(",").collect();
|
|
106
97
|
let mut pathway_p_values: Vec<pathway_p_value> = Vec::with_capacity(10000);
|
|
98
|
+
|
|
99
|
+
let genedb_input: &JsonValue = &json_string["genedb"];
|
|
100
|
+
let genedb;
|
|
101
|
+
match genedb_input.as_str() {
|
|
102
|
+
Some(gene_db_string) => genedb = gene_db_string.to_string(),
|
|
103
|
+
None => panic!("genedb file path is missing"),
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
let filter_non_coding_genes_input: &JsonValue =
|
|
107
|
+
&json_string["filter_non_coding_genes"];
|
|
108
|
+
let filter_non_coding_genes: bool =
|
|
109
|
+
filter_non_coding_genes_input.as_bool().unwrap();
|
|
110
|
+
|
|
111
|
+
let genedbconn = Connection::open(genedb)?;
|
|
112
|
+
let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
|
|
113
|
+
let mut num_coding_genes: usize = 0;
|
|
114
|
+
let mut sample_coding_genes: HashSet<String> = HashSet::with_capacity(24000);
|
|
115
|
+
match genedb_result {
|
|
116
|
+
Ok(mut x) => {
|
|
117
|
+
let mut genes = x.query([])?;
|
|
118
|
+
while let Some(coding_gene) = genes.next()? {
|
|
119
|
+
num_coding_genes += 1;
|
|
120
|
+
//println!("coding_gene:{:?}", coding_gene);
|
|
121
|
+
for sample_gene in &sample_genes {
|
|
122
|
+
let code_gene: String = coding_gene.get(0).unwrap();
|
|
123
|
+
if filter_non_coding_genes == true && code_gene == *sample_gene
|
|
124
|
+
{
|
|
125
|
+
sample_coding_genes.insert(code_gene);
|
|
126
|
+
} else if filter_non_coding_genes == false {
|
|
127
|
+
sample_coding_genes.insert(code_gene);
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
Err(_) => {}
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
if sample_coding_genes.len() == 0 {
|
|
136
|
+
panic!("All query genes are non-coding");
|
|
137
|
+
}
|
|
138
|
+
|
|
107
139
|
let background_genes_input: &JsonValue = &json_string["background_genes"];
|
|
108
|
-
let
|
|
140
|
+
let num_background_genes;
|
|
109
141
|
match background_genes_input.as_str() {
|
|
110
142
|
Some(x) => {
|
|
111
143
|
let background_genes_str: Vec<&str> = x.split(",").collect(); // Background genes is defined for e.g in case of DE analysis
|
|
@@ -114,24 +146,7 @@ fn main() -> Result<()> {
|
|
|
114
146
|
None => {
|
|
115
147
|
// Background genes not present for e.g. in hierarchial clustering
|
|
116
148
|
// Get background genes from the gene database
|
|
117
|
-
|
|
118
|
-
let genedb;
|
|
119
|
-
match genedb_input.as_str() {
|
|
120
|
-
Some(gene_db_string) => genedb = gene_db_string.to_string(),
|
|
121
|
-
None => panic!("genedb file path is missing"),
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
let genedbconn = Connection::open(genedb)?;
|
|
125
|
-
let genedb_result = genedbconn.prepare(&("select * from codingGenes"));
|
|
126
|
-
match genedb_result {
|
|
127
|
-
Ok(mut x) => {
|
|
128
|
-
let mut genes = x.query([])?;
|
|
129
|
-
while let Some(_gene) = genes.next()? {
|
|
130
|
-
num_background_genes += 1;
|
|
131
|
-
}
|
|
132
|
-
}
|
|
133
|
-
Err(_) => {}
|
|
134
|
-
}
|
|
149
|
+
num_background_genes = num_coding_genes;
|
|
135
150
|
}
|
|
136
151
|
}
|
|
137
152
|
//println!("sample_genes:{:?}", sample_genes);
|
|
@@ -169,24 +184,17 @@ fn main() -> Result<()> {
|
|
|
169
184
|
//println!("gene_stmt:{:?}", gene_stmt);
|
|
170
185
|
|
|
171
186
|
let mut rows = gene_stmt.query([])?;
|
|
172
|
-
let mut names =
|
|
187
|
+
let mut names = HashSet::<String>::new();
|
|
173
188
|
while let Some(row) = rows.next()? {
|
|
174
189
|
let a: String = row.get(0)?;
|
|
175
190
|
let input_gene_json = json::parse(&a);
|
|
176
191
|
match input_gene_json {
|
|
177
192
|
Ok(json_genes) => {
|
|
178
193
|
for json_iter in 0..json_genes.len() {
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
.to_string(),
|
|
182
|
-
_ensg: json_genes[json_iter]["ensg"]
|
|
183
|
-
.to_string(),
|
|
184
|
-
_enstCanonical: json_genes[json_iter]
|
|
185
|
-
["enstCanonical"]
|
|
194
|
+
names.insert(
|
|
195
|
+
json_genes[json_iter]["symbol"]
|
|
186
196
|
.to_string(),
|
|
187
|
-
|
|
188
|
-
//println!("item:{:?}", item);
|
|
189
|
-
names.push(item);
|
|
197
|
+
);
|
|
190
198
|
}
|
|
191
199
|
}
|
|
192
200
|
Err(_) => {
|
|
@@ -199,7 +207,7 @@ fn main() -> Result<()> {
|
|
|
199
207
|
let gene_set_size = names.len();
|
|
200
208
|
let (p_value, matches, gene_set_hits) =
|
|
201
209
|
calculate_hypergeometric_p_value(
|
|
202
|
-
&
|
|
210
|
+
&sample_coding_genes,
|
|
203
211
|
num_background_genes,
|
|
204
212
|
names,
|
|
205
213
|
);
|