@sjcrh/proteinpaint-rust 2.39.0 → 2.44.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +1 -2
- package/package.json +35 -35
- package/src/DEanalysis.rs +458 -113
- package/src/indel.rs +8 -3
package/Cargo.toml
CHANGED
|
@@ -19,14 +19,13 @@ plotters = "0.3.4"
|
|
|
19
19
|
colorgrad = "0.6.2"
|
|
20
20
|
statrs = "^0.16.0"
|
|
21
21
|
fishers_exact="^1.0.1"
|
|
22
|
-
bio = "
|
|
22
|
+
bio = "1.5.0"
|
|
23
23
|
bigtools = "^0.1.11"
|
|
24
24
|
libmath = "^0.2.1"
|
|
25
25
|
json = "^0.12.4"
|
|
26
26
|
serde = {version = "^1.0.147", features = ["derive"]}
|
|
27
27
|
serde_json="^1.0.88"
|
|
28
28
|
num = "^0.4.1"
|
|
29
|
-
csv = "^1.2.2"
|
|
30
29
|
r_mathlib="^0.2.0"
|
|
31
30
|
tokio = { version="1", features = ["full"] }
|
|
32
31
|
reqwest = "0.11"
|
package/package.json
CHANGED
|
@@ -1,37 +1,37 @@
|
|
|
1
1
|
{
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
2
|
+
"version": "2.44.0",
|
|
3
|
+
"name": "@sjcrh/proteinpaint-rust",
|
|
4
|
+
"description": "Rust-based utilities for proteinpaint",
|
|
5
|
+
"main": "index.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"proteinpaint-rust": "index.js"
|
|
8
|
+
},
|
|
9
|
+
"scripts": {
|
|
10
|
+
"dev": "cargo clean && cargo build --release",
|
|
11
|
+
"build": "cargo clean && cargo build --release",
|
|
12
|
+
"postinstall": "if [ ! -d ./test ] & [ ! -d ./target/release ]; then cargo clean && cargo build --release; fi",
|
|
13
|
+
"test": "tape **/test/*.spec.js",
|
|
14
|
+
"test:unit": "tape **/test/*.unit.spec.js",
|
|
15
|
+
"test:integration": "echo 'TODO: rust integration tests'"
|
|
16
|
+
},
|
|
17
|
+
"author": "Robin Paul",
|
|
18
|
+
"license": "SEE LICENSE IN ./LICENSE",
|
|
19
|
+
"repository": {
|
|
20
|
+
"type": "git",
|
|
21
|
+
"url": "https://github.com/stjude/proteinpaint.git",
|
|
22
|
+
"directory": "rust"
|
|
23
|
+
},
|
|
24
|
+
"files": [
|
|
25
|
+
"index.js",
|
|
26
|
+
"Cargo.toml",
|
|
27
|
+
"src",
|
|
28
|
+
"LICENSE/*"
|
|
29
|
+
],
|
|
30
|
+
"bugs": {
|
|
31
|
+
"url": "https://github.com/stjude/proteinpaint"
|
|
32
|
+
},
|
|
33
|
+
"homepage": "https://github.com/stjude/proteinpaint#readme",
|
|
34
|
+
"devDependencies": {
|
|
35
|
+
"tape": "^5.2.2"
|
|
36
|
+
}
|
|
37
37
|
}
|
package/src/DEanalysis.rs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
// cd .. && cargo build --release && json='{"case":"SJMB030827,SJMB030838,SJMB032893,SJMB031131,SJMB031227","control":"SJMB030488,SJMB030825,SJMB031110","input_file":"/Users/rpaul1/pp_data/files/hg38/sjmb12/rnaseq/geneCounts.txt"}' && time echo $json | target/release/DEanalysis
|
|
2
|
-
// cd .. && cargo build --release && cat ~/sjpp/test.txt | target/release/DEanalysis
|
|
2
|
+
// cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/DEanalysis
|
|
3
3
|
#![allow(non_snake_case)]
|
|
4
4
|
use json;
|
|
5
5
|
use nalgebra::base::dimension::Const;
|
|
@@ -15,13 +15,46 @@ use statrs::statistics::Data;
|
|
|
15
15
|
use statrs::statistics::Distribution;
|
|
16
16
|
use statrs::statistics::Median;
|
|
17
17
|
use std::cmp::Ordering;
|
|
18
|
-
use std::
|
|
18
|
+
use std::fs::File;
|
|
19
|
+
use std::io::Read;
|
|
19
20
|
use std::str::FromStr;
|
|
21
|
+
use std::sync::{Arc, Mutex}; // Multithreading library
|
|
22
|
+
use std::thread;
|
|
20
23
|
use std::time::Instant;
|
|
21
24
|
//use std::cmp::Ordering;
|
|
22
25
|
//use std::env;
|
|
23
26
|
use std::io;
|
|
24
27
|
//mod stats_functions; // Importing Wilcoxon function from stats_functions.rs
|
|
28
|
+
const PAR_CUTOFF: usize = 100000; // Cutoff for triggering multithreading processing of data
|
|
29
|
+
|
|
30
|
+
//const PAR_CUTOFF: usize = 1000000000000000;
|
|
31
|
+
#[allow(non_upper_case_globals)]
|
|
32
|
+
const max_threads: usize = 6; // Max number of threads in case the parallel processing of reads is invoked
|
|
33
|
+
|
|
34
|
+
fn binary_search(input: &Vec<usize>, y: usize) -> i64 {
|
|
35
|
+
let input_dup = &input[..];
|
|
36
|
+
let mut index: i64 = -1;
|
|
37
|
+
let mut l: usize = 0;
|
|
38
|
+
let mut r: usize = input_dup.len() - 1;
|
|
39
|
+
let mut m: usize;
|
|
40
|
+
while l <= r {
|
|
41
|
+
m = l + ((r - l) / 2);
|
|
42
|
+
if y == input_dup[m] {
|
|
43
|
+
index = m as i64;
|
|
44
|
+
break;
|
|
45
|
+
} else if y > input_dup[m] {
|
|
46
|
+
l = m + 1;
|
|
47
|
+
}
|
|
48
|
+
// If x is smaller, ignore right half
|
|
49
|
+
else {
|
|
50
|
+
if m == 0 as usize {
|
|
51
|
+
break;
|
|
52
|
+
}
|
|
53
|
+
r = m - 1;
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
index
|
|
57
|
+
}
|
|
25
58
|
|
|
26
59
|
fn input_data(
|
|
27
60
|
filename: &String,
|
|
@@ -34,9 +67,9 @@ fn input_data(
|
|
|
34
67
|
Vec<String>,
|
|
35
68
|
Vec<String>,
|
|
36
69
|
) {
|
|
37
|
-
|
|
38
|
-
let
|
|
39
|
-
let mut
|
|
70
|
+
let input_time = Instant::now();
|
|
71
|
+
//let mut rdr = csv::Reader::from_path(path).unwrap();
|
|
72
|
+
let mut file = File::open(filename).unwrap();
|
|
40
73
|
let mut num_lines: usize = 0;
|
|
41
74
|
let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
|
|
42
75
|
let mut gene_names: Vec<String> = Vec::with_capacity(65000);
|
|
@@ -44,11 +77,12 @@ fn input_data(
|
|
|
44
77
|
let mut num_columns: usize = 0;
|
|
45
78
|
|
|
46
79
|
// Check headers for samples
|
|
47
|
-
let
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
80
|
+
let mut buffer = String::new();
|
|
81
|
+
file.read_to_string(&mut buffer).unwrap();
|
|
82
|
+
// Check headers for samples
|
|
83
|
+
let lines: Vec<&str> = buffer.split('\n').collect::<Vec<&str>>();
|
|
84
|
+
let total_lines = lines.len();
|
|
85
|
+
let headers: Vec<&str> = lines[0].split('\t').collect::<Vec<&str>>();
|
|
52
86
|
//println!("headers:{:?}", headers);
|
|
53
87
|
let mut case_indexes_original: Vec<usize> = Vec::with_capacity(case_list.len());
|
|
54
88
|
let mut control_indexes_original: Vec<usize> = Vec::with_capacity(control_list.len());
|
|
@@ -68,6 +102,7 @@ fn input_data(
|
|
|
68
102
|
}
|
|
69
103
|
}
|
|
70
104
|
}
|
|
105
|
+
let num_cases = case_list.len();
|
|
71
106
|
|
|
72
107
|
for item in control_list {
|
|
73
108
|
//println!("item:{}", item);
|
|
@@ -80,70 +115,223 @@ fn input_data(
|
|
|
80
115
|
}
|
|
81
116
|
}
|
|
82
117
|
}
|
|
118
|
+
let num_controls = control_list.len();
|
|
83
119
|
//println!("case_indexes_original:{:?}", case_indexes_original);
|
|
84
120
|
//println!("control_indexes_original:{:?}", control_indexes_original);
|
|
85
|
-
|
|
121
|
+
case_indexes_original.sort();
|
|
122
|
+
case_indexes_original.dedup();
|
|
123
|
+
control_indexes_original.sort();
|
|
124
|
+
control_indexes_original.dedup();
|
|
86
125
|
let mut case_indexes: Vec<usize> = Vec::with_capacity(case_list.len());
|
|
87
126
|
let mut control_indexes: Vec<usize> = Vec::with_capacity(control_list.len());
|
|
88
|
-
|
|
89
|
-
//
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
127
|
+
if lines.len() * (case_indexes_original.len() + control_indexes_original.len()) < PAR_CUTOFF {
|
|
128
|
+
// Single-threaded path: taken when (number of lines x number of selected case/control columns) is below PAR_CUTOFF
|
|
129
|
+
let lines_slice = &lines[..];
|
|
130
|
+
for line_iter in 1..lines_slice.len() - 1 {
|
|
131
|
+
// Subtracting 1 from total length of lines_slice because the last one will be empty
|
|
132
|
+
let line = lines_slice[line_iter];
|
|
133
|
+
let mut index = 0;
|
|
134
|
+
for field in line.split('\t').collect::<Vec<&str>>() {
|
|
135
|
+
if index == gene_name_index.unwrap() {
|
|
136
|
+
gene_names.push(field.to_string());
|
|
137
|
+
} else if index == gene_symbol_index.unwrap() {
|
|
138
|
+
gene_symbols.push(field.to_string());
|
|
139
|
+
} else if binary_search(&case_indexes_original, index) != -1 {
|
|
140
|
+
let num = FromStr::from_str(field);
|
|
141
|
+
match num {
|
|
142
|
+
Ok(n) => {
|
|
143
|
+
//println!("n:{}", n);
|
|
144
|
+
input_vector.push(n);
|
|
145
|
+
if num_lines == 0 {
|
|
146
|
+
case_indexes.push(num_columns);
|
|
147
|
+
num_columns += 1;
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
Err(_n) => {
|
|
151
|
+
panic!(
|
|
152
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
153
|
+
field,
|
|
154
|
+
num_lines + 1,
|
|
155
|
+
index + 1
|
|
156
|
+
);
|
|
108
157
|
}
|
|
109
158
|
}
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
159
|
+
} else if binary_search(&control_indexes_original, index) != -1 {
|
|
160
|
+
let num = FromStr::from_str(field);
|
|
161
|
+
match num {
|
|
162
|
+
Ok(n) => {
|
|
163
|
+
//println!("n:{}", n);
|
|
164
|
+
input_vector.push(n);
|
|
165
|
+
if num_lines == 0 {
|
|
166
|
+
control_indexes.push(num_columns);
|
|
167
|
+
num_columns += 1;
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
Err(_n) => {
|
|
171
|
+
panic!(
|
|
172
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
173
|
+
field,
|
|
174
|
+
num_lines + 1,
|
|
175
|
+
index + 1
|
|
176
|
+
);
|
|
177
|
+
}
|
|
117
178
|
}
|
|
118
179
|
}
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
180
|
+
index += 1;
|
|
181
|
+
}
|
|
182
|
+
num_lines += 1;
|
|
183
|
+
}
|
|
184
|
+
} else {
|
|
185
|
+
// Multithreaded implementation for parsing data in parallel starts from here
|
|
186
|
+
// Generally in Rust a value has only one owner at a time, but the `Arc` type (an atomically reference-counted pointer) is special and allows multiple threads to share access to the same data.
|
|
187
|
+
let case_indexes_original = Arc::new(case_indexes_original);
|
|
188
|
+
let control_indexes_original = Arc::new(control_indexes_original);
|
|
189
|
+
let buffer = Arc::new(buffer);
|
|
190
|
+
let case_indexes_temp = Arc::new(Mutex::new(Vec::<usize>::with_capacity(case_list.len())));
|
|
191
|
+
let control_indexes_temp =
|
|
192
|
+
Arc::new(Mutex::new(Vec::<usize>::with_capacity(control_list.len())));
|
|
193
|
+
let num_lines_temp = Arc::new(Mutex::<usize>::new(0));
|
|
194
|
+
let num_columns_temp = Arc::new(Mutex::<usize>::new(0));
|
|
195
|
+
let genes_names_temp = Arc::new(Mutex::new(Vec::<String>::new()));
|
|
196
|
+
let genes_symbols_temp = Arc::new(Mutex::new(Vec::<String>::new()));
|
|
197
|
+
let input_vector_temp = Arc::new(Mutex::new(Vec::<f64>::new()));
|
|
198
|
+
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
199
|
+
println!("Number of threads used:{}", max_threads);
|
|
200
|
+
for thread_num in 0..max_threads {
|
|
201
|
+
let case_indexes_original = Arc::clone(&case_indexes_original);
|
|
202
|
+
let control_indexes_original = Arc::clone(&control_indexes_original);
|
|
203
|
+
let case_indexes_temp = Arc::clone(&case_indexes_temp);
|
|
204
|
+
let control_indexes_temp = Arc::clone(&control_indexes_temp);
|
|
205
|
+
let input_vector_temp = Arc::clone(&input_vector_temp);
|
|
206
|
+
let genes_names_temp = Arc::clone(&genes_names_temp);
|
|
207
|
+
let genes_symbols_temp = Arc::clone(&genes_symbols_temp);
|
|
208
|
+
let num_lines_temp = Arc::clone(&num_lines_temp);
|
|
209
|
+
let num_columns_temp = Arc::clone(&num_columns_temp);
|
|
210
|
+
let buffer = Arc::clone(&buffer);
|
|
211
|
+
let handle = thread::spawn(move || {
|
|
212
|
+
let mut case_indexes_thread: Vec<usize> = Vec::with_capacity(num_cases);
|
|
213
|
+
let mut control_indexes_thread: Vec<usize> = Vec::with_capacity(num_controls);
|
|
214
|
+
let mut genes_names_thread: Vec<String> = Vec::with_capacity(65000);
|
|
215
|
+
let mut genes_symbols_thread: Vec<String> = Vec::with_capacity(65000);
|
|
216
|
+
let mut input_vector_thread: Vec<f64> = Vec::with_capacity(65000);
|
|
217
|
+
let mut num_columns_thread: usize = 0;
|
|
218
|
+
let mut num_lines_thread: usize = 0;
|
|
219
|
+
let lines: Vec<&str> = buffer.split('\n').collect();
|
|
220
|
+
//println!("case_indexes_original:{:?}", case_indexes_original);
|
|
221
|
+
//println!("control_indexes:{:?}", control_indexes);
|
|
222
|
+
for line_iter in 1..total_lines - 1 {
|
|
223
|
+
let remainder: usize = line_iter % max_threads; // Calculate remainder of line number divided by max_threads to decide which thread parses this line
|
|
224
|
+
if remainder == thread_num {
|
|
225
|
+
//println!("buffer:{}", buffer);
|
|
226
|
+
// The thread analyzing a particular line must have the same remainder as the thread_num; this prevents multiple threads from parsing the same line
|
|
227
|
+
let line = lines[line_iter];
|
|
228
|
+
let mut index = 0;
|
|
229
|
+
for field in line.split('\t').collect::<Vec<&str>>() {
|
|
230
|
+
if index == gene_name_index.unwrap() {
|
|
231
|
+
genes_names_thread.push(field.to_string());
|
|
232
|
+
} else if index == gene_symbol_index.unwrap() {
|
|
233
|
+
genes_symbols_thread.push(field.to_string());
|
|
234
|
+
} else if binary_search(&case_indexes_original, index) != -1 {
|
|
235
|
+
let num = FromStr::from_str(field);
|
|
236
|
+
match num {
|
|
237
|
+
Ok(n) => {
|
|
238
|
+
//println!("n:{}", n);
|
|
239
|
+
input_vector_thread.push(n);
|
|
240
|
+
if line_iter == 1 {
|
|
241
|
+
case_indexes_thread.push(num_columns_thread);
|
|
242
|
+
num_columns_thread += 1;
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
Err(_n) => {
|
|
246
|
+
panic!(
|
|
247
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
248
|
+
field,
|
|
249
|
+
num_lines_thread + 1,
|
|
250
|
+
index + 1
|
|
251
|
+
);
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
} else if binary_search(&control_indexes_original, index) != -1 {
|
|
255
|
+
let num = FromStr::from_str(field);
|
|
256
|
+
match num {
|
|
257
|
+
Ok(n) => {
|
|
258
|
+
//println!("n:{}", n);
|
|
259
|
+
input_vector_thread.push(n);
|
|
260
|
+
if line_iter == 1 {
|
|
261
|
+
control_indexes_thread.push(num_columns_thread);
|
|
262
|
+
num_columns_thread += 1;
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
Err(_n) => {
|
|
266
|
+
panic!(
|
|
267
|
+
"Number {} in line {} and column {} is not a decimal number",
|
|
268
|
+
field,
|
|
269
|
+
num_lines_thread + 1,
|
|
270
|
+
index + 1
|
|
271
|
+
);
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
index += 1;
|
|
128
276
|
}
|
|
129
|
-
|
|
130
|
-
Err(_n) => {
|
|
131
|
-
panic!(
|
|
132
|
-
"Number {} in line {} and column {} is not a decimal number",
|
|
133
|
-
field,
|
|
134
|
-
num_lines + 1,
|
|
135
|
-
index + 1
|
|
136
|
-
);
|
|
277
|
+
num_lines_thread += 1;
|
|
137
278
|
}
|
|
138
279
|
}
|
|
139
|
-
|
|
140
|
-
|
|
280
|
+
input_vector_temp
|
|
281
|
+
.lock()
|
|
282
|
+
.unwrap()
|
|
283
|
+
.append(&mut input_vector_thread);
|
|
284
|
+
case_indexes_temp
|
|
285
|
+
.lock()
|
|
286
|
+
.unwrap()
|
|
287
|
+
.append(&mut case_indexes_thread);
|
|
288
|
+
control_indexes_temp
|
|
289
|
+
.lock()
|
|
290
|
+
.unwrap()
|
|
291
|
+
.append(&mut control_indexes_thread);
|
|
292
|
+
genes_names_temp
|
|
293
|
+
.lock()
|
|
294
|
+
.unwrap()
|
|
295
|
+
.append(&mut genes_names_thread);
|
|
296
|
+
genes_symbols_temp
|
|
297
|
+
.lock()
|
|
298
|
+
.unwrap()
|
|
299
|
+
.append(&mut genes_symbols_thread);
|
|
300
|
+
*num_lines_temp.lock().unwrap() += num_lines_thread;
|
|
301
|
+
if num_columns_thread > 0 {
|
|
302
|
+
*num_columns_temp.lock().unwrap() += num_columns_thread;
|
|
303
|
+
}
|
|
304
|
+
drop(input_vector_temp);
|
|
305
|
+
drop(case_indexes_temp);
|
|
306
|
+
drop(control_indexes_temp);
|
|
307
|
+
drop(genes_names_temp);
|
|
308
|
+
drop(genes_symbols_temp);
|
|
309
|
+
drop(num_lines_temp);
|
|
310
|
+
drop(num_columns_temp);
|
|
311
|
+
});
|
|
312
|
+
handles.push(handle); // The handle (which contains the thread) is stored in the handles vector
|
|
313
|
+
}
|
|
314
|
+
for handle in handles {
|
|
315
|
+
// Wait for all threads to finish before proceeding further
|
|
316
|
+
handle.join().unwrap();
|
|
141
317
|
}
|
|
142
|
-
|
|
318
|
+
// Combining data from all different threads
|
|
319
|
+
input_vector.append(&mut *input_vector_temp.lock().unwrap());
|
|
320
|
+
case_indexes.append(&mut *case_indexes_temp.lock().unwrap());
|
|
321
|
+
control_indexes.append(&mut *control_indexes_temp.lock().unwrap());
|
|
322
|
+
gene_names.append(&mut *genes_names_temp.lock().unwrap());
|
|
323
|
+
gene_symbols.append(&mut *genes_symbols_temp.lock().unwrap());
|
|
324
|
+
|
|
325
|
+
num_lines += *num_lines_temp.lock().unwrap();
|
|
326
|
+
num_columns += *num_columns_temp.lock().unwrap();
|
|
143
327
|
}
|
|
144
328
|
//println!("case_indexes:{:?}", case_indexes);
|
|
145
329
|
//println!("control_indexes:{:?}", control_indexes);
|
|
146
|
-
|
|
330
|
+
//println!("num_lines:{}", num_lines);
|
|
331
|
+
//println!("num_columns:{}", num_columns);
|
|
332
|
+
//println!("num_lines * num_columns:{}", num_lines * num_columns);
|
|
333
|
+
//println!("input_vector:{:?}", input_vector.len());
|
|
334
|
+
println!("Time for inputting data:{:?}", input_time.elapsed());
|
|
147
335
|
let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
|
|
148
336
|
//println!("dm:{:?}", dm);
|
|
149
337
|
(dm, case_indexes, control_indexes, gene_names, gene_symbols)
|
|
@@ -198,6 +386,7 @@ fn main() {
|
|
|
198
386
|
let control_list: Vec<&str> = control_string.split(",").collect();
|
|
199
387
|
let (input_matrix, case_indexes, control_indexes, gene_names, gene_symbols) =
|
|
200
388
|
input_data(file_name, &case_list, &control_list);
|
|
389
|
+
let filtering_time = Instant::now();
|
|
201
390
|
let (filtered_matrix, lib_sizes, filtered_genes, filtered_gene_symbols) =
|
|
202
391
|
filter_by_expr(
|
|
203
392
|
&input_matrix,
|
|
@@ -206,10 +395,21 @@ fn main() {
|
|
|
206
395
|
gene_names,
|
|
207
396
|
gene_symbols,
|
|
208
397
|
);
|
|
398
|
+
println!("filtering time:{:?}", filtering_time.elapsed());
|
|
209
399
|
//println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
|
|
210
400
|
//println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
|
|
401
|
+
let cpm_normalization_time = Instant::now();
|
|
211
402
|
let mut normalized_matrix = cpm(&filtered_matrix);
|
|
403
|
+
println!(
|
|
404
|
+
"cpm normalization time:{:?}",
|
|
405
|
+
cpm_normalization_time.elapsed()
|
|
406
|
+
);
|
|
407
|
+
let tmm_normalization_time = Instant::now();
|
|
212
408
|
let norm_factors = tmm_normalization(filtered_matrix, &lib_sizes);
|
|
409
|
+
println!(
|
|
410
|
+
"tmm normalization time:{:?}",
|
|
411
|
+
tmm_normalization_time.elapsed()
|
|
412
|
+
);
|
|
213
413
|
//println!("norm_factors:{:?}", norm_factors);
|
|
214
414
|
|
|
215
415
|
for col in 0..normalized_matrix.ncols() {
|
|
@@ -232,58 +432,154 @@ fn main() {
|
|
|
232
432
|
|
|
233
433
|
//println!("case_indexes:{:?}", case_indexes);
|
|
234
434
|
//println!("control_indexes:{:?}", control_indexes);
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
//println!("
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
435
|
+
let num_normalized_rows = normalized_matrix.nrows();
|
|
436
|
+
if normalized_matrix.nrows() * normalized_matrix.ncols() < PAR_CUTOFF {
|
|
437
|
+
for i in 0..normalized_matrix.nrows() {
|
|
438
|
+
let row = normalized_matrix.row(i);
|
|
439
|
+
//println!("row:{:?}", row);
|
|
440
|
+
let mut treated = Vec::<f64>::new();
|
|
441
|
+
let mut control = Vec::<f64>::new();
|
|
442
|
+
//println!("conditions:{:?}", conditions);
|
|
443
|
+
for j in 0..(case_indexes.len() + control_indexes.len()) {
|
|
444
|
+
//println!("row[(0, j)]:{}", row[(0, j)]);
|
|
445
|
+
if case_indexes.contains(&j) {
|
|
446
|
+
treated.push(row[(0, j)]);
|
|
447
|
+
//println!("{},{}", input_data_vec.0[i][j], "Diseased");
|
|
448
|
+
} else if control_indexes.contains(&j) {
|
|
449
|
+
// + 1 was added because in the input file the first column of the first row is blank as the first column consists of gene names
|
|
450
|
+
control.push(row[(0, j)]);
|
|
451
|
+
//println!("{},{}", input_data_vec.0[i][j], "Control");
|
|
452
|
+
} else {
|
|
453
|
+
panic!(
|
|
454
|
+
"Column {} could not be classified into case/control",
|
|
455
|
+
j
|
|
456
|
+
);
|
|
457
|
+
}
|
|
252
458
|
}
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
true
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
if (treated_mean.unwrap() / control_mean.unwrap())
|
|
266
|
-
.log2()
|
|
267
|
-
.is_nan()
|
|
268
|
-
== false
|
|
269
|
-
&& (treated_mean.unwrap() / control_mean.unwrap())
|
|
459
|
+
//println!("treated{:?}", treated);
|
|
460
|
+
//println!("control{:?}", control);
|
|
461
|
+
let p_value = wilcoxon_rank_sum_test(
|
|
462
|
+
treated.clone(),
|
|
463
|
+
control.clone(),
|
|
464
|
+
THRESHOLD,
|
|
465
|
+
't',
|
|
466
|
+
true,
|
|
467
|
+
); // Setting continuity correction to true in case of normal approximation
|
|
468
|
+
let treated_mean = Data::new(treated).mean();
|
|
469
|
+
let control_mean = Data::new(control).mean();
|
|
470
|
+
if (treated_mean.unwrap() / control_mean.unwrap())
|
|
270
471
|
.log2()
|
|
271
|
-
.
|
|
472
|
+
.is_nan()
|
|
272
473
|
== false
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
474
|
+
&& (treated_mean.unwrap() / control_mean.unwrap())
|
|
475
|
+
.log2()
|
|
476
|
+
.is_infinite()
|
|
477
|
+
== false
|
|
478
|
+
{
|
|
479
|
+
p_values.push(PValueIndexes {
|
|
480
|
+
index: i,
|
|
481
|
+
gene_name: filtered_genes[i].to_owned(),
|
|
482
|
+
gene_symbol: filtered_gene_symbols[i].to_owned(),
|
|
483
|
+
fold_change: (treated_mean.unwrap() / control_mean.unwrap())
|
|
484
|
+
.log2(),
|
|
485
|
+
p_value: p_value,
|
|
486
|
+
});
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
} else {
|
|
490
|
+
// Multithreaded implementation of calculating wilcoxon p-values
|
|
491
|
+
let normalized_matrix_temp = Arc::new(normalized_matrix);
|
|
492
|
+
let filtered_genes_temp = Arc::new(filtered_genes);
|
|
493
|
+
let filtered_gene_symbols_temp = Arc::new(filtered_gene_symbols);
|
|
494
|
+
let case_indexes_temp = Arc::new(case_indexes);
|
|
495
|
+
let control_indexes_temp = Arc::new(control_indexes);
|
|
496
|
+
let p_values_temp = Arc::new(Mutex::new(Vec::<PValueIndexes>::new()));
|
|
497
|
+
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
498
|
+
for thread_num in 0..max_threads {
|
|
499
|
+
let normalized_matrix_temp = Arc::clone(&normalized_matrix_temp);
|
|
500
|
+
let case_indexes_temp = Arc::clone(&case_indexes_temp);
|
|
501
|
+
let control_indexes_temp = Arc::clone(&control_indexes_temp);
|
|
502
|
+
let p_values_temp = Arc::clone(&p_values_temp);
|
|
503
|
+
let filtered_genes_temp = Arc::clone(&filtered_genes_temp);
|
|
504
|
+
let filtered_gene_symbols_temp =
|
|
505
|
+
Arc::clone(&filtered_gene_symbols_temp);
|
|
506
|
+
let handle = thread::spawn(move || {
|
|
507
|
+
let mut p_values_thread: Vec<PValueIndexes> = Vec::with_capacity(
|
|
508
|
+
normalized_matrix_temp.nrows() / max_threads,
|
|
509
|
+
);
|
|
510
|
+
for i in 0..normalized_matrix_temp.nrows() {
|
|
511
|
+
let remainder: usize = i % max_threads; // Calculate remainder of iteration number divided by max_threads to decide which thread parses the row
|
|
512
|
+
if remainder == thread_num {
|
|
513
|
+
let row = normalized_matrix_temp.row(i);
|
|
514
|
+
//println!("row:{:?}", row);
|
|
515
|
+
let mut treated = Vec::<f64>::new();
|
|
516
|
+
let mut control = Vec::<f64>::new();
|
|
517
|
+
//println!("conditions:{:?}", conditions);
|
|
518
|
+
for j in 0..(case_indexes_temp.len()
|
|
519
|
+
+ control_indexes_temp.len())
|
|
520
|
+
{
|
|
521
|
+
//println!("row[(0, j)]:{}", row[(0, j)]);
|
|
522
|
+
if case_indexes_temp.contains(&j) {
|
|
523
|
+
treated.push(row[(0, j)]);
|
|
524
|
+
//println!("{},{}", input_data_vec.0[i][j], "Diseased");
|
|
525
|
+
} else if control_indexes_temp.contains(&j) {
|
|
526
|
+
// + 1 was added because in the input file the first column of the first row is blank as the first column consists of gene names
|
|
527
|
+
control.push(row[(0, j)]);
|
|
528
|
+
//println!("{},{}", input_data_vec.0[i][j], "Control");
|
|
529
|
+
} else {
|
|
530
|
+
panic!(
|
|
531
|
+
"Column {} could not be classified into case/control",
|
|
532
|
+
j
|
|
533
|
+
);
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
//println!("treated{:?}", treated);
|
|
537
|
+
//println!("control{:?}", control);
|
|
538
|
+
let p_value = wilcoxon_rank_sum_test(
|
|
539
|
+
treated.clone(),
|
|
540
|
+
control.clone(),
|
|
541
|
+
THRESHOLD,
|
|
542
|
+
't',
|
|
543
|
+
true,
|
|
544
|
+
); // Setting continuity correction to true in case of normal approximation
|
|
545
|
+
let treated_mean = Data::new(treated).mean();
|
|
546
|
+
let control_mean = Data::new(control).mean();
|
|
547
|
+
if (treated_mean.unwrap() / control_mean.unwrap())
|
|
548
|
+
.log2()
|
|
549
|
+
.is_nan()
|
|
550
|
+
== false
|
|
551
|
+
&& (treated_mean.unwrap() / control_mean.unwrap())
|
|
552
|
+
.log2()
|
|
553
|
+
.is_infinite()
|
|
554
|
+
== false
|
|
555
|
+
{
|
|
556
|
+
p_values_thread.push(PValueIndexes {
|
|
557
|
+
index: i,
|
|
558
|
+
gene_name: filtered_genes_temp[i].to_owned(),
|
|
559
|
+
gene_symbol: filtered_gene_symbols_temp[i]
|
|
560
|
+
.to_owned(),
|
|
561
|
+
fold_change: (treated_mean.unwrap()
|
|
562
|
+
/ control_mean.unwrap())
|
|
563
|
+
.log2(),
|
|
564
|
+
p_value: p_value,
|
|
565
|
+
});
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
}
|
|
569
|
+
p_values_temp.lock().unwrap().append(&mut p_values_thread);
|
|
280
570
|
});
|
|
571
|
+
handles.push(handle);
|
|
281
572
|
}
|
|
573
|
+
for handle in handles {
|
|
574
|
+
// Wait for all threads to finish before proceeding further
|
|
575
|
+
handle.join().unwrap();
|
|
576
|
+
}
|
|
577
|
+
p_values.append(&mut *p_values_temp.lock().unwrap());
|
|
282
578
|
}
|
|
283
579
|
//println!("p_values:{:?}", p_values);
|
|
284
580
|
println!(
|
|
285
581
|
"Time for running {} wilcoxon tests:{:?}",
|
|
286
|
-
|
|
582
|
+
num_normalized_rows,
|
|
287
583
|
now2.elapsed()
|
|
288
584
|
);
|
|
289
585
|
let adjusted_p_values = adjust_p_values(p_values);
|
|
@@ -408,18 +704,62 @@ fn tmm_normalization(
|
|
|
408
704
|
}
|
|
409
705
|
}
|
|
410
706
|
//println!("ref_column:{}", ref_column);
|
|
411
|
-
let
|
|
412
|
-
let ref_lib_size = lib_sizes[ref_column];
|
|
707
|
+
let num_cols = input_matrix.ncols();
|
|
413
708
|
let mut f: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
|
|
414
|
-
|
|
415
|
-
let
|
|
416
|
-
let
|
|
417
|
-
|
|
418
|
-
obs_data
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
709
|
+
if input_matrix.nrows() * input_matrix.ncols() < PAR_CUTOFF {
|
|
710
|
+
let ref_data = input_matrix.column(ref_column);
|
|
711
|
+
let ref_lib_size = lib_sizes[ref_column];
|
|
712
|
+
for col in 0..input_matrix.ncols() {
|
|
713
|
+
let obs_data = input_matrix.column(col);
|
|
714
|
+
let obs_lib_size = lib_sizes[col];
|
|
715
|
+
f.push(calc_factor_tmm(
|
|
716
|
+
obs_data,
|
|
717
|
+
&ref_data,
|
|
718
|
+
ref_lib_size,
|
|
719
|
+
obs_lib_size,
|
|
720
|
+
));
|
|
721
|
+
}
|
|
722
|
+
} else {
|
|
723
|
+
// Multithreaded implementation of TMM normalization
|
|
724
|
+
let f_temp = Arc::new(Mutex::new(Vec::<f_index>::new()));
|
|
725
|
+
let lib_sizes_temp = Arc::new(lib_sizes.to_owned());
|
|
726
|
+
let input_matrix_temp = Arc::new(input_matrix);
|
|
727
|
+
let mut handles = vec![]; // Vector to store handle which is used to prevent one thread going ahead of another
|
|
728
|
+
for thread_num in 0..max_threads {
|
|
729
|
+
let f_temp = Arc::clone(&f_temp);
|
|
730
|
+
let lib_sizes_temp = Arc::clone(&lib_sizes_temp);
|
|
731
|
+
let input_matrix_temp = Arc::clone(&input_matrix_temp);
|
|
732
|
+
let handle = thread::spawn(move || {
|
|
733
|
+
let mut f_thread: Vec<f_index> =
|
|
734
|
+
Vec::with_capacity(input_matrix_temp.ncols() / max_threads);
|
|
735
|
+
let ref_data = input_matrix_temp.column(ref_column);
|
|
736
|
+
let ref_lib_size = lib_sizes_temp[ref_column];
|
|
737
|
+
for col in 0..input_matrix_temp.ncols() {
|
|
738
|
+
let remainder: usize = col % max_threads; // Calculate remainder of column number divided by max_threads to decide which thread parses this column
|
|
739
|
+
if remainder == thread_num {
|
|
740
|
+
let obs_data = input_matrix_temp.column(col);
|
|
741
|
+
let obs_lib_size = lib_sizes_temp[col];
|
|
742
|
+
f_thread.push(f_index {
|
|
743
|
+
f: calc_factor_tmm(obs_data, &ref_data, ref_lib_size, obs_lib_size),
|
|
744
|
+
ind: col,
|
|
745
|
+
})
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
f_temp.lock().unwrap().append(&mut f_thread);
|
|
749
|
+
});
|
|
750
|
+
handles.push(handle);
|
|
751
|
+
}
|
|
752
|
+
for handle in handles {
|
|
753
|
+
// Wait for all threads to finish before proceeding further
|
|
754
|
+
handle.join().unwrap();
|
|
755
|
+
}
|
|
756
|
+
let mut f_orig: Vec<f_index> = Vec::with_capacity(num_cols);
|
|
757
|
+
f_orig.append(&mut *f_temp.lock().unwrap());
|
|
758
|
+
// The vector needs to be sorted because, due to multithreading, its elements will not be ordered according to `ind`
|
|
759
|
+
f_orig
|
|
760
|
+
.as_mut_slice()
|
|
761
|
+
.sort_by(|a, b| (a.ind).partial_cmp(&b.ind).unwrap_or(Ordering::Equal));
|
|
762
|
+
f = f_orig.into_iter().map(|x| x.f).collect::<Vec<f64>>();
|
|
423
763
|
}
|
|
424
764
|
const NATURAL_E: f64 = 2.718281828459;
|
|
425
765
|
let log_f: Vec<f64> = f.clone().into_iter().map(|x| x.log(NATURAL_E)).collect();
|
|
@@ -427,6 +767,11 @@ fn tmm_normalization(
|
|
|
427
767
|
let final_f: Vec<f64> = f.into_iter().map(|x| x / exp_mean_log_f).collect();
|
|
428
768
|
final_f
|
|
429
769
|
}
|
|
770
|
+
#[allow(non_camel_case_types)]
|
|
771
|
+
struct f_index {
|
|
772
|
+
f: f64,
|
|
773
|
+
ind: usize,
|
|
774
|
+
}
|
|
430
775
|
|
|
431
776
|
fn calc_factor_tmm(
|
|
432
777
|
obs_data: Matrix<f64, Dyn, Const<1>, ViewStorage<'_, f64, Dyn, Const<1>, Const<1>, Dyn>>,
|
package/src/indel.rs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// Syntax: cd .. && cargo build --release
|
|
2
2
|
|
|
3
3
|
// Test case below:
|
|
4
|
-
//Debug syntax: cd .. && cargo build --release && time cat ~/sjpp/test.txt | ~/proteinpaint/rust/target/release/indel
|
|
4
|
+
//Debug syntax: cd .. && cargo build --release && time cat ~/sjpp/test.txt | ~/sjpp/proteinpaint/rust/target/release/indel
|
|
5
5
|
|
|
6
6
|
// Strictness:
|
|
7
7
|
// 0: No postprocessing, pure indel typing results
|
|
@@ -848,7 +848,7 @@ fn main() {
|
|
|
848
848
|
let remainder: usize = iter % max_threads; // Calculate remainder of read number divided by max_threads to decide which thread parses this read
|
|
849
849
|
//println!("iter:{}", iter);
|
|
850
850
|
if remainder == thread_num {
|
|
851
|
-
// Thread analyzing a particular read must have the same remainder as the thread_num, this avoids multiple
|
|
851
|
+
// The thread analyzing a particular read must have the same remainder as the thread_num; this prevents multiple threads from parsing the same read. Also checking if the read length > 0
|
|
852
852
|
|
|
853
853
|
//println!(
|
|
854
854
|
// "start_positions_list:{}",
|
|
@@ -1356,7 +1356,12 @@ fn main() {
|
|
|
1356
1356
|
//let mut output_string = "[".to_string();
|
|
1357
1357
|
//output_string += &all_alleles.to_string();
|
|
1358
1358
|
output_string.pop();
|
|
1359
|
-
output_string
|
|
1359
|
+
if output_string.len() == 0 {
|
|
1360
|
+
// Pass empty JSON "[]" when no reads are passed back to nodejs
|
|
1361
|
+
output_string = "[]".to_string();
|
|
1362
|
+
} else {
|
|
1363
|
+
output_string += &"]".to_string();
|
|
1364
|
+
}
|
|
1360
1365
|
println!("Final_output:{:?}", output_string);
|
|
1361
1366
|
}
|
|
1362
1367
|
Err(error) => println!("Incorrect json: {}", error),
|