@sjcrh/proteinpaint-rust 2.132.1-0 → 2.135.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/gdcGRIN2.rs +283 -169
package/package.json
CHANGED
package/src/gdcGRIN2.rs
CHANGED
|
@@ -6,14 +6,18 @@
|
|
|
6
6
|
2. Better timeout handling with retries
|
|
7
7
|
3. More detailed error reporting
|
|
8
8
|
4. Continues processing even when some files fail
|
|
9
|
+
5. Added chromosome filtering
|
|
9
10
|
|
|
10
11
|
Input JSON:
|
|
11
12
|
caseFiles
|
|
12
13
|
mafOptions: For SNVindel filtering
|
|
14
|
+
cnvOptions: For CNV filtering
|
|
15
|
+
chromosomes: List of chromosomes to include, e.g. ["chr1","chr2"]. An empty list ([]) disables chromosome filtering.
|
|
16
|
+
|
|
13
17
|
Output mutations as JSON array.
|
|
14
18
|
|
|
15
19
|
Example of usage:
|
|
16
|
-
echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":
|
|
20
|
+
echo '{"caseFiles": {"MP2PRT-PATFJE": {"maf": "26ea7b6f-8bc4-4e83-ace1-2125b493a361"},"MP2PRT-PAPIGD": {"maf": "653d7458-f4af-4328-a1ce-3bbf22a2e347"}, "TCGA-CG-4300": { "cnv":"46372ec2-ff79-4d07-b375-9ba8a12c11f3", "maf":"c09b208d-2e7b-4116-9580-27f20f4c7e67"}},"mafOptions": {"minTotalDepth": 100,"minAltAlleleCount": 20,"hyperMutator":8000,"consequences":["missense_variant","frameshift_variant"]}, "cnvOptions":{"lossThreshold":-1, "gainThreshold": 1.5, "segLength":2000000, "hyperMutator":8000}, "chromosomes":["chr1","chr2","chr3"]}' | ./target/release/gdcGRIN2
|
|
17
21
|
*/
|
|
18
22
|
|
|
19
23
|
use flate2::read::GzDecoder;
|
|
@@ -21,7 +25,7 @@ use futures::StreamExt;
|
|
|
21
25
|
use memchr::memchr;
|
|
22
26
|
use serde::{Deserialize, Serialize};
|
|
23
27
|
use serde_json;
|
|
24
|
-
use std::collections::HashMap;
|
|
28
|
+
use std::collections::{HashMap, HashSet};
|
|
25
29
|
use std::io::{self, Read};
|
|
26
30
|
use std::sync::Arc;
|
|
27
31
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
@@ -69,6 +73,8 @@ struct CnvOptions {
|
|
|
69
73
|
gain_threshold: f32,
|
|
70
74
|
#[serde(rename = "segLength")]
|
|
71
75
|
seg_length: i32,
|
|
76
|
+
#[serde(rename = "hyperMutator")]
|
|
77
|
+
hyper_mutator: i32,
|
|
72
78
|
}
|
|
73
79
|
|
|
74
80
|
// Individual successful file output (JSONL format)
|
|
@@ -94,6 +100,7 @@ struct FilteredMafDetails {
|
|
|
94
100
|
excluded_by_consequence_type: usize,
|
|
95
101
|
total_processed: usize,
|
|
96
102
|
total_included: usize,
|
|
103
|
+
skipped_chromosomes: HashMap<String, usize>,
|
|
97
104
|
}
|
|
98
105
|
|
|
99
106
|
// struct for CNV filter details
|
|
@@ -107,6 +114,7 @@ struct FilteredCnvDetails {
|
|
|
107
114
|
excluded_by_segment_length: usize,
|
|
108
115
|
total_processed: usize,
|
|
109
116
|
total_included: usize,
|
|
117
|
+
skipped_chromosomes: HashMap<String, usize>,
|
|
110
118
|
}
|
|
111
119
|
|
|
112
120
|
// struct for per-case filter details
|
|
@@ -131,7 +139,7 @@ struct FinalSummary {
|
|
|
131
139
|
included_maf_records: usize,
|
|
132
140
|
included_cnv_records: usize,
|
|
133
141
|
filtered_records_by_case: HashMap<String, FilteredCaseDetails>,
|
|
134
|
-
hyper_mutator_records: Vec<String
|
|
142
|
+
hyper_mutator_records: HashMap<String, Vec<String>>,
|
|
135
143
|
}
|
|
136
144
|
|
|
137
145
|
// Define the top-level input structure
|
|
@@ -143,6 +151,7 @@ struct InputData {
|
|
|
143
151
|
maf_options: Option<MafOptions>,
|
|
144
152
|
#[serde(rename = "cnvOptions")]
|
|
145
153
|
cnv_options: Option<CnvOptions>,
|
|
154
|
+
chromosomes: Vec<String>,
|
|
146
155
|
}
|
|
147
156
|
|
|
148
157
|
// Configuration for different data types
|
|
@@ -159,17 +168,19 @@ async fn parse_content(
|
|
|
159
168
|
data_type: &str,
|
|
160
169
|
min_total_depth: i32,
|
|
161
170
|
min_alt_allele_count: i32,
|
|
162
|
-
|
|
171
|
+
maf_hyper_mutator: i32,
|
|
163
172
|
consequences: &Option<Vec<String>>,
|
|
164
173
|
gain_threshold: f32,
|
|
165
174
|
loss_threshold: f32,
|
|
166
175
|
seg_length: i32,
|
|
176
|
+
cnv_hyper_mutator: i32,
|
|
177
|
+
chromosomes: &HashSet<String>,
|
|
167
178
|
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
168
179
|
filtered_maf_records: &AtomicUsize,
|
|
169
180
|
filtered_cnv_records: &AtomicUsize,
|
|
170
181
|
included_maf_records: &AtomicUsize,
|
|
171
182
|
included_cnv_records: &AtomicUsize,
|
|
172
|
-
hyper_mutator_records: &Arc<Mutex<Vec<String
|
|
183
|
+
hyper_mutator_records: &Arc<Mutex<HashMap<String, Vec<String>>>>,
|
|
173
184
|
) -> Result<Vec<Vec<String>>, (String, String, String)> {
|
|
174
185
|
let config = match data_type {
|
|
175
186
|
"cnv" => DataTypeConfig {
|
|
@@ -189,13 +200,24 @@ async fn parse_content(
|
|
|
189
200
|
}
|
|
190
201
|
};
|
|
191
202
|
|
|
192
|
-
// check hyperMutator for MAF files
|
|
193
|
-
if data_type == "maf"
|
|
203
|
+
// check hyperMutator for MAF and CNV files
|
|
204
|
+
let hyper_mutator = if data_type == "maf" {
|
|
205
|
+
maf_hyper_mutator
|
|
206
|
+
} else {
|
|
207
|
+
cnv_hyper_mutator
|
|
208
|
+
};
|
|
209
|
+
if hyper_mutator > 0 {
|
|
194
210
|
let line_count = content.lines().count();
|
|
195
211
|
if line_count as i32 > hyper_mutator {
|
|
196
212
|
let mut hyper_records = hyper_mutator_records.lock().await;
|
|
197
|
-
|
|
198
|
-
|
|
213
|
+
hyper_records
|
|
214
|
+
.entry(data_type.to_string())
|
|
215
|
+
.or_insert_with(Vec::new)
|
|
216
|
+
.push(case_id.to_string());
|
|
217
|
+
if data_type == "maf" {
|
|
218
|
+
filtered_maf_records.fetch_add(line_count, Ordering::Relaxed);
|
|
219
|
+
} else if data_type == "cnv" {
|
|
220
|
+
filtered_cnv_records.fetch_add(line_count, Ordering::Relaxed);
|
|
199
221
|
}
|
|
200
222
|
return Ok(Vec::new());
|
|
201
223
|
}
|
|
@@ -205,9 +227,6 @@ async fn parse_content(
|
|
|
205
227
|
let mut parsed_data = Vec::new();
|
|
206
228
|
let mut columns_indices: Vec<usize> = Vec::new();
|
|
207
229
|
let mut variant_classification_index: Option<usize> = None;
|
|
208
|
-
//let mut header_mk: &str = "";
|
|
209
|
-
//let mut columns = Vec::new();
|
|
210
|
-
|
|
211
230
|
let mut header: Vec<String> = Vec::new();
|
|
212
231
|
|
|
213
232
|
for line in lines {
|
|
@@ -229,26 +248,49 @@ async fn parse_content(
|
|
|
229
248
|
continue;
|
|
230
249
|
};
|
|
231
250
|
|
|
232
|
-
let row =
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
251
|
+
let row = match data_type {
|
|
252
|
+
"maf" => {
|
|
253
|
+
process_mafline(
|
|
254
|
+
line,
|
|
255
|
+
case_id,
|
|
256
|
+
data_type,
|
|
257
|
+
&columns_indices,
|
|
258
|
+
variant_classification_index,
|
|
259
|
+
consequences,
|
|
260
|
+
min_total_depth,
|
|
261
|
+
min_alt_allele_count,
|
|
262
|
+
chromosomes,
|
|
263
|
+
filtered_records,
|
|
264
|
+
filtered_maf_records,
|
|
265
|
+
included_maf_records,
|
|
266
|
+
)
|
|
267
|
+
.await
|
|
268
|
+
}
|
|
269
|
+
"cnv" => {
|
|
270
|
+
process_cnvline(
|
|
271
|
+
line,
|
|
272
|
+
case_id,
|
|
273
|
+
data_type,
|
|
274
|
+
&header,
|
|
275
|
+
&columns_indices,
|
|
276
|
+
gain_threshold,
|
|
277
|
+
loss_threshold,
|
|
278
|
+
seg_length,
|
|
279
|
+
chromosomes,
|
|
280
|
+
filtered_records,
|
|
281
|
+
filtered_cnv_records,
|
|
282
|
+
included_cnv_records,
|
|
283
|
+
)
|
|
284
|
+
.await
|
|
285
|
+
}
|
|
286
|
+
_ => {
|
|
287
|
+
return Err((
|
|
288
|
+
case_id.to_string(),
|
|
289
|
+
data_type.to_string(),
|
|
290
|
+
"Invalid data type".to_string(),
|
|
291
|
+
));
|
|
292
|
+
}
|
|
293
|
+
}?;
|
|
252
294
|
|
|
253
295
|
if let Some(out_lst) = row {
|
|
254
296
|
parsed_data.push(out_lst);
|
|
@@ -302,25 +344,20 @@ fn setup_columns(
|
|
|
302
344
|
Ok(())
|
|
303
345
|
}
|
|
304
346
|
|
|
305
|
-
// Process a single row of
|
|
306
|
-
async fn
|
|
347
|
+
// Process a single row of MAF file
|
|
348
|
+
async fn process_mafline(
|
|
307
349
|
line: &str,
|
|
308
350
|
case_id: &str,
|
|
309
351
|
data_type: &str,
|
|
310
|
-
header: &[String],
|
|
311
352
|
columns_indices: &[usize],
|
|
312
353
|
variant_classification_index: Option<usize>,
|
|
313
354
|
consequences: &Option<Vec<String>>,
|
|
314
355
|
min_total_depth: i32,
|
|
315
356
|
min_alt_allele_count: i32,
|
|
316
|
-
|
|
317
|
-
loss_threshold: f32,
|
|
318
|
-
seg_length: i32,
|
|
357
|
+
chromosomes: &HashSet<String>,
|
|
319
358
|
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
320
359
|
filtered_maf_records: &AtomicUsize,
|
|
321
|
-
filtered_cnv_records: &AtomicUsize,
|
|
322
360
|
included_maf_records: &AtomicUsize,
|
|
323
|
-
included_cnv_records: &AtomicUsize,
|
|
324
361
|
) -> Result<Option<Vec<String>>, (String, String, String)> {
|
|
325
362
|
let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
326
363
|
let mut out_lst = vec![case_id.to_string()];
|
|
@@ -333,51 +370,38 @@ async fn process_row(
|
|
|
333
370
|
maf: FilteredMafDetails::default(),
|
|
334
371
|
cnv: FilteredCnvDetails::default(),
|
|
335
372
|
});
|
|
336
|
-
|
|
337
373
|
let case_details = filtered_map.get_mut(case_id).unwrap();
|
|
338
374
|
|
|
339
375
|
// Track total processed records
|
|
340
|
-
|
|
341
|
-
case_details.maf.total_processed += 1;
|
|
342
|
-
} else if data_type == "cnv" {
|
|
343
|
-
case_details.cnv.total_processed += 1;
|
|
344
|
-
}
|
|
376
|
+
case_details.maf.total_processed += 1;
|
|
345
377
|
|
|
346
378
|
// Handle consequence filtering and counting for MAF files
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
*case_details
|
|
356
|
-
.maf
|
|
357
|
-
.matched_consequences
|
|
358
|
-
.entry(variant_classification.to_string())
|
|
359
|
-
.or_insert(0) += 1;
|
|
360
|
-
} else {
|
|
361
|
-
// Unmatched consequence
|
|
362
|
-
*case_details
|
|
363
|
-
.maf
|
|
364
|
-
.rejected_consequences
|
|
365
|
-
.entry(variant_classification.to_string())
|
|
366
|
-
.or_insert(0) += 1;
|
|
367
|
-
case_details.maf.excluded_by_consequence_type += 1;
|
|
368
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
369
|
-
return Ok(None);
|
|
370
|
-
}
|
|
371
|
-
} else {
|
|
372
|
-
// Empty filter, count as matched
|
|
379
|
+
|
|
380
|
+
if let Some(var_class_idx) = variant_classification_index {
|
|
381
|
+
if var_class_idx < cont_lst.len() {
|
|
382
|
+
let variant_classification = &cont_lst[var_class_idx];
|
|
383
|
+
if let Some(consequence_filter) = consequences {
|
|
384
|
+
if !consequence_filter.is_empty() {
|
|
385
|
+
if consequence_filter.contains(variant_classification) {
|
|
386
|
+
// Matched consequence
|
|
373
387
|
*case_details
|
|
374
388
|
.maf
|
|
375
389
|
.matched_consequences
|
|
376
390
|
.entry(variant_classification.to_string())
|
|
377
391
|
.or_insert(0) += 1;
|
|
392
|
+
} else {
|
|
393
|
+
// Unmatched consequence
|
|
394
|
+
*case_details
|
|
395
|
+
.maf
|
|
396
|
+
.rejected_consequences
|
|
397
|
+
.entry(variant_classification.to_string())
|
|
398
|
+
.or_insert(0) += 1;
|
|
399
|
+
case_details.maf.excluded_by_consequence_type += 1;
|
|
400
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
401
|
+
return Ok(None);
|
|
378
402
|
}
|
|
379
403
|
} else {
|
|
380
|
-
//
|
|
404
|
+
// Empty filter, count as matched
|
|
381
405
|
*case_details
|
|
382
406
|
.maf
|
|
383
407
|
.matched_consequences
|
|
@@ -385,32 +409,142 @@ async fn process_row(
|
|
|
385
409
|
.or_insert(0) += 1;
|
|
386
410
|
}
|
|
387
411
|
} else {
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
412
|
+
// No filter, count as matched
|
|
413
|
+
*case_details
|
|
414
|
+
.maf
|
|
415
|
+
.matched_consequences
|
|
416
|
+
.entry(variant_classification.to_string())
|
|
417
|
+
.or_insert(0) += 1;
|
|
391
418
|
}
|
|
392
419
|
} else {
|
|
393
420
|
case_details.maf.invalid_rows += 1;
|
|
394
421
|
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
395
422
|
return Ok(None);
|
|
396
423
|
}
|
|
424
|
+
} else {
|
|
425
|
+
case_details.maf.invalid_rows += 1;
|
|
426
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
427
|
+
return Ok(None);
|
|
397
428
|
}
|
|
398
429
|
|
|
399
430
|
// Extract relevant columns
|
|
400
431
|
for &x in columns_indices {
|
|
401
432
|
if x >= cont_lst.len() {
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
405
|
-
} else if data_type == "cnv" {
|
|
406
|
-
case_details.cnv.invalid_rows += 1;
|
|
407
|
-
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
408
|
-
}
|
|
433
|
+
case_details.maf.invalid_rows += 1;
|
|
434
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
409
435
|
return Ok(None); // Invalid row
|
|
410
436
|
}
|
|
437
|
+
let element = cont_lst[x].to_string();
|
|
438
|
+
out_lst.push(element);
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
// Additional MAF-specific processing
|
|
442
|
+
if out_lst.len() < 6 {
|
|
443
|
+
case_details.maf.invalid_rows += 1;
|
|
444
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
445
|
+
return Ok(None); // Not enough columns
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
let alle_depth = out_lst[4].parse::<i32>().map_err(|_| {
|
|
449
|
+
case_details.maf.invalid_rows += 1;
|
|
450
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
451
|
+
(
|
|
452
|
+
case_id.to_string(),
|
|
453
|
+
data_type.to_string(),
|
|
454
|
+
"Failed to convert t_depth to integer.".to_string(),
|
|
455
|
+
)
|
|
456
|
+
})?;
|
|
457
|
+
|
|
458
|
+
let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
|
|
459
|
+
case_details.maf.invalid_rows += 1;
|
|
460
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
461
|
+
(
|
|
462
|
+
case_id.to_string(),
|
|
463
|
+
data_type.to_string(),
|
|
464
|
+
"Failed to convert t_alt_count to integer.".to_string(),
|
|
465
|
+
)
|
|
466
|
+
})?;
|
|
467
|
+
|
|
468
|
+
if alle_depth < min_total_depth {
|
|
469
|
+
case_details.maf.t_depth += 1;
|
|
470
|
+
case_details.maf.excluded_by_min_depth += 1;
|
|
471
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
472
|
+
return Ok(None);
|
|
473
|
+
}
|
|
474
|
+
if alt_count < min_alt_allele_count {
|
|
475
|
+
case_details.maf.t_alt_count += 1;
|
|
476
|
+
case_details.maf.excluded_by_min_alt_count += 1;
|
|
477
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
478
|
+
return Ok(None);
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
// Keep case_id, chr, start, end, and add "mutation"
|
|
482
|
+
out_lst = out_lst[0..4].to_vec();
|
|
483
|
+
out_lst.push("mutation".to_string());
|
|
484
|
+
|
|
485
|
+
// Add the 'chr' prefix to the chromosome name if it does not already start with 'chr'
|
|
486
|
+
if !out_lst[1].starts_with("chr") {
|
|
487
|
+
out_lst[1] = format!("chr{}", out_lst[1]);
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
// Chromosome filtering
|
|
491
|
+
if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
|
|
492
|
+
*case_details
|
|
493
|
+
.maf
|
|
494
|
+
.skipped_chromosomes
|
|
495
|
+
.entry(out_lst[1].clone())
|
|
496
|
+
.or_insert(0) += 1;
|
|
497
|
+
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
498
|
+
return Ok(None);
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
// Update counters for included MAF records
|
|
502
|
+
case_details.maf.total_included += 1;
|
|
503
|
+
included_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
504
|
+
|
|
505
|
+
Ok(Some(out_lst))
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
// Process a single row of CNV file
|
|
509
|
+
async fn process_cnvline(
|
|
510
|
+
line: &str,
|
|
511
|
+
case_id: &str,
|
|
512
|
+
data_type: &str,
|
|
513
|
+
header: &[String],
|
|
514
|
+
columns_indices: &[usize],
|
|
515
|
+
gain_threshold: f32,
|
|
516
|
+
loss_threshold: f32,
|
|
517
|
+
seg_length: i32,
|
|
518
|
+
chromosomes: &HashSet<String>,
|
|
519
|
+
filtered_records: &Arc<Mutex<HashMap<String, FilteredCaseDetails>>>,
|
|
520
|
+
filtered_cnv_records: &AtomicUsize,
|
|
521
|
+
included_cnv_records: &AtomicUsize,
|
|
522
|
+
) -> Result<Option<Vec<String>>, (String, String, String)> {
|
|
523
|
+
let cont_lst: Vec<String> = line.split("\t").map(|s| s.to_string()).collect();
|
|
524
|
+
let mut out_lst = vec![case_id.to_string()];
|
|
525
|
+
|
|
526
|
+
// Initialize or update case details
|
|
527
|
+
let mut filtered_map = filtered_records.lock().await;
|
|
528
|
+
filtered_map
|
|
529
|
+
.entry(case_id.to_string())
|
|
530
|
+
.or_insert_with(|| FilteredCaseDetails {
|
|
531
|
+
maf: FilteredMafDetails::default(),
|
|
532
|
+
cnv: FilteredCnvDetails::default(),
|
|
533
|
+
});
|
|
534
|
+
let case_details = filtered_map.get_mut(case_id).unwrap();
|
|
535
|
+
|
|
536
|
+
// Track total processed records
|
|
537
|
+
case_details.cnv.total_processed += 1;
|
|
411
538
|
|
|
539
|
+
// Extract relevant columns
|
|
540
|
+
for &x in columns_indices {
|
|
541
|
+
if x >= cont_lst.len() {
|
|
542
|
+
case_details.cnv.invalid_rows += 1;
|
|
543
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
544
|
+
return Ok(None); // Invalid row
|
|
545
|
+
}
|
|
412
546
|
let mut element = cont_lst[x].to_string();
|
|
413
|
-
if
|
|
547
|
+
if header[x] == "Segment_Mean" {
|
|
414
548
|
element = process_segment_mean(&element, case_id, data_type, gain_threshold, loss_threshold)?;
|
|
415
549
|
if element.is_empty() {
|
|
416
550
|
case_details.cnv.segment_mean += 1;
|
|
@@ -430,89 +564,55 @@ async fn process_row(
|
|
|
430
564
|
out_lst.push(element);
|
|
431
565
|
}
|
|
432
566
|
|
|
433
|
-
//
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
(
|
|
445
|
-
case_id.to_string(),
|
|
446
|
-
data_type.to_string(),
|
|
447
|
-
"Failed to convert t_depth to integer.".to_string(),
|
|
448
|
-
)
|
|
449
|
-
})?;
|
|
450
|
-
|
|
451
|
-
let alt_count = out_lst[5].parse::<i32>().map_err(|_| {
|
|
452
|
-
case_details.maf.invalid_rows += 1;
|
|
453
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
454
|
-
(
|
|
455
|
-
case_id.to_string(),
|
|
456
|
-
data_type.to_string(),
|
|
457
|
-
"Failed to convert t_alt_count to integer.".to_string(),
|
|
458
|
-
)
|
|
459
|
-
})?;
|
|
460
|
-
|
|
461
|
-
if alle_depth < min_total_depth {
|
|
462
|
-
case_details.maf.t_depth += 1;
|
|
463
|
-
case_details.maf.excluded_by_min_depth += 1;
|
|
464
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
465
|
-
return Ok(None);
|
|
466
|
-
}
|
|
467
|
-
if alt_count < min_alt_allele_count {
|
|
468
|
-
case_details.maf.t_alt_count += 1;
|
|
469
|
-
case_details.maf.excluded_by_min_alt_count += 1;
|
|
470
|
-
filtered_maf_records.fetch_add(1, Ordering::Relaxed);
|
|
471
|
-
return Ok(None);
|
|
472
|
-
}
|
|
473
|
-
|
|
474
|
-
// Keep case_id, chr, start, end, and add "mutation"
|
|
475
|
-
out_lst = out_lst[0..4].to_vec();
|
|
476
|
-
out_lst.push("mutation".to_string());
|
|
567
|
+
// filter cnvs based on segment length. Default: 0 (no filtering)
|
|
568
|
+
// calculate segment length (End_Position - Start_Position)
|
|
569
|
+
let end_position = out_lst[3].parse::<i32>().map_err(|_| {
|
|
570
|
+
case_details.cnv.invalid_rows += 1;
|
|
571
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
572
|
+
(
|
|
573
|
+
case_id.to_string(),
|
|
574
|
+
data_type.to_string(),
|
|
575
|
+
"Failed to convert End Position of cnv to integer.".to_string(),
|
|
576
|
+
)
|
|
577
|
+
})?;
|
|
477
578
|
|
|
478
|
-
|
|
479
|
-
case_details.
|
|
480
|
-
|
|
579
|
+
let start_position = out_lst[2].parse::<i32>().map_err(|_| {
|
|
580
|
+
case_details.cnv.invalid_rows += 1;
|
|
581
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
582
|
+
(
|
|
583
|
+
case_id.to_string(),
|
|
584
|
+
data_type.to_string(),
|
|
585
|
+
"Failed to convert Start Position of cnv to integer.".to_string(),
|
|
586
|
+
)
|
|
587
|
+
})?;
|
|
588
|
+
let cnv_length = end_position - start_position;
|
|
589
|
+
if seg_length > 0 && cnv_length > seg_length {
|
|
590
|
+
case_details.cnv.seg_length += 1;
|
|
591
|
+
case_details.cnv.excluded_by_segment_length += 1;
|
|
592
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
593
|
+
return Ok(None);
|
|
481
594
|
}
|
|
482
595
|
|
|
483
|
-
//
|
|
484
|
-
if
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
case_details.cnv.invalid_rows += 1;
|
|
488
|
-
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
489
|
-
(
|
|
490
|
-
case_id.to_string(),
|
|
491
|
-
data_type.to_string(),
|
|
492
|
-
"Failed to convert End Position of cnv to integer.".to_string(),
|
|
493
|
-
)
|
|
494
|
-
})?;
|
|
596
|
+
// Add the 'chr' prefix to the chromosome name if it does not already start with 'chr'
|
|
597
|
+
if !out_lst[1].starts_with("chr") {
|
|
598
|
+
out_lst[1] = format!("chr{}", out_lst[1]);
|
|
599
|
+
}
|
|
495
600
|
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
let cnv_length = end_position - start_position;
|
|
506
|
-
if seg_length > 0 && cnv_length > seg_length {
|
|
507
|
-
case_details.cnv.seg_length += 1;
|
|
508
|
-
case_details.cnv.excluded_by_segment_length += 1;
|
|
509
|
-
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
510
|
-
return Ok(None);
|
|
511
|
-
}
|
|
512
|
-
case_details.cnv.total_included += 1;
|
|
513
|
-
included_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
601
|
+
// Chromosome filtering
|
|
602
|
+
if !chromosomes.is_empty() && !chromosomes.contains(&out_lst[1]) {
|
|
603
|
+
*case_details
|
|
604
|
+
.cnv
|
|
605
|
+
.skipped_chromosomes
|
|
606
|
+
.entry(out_lst[1].clone())
|
|
607
|
+
.or_insert(0) += 1;
|
|
608
|
+
filtered_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
609
|
+
return Ok(None);
|
|
514
610
|
}
|
|
515
611
|
|
|
612
|
+
// Update counters for included CNV records
|
|
613
|
+
case_details.cnv.total_included += 1;
|
|
614
|
+
included_cnv_records.fetch_add(1, Ordering::Relaxed);
|
|
615
|
+
|
|
516
616
|
Ok(Some(out_lst))
|
|
517
617
|
}
|
|
518
618
|
|
|
@@ -652,11 +752,13 @@ async fn download_data_streaming(
|
|
|
652
752
|
host: &str,
|
|
653
753
|
min_total_depth: i32,
|
|
654
754
|
min_alt_allele_count: i32,
|
|
655
|
-
|
|
755
|
+
maf_hyper_mutator: i32,
|
|
656
756
|
consequences: &Option<Vec<String>>,
|
|
657
757
|
gain_threshold: f32,
|
|
658
758
|
loss_threshold: f32,
|
|
659
759
|
seg_length: i32,
|
|
760
|
+
cnv_hyper_mutator: i32,
|
|
761
|
+
chromosomes: &HashSet<String>,
|
|
660
762
|
) {
|
|
661
763
|
let data_urls: Vec<(String, String, String)> = data4dl
|
|
662
764
|
.into_iter()
|
|
@@ -680,7 +782,7 @@ async fn download_data_streaming(
|
|
|
680
782
|
let filtered_maf_records = Arc::new(AtomicUsize::new(0));
|
|
681
783
|
let filtered_cnv_records = Arc::new(AtomicUsize::new(0));
|
|
682
784
|
let filtered_records = Arc::new(Mutex::new(HashMap::<String, FilteredCaseDetails>::new()));
|
|
683
|
-
let hyper_mutator_records = Arc::new(Mutex::new(
|
|
785
|
+
let hyper_mutator_records = Arc::new(Mutex::new(HashMap::<String, Vec<String>>::new()));
|
|
684
786
|
let included_maf_records = Arc::new(AtomicUsize::new(0));
|
|
685
787
|
let included_cnv_records = Arc::new(AtomicUsize::new(0));
|
|
686
788
|
|
|
@@ -717,11 +819,13 @@ async fn download_data_streaming(
|
|
|
717
819
|
&data_type,
|
|
718
820
|
min_total_depth,
|
|
719
821
|
min_alt_allele_count,
|
|
720
|
-
|
|
822
|
+
maf_hyper_mutator,
|
|
721
823
|
&consequences,
|
|
722
824
|
gain_threshold,
|
|
723
825
|
loss_threshold,
|
|
724
826
|
seg_length,
|
|
827
|
+
cnv_hyper_mutator,
|
|
828
|
+
&chromosomes,
|
|
725
829
|
&filtered_records,
|
|
726
830
|
&filtered_maf_records,
|
|
727
831
|
&filtered_cnv_records,
|
|
@@ -865,7 +969,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
865
969
|
let case_files = input_js.case_files;
|
|
866
970
|
|
|
867
971
|
// Set default maf_options
|
|
868
|
-
let (min_total_depth, min_alt_allele_count,
|
|
972
|
+
let (min_total_depth, min_alt_allele_count, maf_hyper_mutator, consequences) = match input_js.maf_options {
|
|
869
973
|
Some(options) => (
|
|
870
974
|
options.min_total_depth,
|
|
871
975
|
options.min_alt_allele_count,
|
|
@@ -876,22 +980,32 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
876
980
|
};
|
|
877
981
|
|
|
878
982
|
// Set default cnv_options
|
|
879
|
-
let (gain_threshold, loss_threshold, seg_length) = match input_js.cnv_options {
|
|
880
|
-
Some(options) => (
|
|
881
|
-
|
|
983
|
+
let (gain_threshold, loss_threshold, seg_length, cnv_hyper_mutator) = match input_js.cnv_options {
|
|
984
|
+
Some(options) => (
|
|
985
|
+
options.gain_threshold,
|
|
986
|
+
options.loss_threshold,
|
|
987
|
+
options.seg_length,
|
|
988
|
+
options.hyper_mutator,
|
|
989
|
+
),
|
|
990
|
+
None => (0.3, -0.4, 0, 500), // Default values
|
|
882
991
|
};
|
|
883
992
|
|
|
993
|
+
// Convert Vec<String> to HashSet<String> for faster lookup
|
|
994
|
+
let chromosomes = input_js.chromosomes.into_iter().collect::<HashSet<String>>();
|
|
995
|
+
|
|
884
996
|
// Download data - this will now handle errors gracefully
|
|
885
997
|
download_data_streaming(
|
|
886
998
|
case_files,
|
|
887
999
|
HOST,
|
|
888
1000
|
min_total_depth,
|
|
889
1001
|
min_alt_allele_count,
|
|
890
|
-
|
|
1002
|
+
maf_hyper_mutator,
|
|
891
1003
|
&consequences,
|
|
892
1004
|
gain_threshold,
|
|
893
1005
|
loss_threshold,
|
|
894
1006
|
seg_length,
|
|
1007
|
+
cnv_hyper_mutator,
|
|
1008
|
+
&chromosomes,
|
|
895
1009
|
)
|
|
896
1010
|
.await;
|
|
897
1011
|
|