nci-cidc-schemas 0.28.1__py2.py3-none-any.whl → 0.28.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cidc_schemas/__init__.py +1 -1
- cidc_schemas/ngs_pipeline_api/__init__.py +29 -0
- cidc_schemas/ngs_pipeline_api/atacseq/atacseq.md +55 -0
- cidc_schemas/ngs_pipeline_api/atacseq/atacseq_output_API.json +39 -0
- cidc_schemas/ngs_pipeline_api/atacseq/imgs/atacseq.png +0 -0
- cidc_schemas/ngs_pipeline_api/output_API.schema.json +45 -0
- cidc_schemas/ngs_pipeline_api/rna/imgs/RIMA.png +0 -0
- cidc_schemas/ngs_pipeline_api/rna/rna.md +54 -0
- cidc_schemas/ngs_pipeline_api/rna/rna_config.schema.json +39 -0
- cidc_schemas/ngs_pipeline_api/rna/rna_output_API.json +195 -0
- cidc_schemas/ngs_pipeline_api/tcr/imgs/TCRseq.png +0 -0
- cidc_schemas/ngs_pipeline_api/tcr/tcr.md +101 -0
- cidc_schemas/ngs_pipeline_api/wes/imgs/wes.png +0 -0
- cidc_schemas/ngs_pipeline_api/wes/wes.md +46 -0
- cidc_schemas/ngs_pipeline_api/wes/wes_config.schema.json +82 -0
- cidc_schemas/ngs_pipeline_api/wes/wes_output_API.json +503 -0
- cidc_schemas/ngs_pipeline_api/wes/wes_output_API.py +548 -0
- cidc_schemas/ngs_pipeline_api/wes/wes_tumor_only_output_API.json +213 -0
- {nci_cidc_schemas-0.28.1.dist-info → nci_cidc_schemas-0.28.2.dist-info}/METADATA +1 -1
- {nci_cidc_schemas-0.28.1.dist-info → nci_cidc_schemas-0.28.2.dist-info}/RECORD +24 -7
- {nci_cidc_schemas-0.28.1.dist-info → nci_cidc_schemas-0.28.2.dist-info}/WHEEL +0 -0
- {nci_cidc_schemas-0.28.1.dist-info → nci_cidc_schemas-0.28.2.dist-info}/entry_points.txt +0 -0
- {nci_cidc_schemas-0.28.1.dist-info → nci_cidc_schemas-0.28.2.dist-info}/licenses/LICENSE +0 -0
- {nci_cidc_schemas-0.28.1.dist-info → nci_cidc_schemas-0.28.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,548 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""Len Taing 2020 (TGBTG)"""
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
import json
|
|
7
|
+
from optparse import OptionParser
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Wesfile:
    """Description of a single WES output file, stored as plain attributes.

    Instances carry nothing but data, so ``__dict__`` is the complete record
    of the file (path template, descriptions, filter group, purpose and the
    two flags).
    """

    def __init__(self, file_dict):
        """Build the object from a file-description mapping.

        ``file_dict`` must contain ``file_path``, ``short_descr``,
        ``long_descr`` and ``filter_group``; a missing one raises
        ``KeyError``.  ``file_purpose``, ``optional`` and
        ``tumor_only_assay`` may be omitted and then fall back to
        ``"Analysis view"``, ``False`` and ``True`` respectively.

        NOTE: ``file_path`` may still contain wildcards, e.g.
        analysis/germline/{run id}/{run id}_vcfcompare.txt
        """
        # (attribute name, source key) pairs for the mandatory fields.
        # Order is significant: it fixes the key order of ``__dict__``.
        mandatory = (
            ("file_path_template", "file_path"),
            ("short_description", "short_descr"),
            ("long_description", "long_descr"),
            ("filter_group", "filter_group"),
        )
        for attr_name, source_key in mandatory:
            setattr(self, attr_name, file_dict[source_key])

        # Fields that share their name with the source key and have a default.
        # Everything is part of the tumor_only assay unless the entry says
        # otherwise (normal-dependent files set tumor_only_assay to False).
        with_fallback = (
            ("file_purpose", "Analysis view"),
            ("optional", False),
            ("tumor_only_assay", True),
        )
        for field_name, fallback in with_fallback:
            setattr(self, field_name, file_dict.get(field_name, fallback))

    def __str__(self):
        """Return the string form of the attribute dictionary."""
        return str(vars(self))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def dumper(obj):
    """Return a JSON-friendly representation of ``obj``.

    Calls ``obj.toJSON()`` when that works; if the method is missing or
    raises an ordinary exception, falls back to the object's ``__dict__``.

    Only ``Exception`` subclasses trigger the fallback, so
    ``KeyboardInterrupt`` and ``SystemExit`` propagate instead of being
    silently swallowed as they were with a bare ``except``.

    NOTE(review): presumably used as the ``default=`` hook of ``json.dumps``;
    the call site is not visible here - confirm.
    """
    # ref: https://www.semicolonworld.com/question/42934/how-to-make-a-class-json-serializable
    try:
        return obj.toJSON()
    except Exception:
        # Best-effort fallback: no usable toJSON(), expose the attributes.
        return obj.__dict__
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def evalWildcards(file_tuple, wildcard, s, is_optional=False):
    """Return a copy of ``file_tuple`` with ``wildcard`` expanded in its path.

    Every occurrence of ``wildcard`` in ``file_tuple["file_path"]`` is
    replaced by ``s``.  The input mapping is left untouched (shallow copy),
    and all other keys are carried over unchanged.

    ``is_optional`` is accepted but not read by this function.
    """
    expanded = file_tuple.copy()
    original_path = expanded["file_path"]
    expanded["file_path"] = original_path.replace(wildcard, s)
    return expanded
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Per-sample output files.  Each entry is a plain dict with the keys
# file_path / short_descr / long_descr / filter_group and, where needed,
# file_purpose or tumor_only_assay.  "{sample}" in file_path is a wildcard.
sample_files = [
    # ---------------------------- align ----------------------------
    {
        "file_path": "analysis/align/{sample}/{sample}.sorted.dedup.bam",
        "short_descr": "alignment: bam file with deduplicated reads",
        "long_descr": "Aligned reads were sorted and marked duplicates were removed using the Sentieon Dedup tool (https://support.sentieon.com/manual/usages/general/#dedup-algorithm)",
        "filter_group": "alignment",
        "file_purpose": "Source view",
    },
    {
        "file_path": "analysis/align/{sample}/{sample}.sorted.dedup.bam.bai",
        "short_descr": "alignment: index file for deduplicated bam",
        "long_descr": "Bam index file for deduplicated bam file generated by the Sentieon Dedup tool (https://support.sentieon.com/manual/usages/general/#dedup-algorithm)",
        "filter_group": "alignment",
        "file_purpose": "Source view",
    },
    # NOTE(review): "Qualtiy" / "BSQR" in the two entries below look like
    # typos for "Quality" / "BQSR"; left as-is because these strings are
    # emitted verbatim.
    {
        "file_path": "analysis/align/{sample}/{sample}_recalibrated.bam",
        "short_descr": "alignment: Base Qualtiy Score Recalibration (BQSR) bam file",
        "long_descr": "The Sentieon QualCal (https://support.sentieon.com/manual/usages/general/#qualcal-algorithm) is used to perform BSQR and remove any technical artifacts in the base quality scores.",
        "filter_group": "alignment",
        "file_purpose": "Source view",
    },
    {
        "file_path": "analysis/align/{sample}/{sample}_recalibrated.bam.bai",
        "short_descr": "alignment: index file for Base Qualtiy Score Recalibration (BQSR) bam file",
        "long_descr": "Index file for the BQSR bam file",
        "filter_group": "alignment",
        "file_purpose": "Source view",
    },
    # --------------------------- germline ---------------------------
    # Germline calls are ingested for both tumor and normal samples, but
    # only the normal sample's call counts.
    {
        "file_path": "analysis/germline/{sample}/{sample}_haplotyper.output.vcf",
        "short_descr": "germline: germline variants",
        "long_descr": "Haplotype variants using Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
        "filter_group": "germline",
        "tumor_only_assay": False,
    },
    {
        "file_path": "analysis/germline/{sample}/{sample}_haplotyper.targets.vcf.gz",
        "short_descr": "germline: vcf of haplotype variants in targeted regions",
        "long_descr": "Haplotype variants within targeted capture regions using Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
        "filter_group": "germline",
        "tumor_only_assay": False,
    },
    # ------------------------- HLA callers -------------------------
    # All HLA callers are ingested for both tumor and normal samples, but
    # the result that counts is the tumor sample's.
    # The chr6 fastqs used as HLA-caller input are not stashed: they can be
    # re-derived from sorted.dedup.bam in under 30 seconds per sample.
    #
    # hlahd
    # NOTE(review): "Class II and II" in long_descr below is probably meant
    # to read "Class I and II" (cf. short_descr); left as-is because the
    # string is emitted verbatim.
    {
        "file_path": "analysis/hlahd/{sample}/result/{sample}_final.result.txt",
        "short_descr": "hla: MHC Class I and II results (using HLA-HD)",
        "long_descr": "Predicted MHC Class II and II results using the HLA-HD software (https://www.genome.med.kyoto-u.ac.jp/HLA-HD/). Chromosome 6 reads from the deduplicated bam file were extracted and fed into the HLA-HD prediction algorithm.",
        "filter_group": "HLA",
    },
    # optitype
    {
        "file_path": "analysis/optitype/{sample}/{sample}_result.tsv",
        "short_descr": "hla: MHC Class I results (using OptiType)",
        "long_descr": "Predicted MHC Class I alleles using the Optitype software (https://github.com/FRED-2/OptiType). Chromosome 6 reads from the deduplicated bam file were extracted and fed into the Optitype prediction algorithm.",
        "filter_group": "HLA",
    },
    # xhla
    {
        "file_path": "analysis/xhla/{sample}/report-{sample}-hla.json",
        "short_descr": "hla: MHC Class I and II results (using xhla)",
        "long_descr": "Predicted MHC Class I and II results using the xHLA software(https://github.com/humanlongevity/HLA). Chromosome 6 reads from the deduplicated bam file were extracted and fed into the xHLA prediction algorithm.",
        "filter_group": "HLA",
    },
    # --------------------------- metrics ---------------------------
    # 2022-06-14: ALL metrics files were deprecated for ingestion because
    #   1. they are big and have never been looked at;
    #   2. the sample's summary coverage (total_reads, mean_depth,
    #      percent_bases_gt_50, ...) is sufficient in 99.9% of cases;
    #   3. sorted.dedup.bam is stashed, so they can be re-derived if needed.
    # Entries that used to live here (filter_group "coverage"):
    #   analysis/metrics/{sample}/{sample}_coverage_metrics.txt
    #   analysis/metrics/{sample}/{sample}_target_metrics.txt
    #   analysis/metrics/{sample}/{sample}_coverage_metrics.sample_summary.txt
]
|
|
153
|
+
|
|
154
|
+
run_files = [
|
|
155
|
+
############################## MISC ##############################
|
|
156
|
+
{
|
|
157
|
+
"file_path": "analysis/{run}_error.yaml",
|
|
158
|
+
"short_descr": "yaml file that specifies error codes for files",
|
|
159
|
+
"long_descr": "Explanation of all files which are expected to be empty due to a failed/missing module.",
|
|
160
|
+
"optional": True, # optional
|
|
161
|
+
"filter_group": "",
|
|
162
|
+
},
|
|
163
|
+
############################## clonality ##############################
|
|
164
|
+
# sequenza files
|
|
165
|
+
{
|
|
166
|
+
"file_path": "analysis/clonality/{run}/{run}_segments.txt",
|
|
167
|
+
"short_descr": "copynumber: Sequenza CNV segments file",
|
|
168
|
+
"long_descr": "Copy number variation segments file called by the Sequenza software package. The column descriptions for the segment file could be found here (https://cran.r-project.org/web/packages/sequenza/vignettes/sequenza.html#plots-and-results)",
|
|
169
|
+
"filter_group": "copynumber",
|
|
170
|
+
"tumor_only_assay": False,
|
|
171
|
+
},
|
|
172
|
+
{
|
|
173
|
+
"file_path": "analysis/clonality/{run}/{run}_genome_view.pdf",
|
|
174
|
+
"short_descr": "copynumber: Sequenza genome-wide plot of depth.ratio and B-allele frequency.",
|
|
175
|
+
"long_descr": "Genome-wide plot (generated by Sequenza) showing depth.ratio and B-allele frequency.",
|
|
176
|
+
"filter_group": "copynumber",
|
|
177
|
+
"tumor_only_assay": False,
|
|
178
|
+
},
|
|
179
|
+
{
|
|
180
|
+
"file_path": "analysis/clonality/{run}/{run}_chromosome_view.pdf",
|
|
181
|
+
"short_descr": "copynumber: Sequenza plot of depth.ratio and B-allele frequency chromosome by chromosome.",
|
|
182
|
+
"long_descr": "Chromosome by chromosome plot (generated by Sequenza) showing depth.ratio and B-allele frequency.",
|
|
183
|
+
"filter_group": "copynumber",
|
|
184
|
+
"tumor_only_assay": False,
|
|
185
|
+
},
|
|
186
|
+
{
|
|
187
|
+
"file_path": "analysis/clonality/{run}/{run}_sequenza_gainLoss.bed",
|
|
188
|
+
"short_descr": "copynumber: Sequenza CNV segments file filtered with hard cut-offs to call regions of GAIN/LOSS",
|
|
189
|
+
"long_descr": "Filtered Sequenza segments file after applying a hard cut-off to call regions of GAIN (total copy number >= 3) and regions of LOSS (total copy number <= 1.5).",
|
|
190
|
+
"filter_group": "copynumber",
|
|
191
|
+
"tumor_only_assay": False,
|
|
192
|
+
},
|
|
193
|
+
######################################################################
|
|
194
|
+
# NOTE: stashing final.seqz file because it's 1. SMALL (under 100MB)
|
|
195
|
+
# and 2. is very hard to re-derive from recalibrated.bam
|
|
196
|
+
######################################################################
|
|
197
|
+
{
|
|
198
|
+
"file_path": "analysis/clonality/{run}/{run}.bin50.final.seqz.txt.gz",
|
|
199
|
+
"short_descr": "copynumber: Sequenza post-processed seqz file used for input to Sequenza CNV caller",
|
|
200
|
+
"long_descr": "Sequenza seqz file generated by the bam2seqz software using a GC wiggle track with a window size of 50 (-w 50).",
|
|
201
|
+
"filter_group": "copynumber",
|
|
202
|
+
"tumor_only_assay": False,
|
|
203
|
+
},
|
|
204
|
+
# purity files
|
|
205
|
+
{
|
|
206
|
+
"file_path": "analysis/clonality/{run}/{run}_alternative_solutions.txt",
|
|
207
|
+
"short_descr": "purity: Sequenza Cellularity and Ploidy estimate file",
|
|
208
|
+
"long_descr": "Cellularity and ploidy estimates of the tumor sample using the Sequenza software package. The columns of the file are follows: Cellularity, Ploidy, and SLPP (Scaled Log Posterior Probability).",
|
|
209
|
+
"filter_group": "purity",
|
|
210
|
+
"tumor_only_assay": False,
|
|
211
|
+
},
|
|
212
|
+
{
|
|
213
|
+
"file_path": "analysis/clonality/{run}/{run}_CP_contours.pdf",
|
|
214
|
+
"short_descr": "purity: Sequenza plot of likelihood densities for all cellularity/ploidy solutions.",
|
|
215
|
+
"long_descr": "Sequenza generated plot showing the likelihood densities for each cellularity/ploidy solution (https://cran.r-project.org/web/packages/sequenza/vignettes/sequenza.html#plots-and-results).",
|
|
216
|
+
"filter_group": "purity",
|
|
217
|
+
"tumor_only_assay": False,
|
|
218
|
+
},
|
|
219
|
+
# pyclone files
|
|
220
|
+
{
|
|
221
|
+
"file_path": "analysis/clonality/{run}/{run}_pyclone6.input.tsv",
|
|
222
|
+
"short_descr": "tumor clonality: PyClone-VI input file generated by sequenza library (https://cran.r-project.org/web/packages/sequenza/index.html)",
|
|
223
|
+
"long_descr": "Input file generated for PyClone-VI analysis. Sequenza was used to generate the expected file format (https://github.com/Roth-Lab/pyclone-vi#input-format).",
|
|
224
|
+
"filter_group": "clonality",
|
|
225
|
+
"tumor_only_assay": False,
|
|
226
|
+
},
|
|
227
|
+
{
|
|
228
|
+
"file_path": "analysis/clonality/{run}/{run}_pyclone6.results.tsv",
|
|
229
|
+
"short_descr": "tumor clonality: PyClone-VI tumor clonality results file",
|
|
230
|
+
"long_descr": "Tumor clone/cluster prevalence estimations generated by the PyClone-VI software package. The format of the results file is described here (https://github.com/Roth-Lab/pyclone-vi#output-format).",
|
|
231
|
+
"filter_group": "clonality",
|
|
232
|
+
"tumor_only_assay": False,
|
|
233
|
+
},
|
|
234
|
+
{
|
|
235
|
+
"file_path": "analysis/clonality/{run}/{run}_pyclone6.results.summary.tsv",
|
|
236
|
+
"short_descr": "tumor clonality: PyClone-VI tumor clonality results summary file",
|
|
237
|
+
"long_descr": "Summary of Pyclone-VI results file condensed to only show the cluster_id, cellular_prevalence, and cellular_prevalence_std columns.",
|
|
238
|
+
"filter_group": "clonality",
|
|
239
|
+
"tumor_only_assay": False,
|
|
240
|
+
},
|
|
241
|
+
############################## CNVkit ##############################
|
|
242
|
+
{
|
|
243
|
+
"file_path": "analysis/cnvkit/{run}/{run}.call.cns",
|
|
244
|
+
"short_descr": "copynumber: CNVkit segments file",
|
|
245
|
+
"long_descr": "CNVkit's Segmented log2 ratios file. The 'cn' column representes the total copy number of the segment. The other columns of the results file are described here (https://cnvkit.readthedocs.io/en/stable/fileformats.html#segmented-log2-ratios-cns)",
|
|
246
|
+
"filter_group": "copynumber",
|
|
247
|
+
"tumor_only_assay": False,
|
|
248
|
+
},
|
|
249
|
+
{
|
|
250
|
+
"file_path": "analysis/cnvkit/{run}/{run}.call.enhanced.cns",
|
|
251
|
+
"short_descr": "copynumber: Enhanced CNVkit segments file with BAF and Major/minor allele information",
|
|
252
|
+
"long_descr": "The enhanced CNVkit segments file incoporates somatic sNP and tumor purity information (called by the pipeline) to incorporate B-allele frequencies, major and minor allele (cn1 and cn2 respectively), and correct for tumor sample purity level.",
|
|
253
|
+
"filter_group": "copynumber",
|
|
254
|
+
"tumor_only_assay": False,
|
|
255
|
+
},
|
|
256
|
+
{
|
|
257
|
+
"file_path": "analysis/cnvkit/{run}/{run}.scatter.png",
|
|
258
|
+
"short_descr": "copynumber: scatter plot of log2 coverage and segmentation call information",
|
|
259
|
+
"long_descr": "Genome-wide scatter plot of log2 coverage ratios and called CNV segments",
|
|
260
|
+
"filter_group": "copynumber",
|
|
261
|
+
"tumor_only_assay": False,
|
|
262
|
+
},
|
|
263
|
+
{
|
|
264
|
+
"file_path": "analysis/cnvkit/{run}/{run}_cnvkit_gainLoss.bed",
|
|
265
|
+
"short_descr": "copynumber: CNVkit segments file filtered with hard cut-offs to call regions of GAIN/LOSS",
|
|
266
|
+
"long_descr": "Filtered CNVkit segments file after applying a hard cut-off to call regions of GAIN (total copy number >= 3) and regions of LOSS (total copy number <= 1.5).",
|
|
267
|
+
"filter_group": "copynumber",
|
|
268
|
+
"tumor_only_assay": False,
|
|
269
|
+
},
|
|
270
|
+
############################## Copynumber ##############################
|
|
271
|
+
{
|
|
272
|
+
"file_path": "analysis/copynumber/{run}/{run}_consensus.bed",
|
|
273
|
+
"short_descr": "copynumber: Consensus CNV segments file",
|
|
274
|
+
"long_descr": "Consensus CNV regions that are called by at least 2 of the 3 callers (CNVkit, Sequenza, or FACETS). CNV Callers must agree on both the region (intersection of overlapped regions) and the call (GAIN or LOSS).",
|
|
275
|
+
"filter_group": "copynumber",
|
|
276
|
+
"tumor_only_assay": False,
|
|
277
|
+
},
|
|
278
|
+
{
|
|
279
|
+
"file_path": "analysis/copynumber/{run}/{run}_consensus_merged_GAIN.bed",
|
|
280
|
+
"short_descr": "copynumber: Consensus CNV segments file of only GAIN regions",
|
|
281
|
+
"long_descr": "GAIN only CNV regions derived from the consensus CNV file. Regions are also merged if they have an overlap of at least 1bp. ",
|
|
282
|
+
"filter_group": "copynumber",
|
|
283
|
+
"tumor_only_assay": False,
|
|
284
|
+
},
|
|
285
|
+
{
|
|
286
|
+
"file_path": "analysis/copynumber/{run}/{run}_consensus_merged_LOSS.bed",
|
|
287
|
+
"short_descr": "copynumber: Consensus CNV segments file of only LOSS regions",
|
|
288
|
+
"long_descr": "LOSS only CNV regions derived from the consensus CNV file. Regions are also merged if they have an overlap of at least 1bp. ",
|
|
289
|
+
"filter_group": "copynumber",
|
|
290
|
+
"tumor_only_assay": False,
|
|
291
|
+
},
|
|
292
|
+
############################## msisensor2 ##############################
|
|
293
|
+
{
|
|
294
|
+
"file_path": "analysis/msisensor2/{run}/{run}_msisensor2.txt",
|
|
295
|
+
"short_descr": "msisensor2: microsatellite instability calculation",
|
|
296
|
+
"long_descr": "Microsatellite instability calculation using msisensor2 (https://github.com/niu-lab/msisensor2)",
|
|
297
|
+
"filter_group": "msisensor2",
|
|
298
|
+
},
|
|
299
|
+
############################## neoantigen ##############################
|
|
300
|
+
{
|
|
301
|
+
"file_path": "analysis/neoantigen/{run}/combined/{run}.filtered.tsv",
|
|
302
|
+
"short_descr": "neaontigen: list of predicted neoantigens",
|
|
303
|
+
"long_descr": "The combined MHC class I and II predicted neoantigens using the pVACseq software. The column definitions are given here (ref: https://pvactools.readthedocs.io/en/latest/pvacseq/output_files.html)",
|
|
304
|
+
"filter_group": "neoantigen",
|
|
305
|
+
},
|
|
306
|
+
############################## purity ##############################
|
|
307
|
+
{
|
|
308
|
+
"file_path": "analysis/purity/{run}/{run}.optimalpurityvalue.txt",
|
|
309
|
+
"short_descr": "tumor purity: tumor purity estimates using the FACETS software package",
|
|
310
|
+
"long_descr": "Tumor purity estimates using the FACETS software (https://github.com/mskcc/facets).",
|
|
311
|
+
"filter_group": "purity",
|
|
312
|
+
"tumor_only_assay": False,
|
|
313
|
+
},
|
|
314
|
+
{
|
|
315
|
+
"file_path": "analysis/purity/{run}/{run}.cncf",
|
|
316
|
+
"short_descr": "copynumber: FACETS CNV segments file",
|
|
317
|
+
"long_descr": "Copy number variation segments file called by the FACETS software (https://github.com/mskcc/facets).",
|
|
318
|
+
"filter_group": "copynumber",
|
|
319
|
+
"tumor_only_assay": False,
|
|
320
|
+
},
|
|
321
|
+
{
|
|
322
|
+
"file_path": "analysis/purity/{run}/{run}_facets_gainLoss.bed",
|
|
323
|
+
"short_descr": "copynumber: FACETS CNV segments file filtered with hard-cutoff to call regions of GAIN/LOSS",
|
|
324
|
+
"long_descr": "Filtered FACETS segments file after applying a hard cut-off to call regions of GAIN (total copy number >= 3) and regions of LOSS (total copy number <= 1.5).",
|
|
325
|
+
"filter_group": "copynumber",
|
|
326
|
+
"tumor_only_assay": False,
|
|
327
|
+
},
|
|
328
|
+
############################## report ##############################
|
|
329
|
+
{
|
|
330
|
+
"file_path": "analysis/report.tar.gz",
|
|
331
|
+
"short_descr": "wes report: wes summary html report",
|
|
332
|
+
"long_descr": "This is a gzipped file of the report directory, which contains the report.html file. After unzipping the file, the user can load report/report.html into any browser to view the WES Summary Report. The report contains run information (i.e. wes software version used to run the analysis as well as the software version of the major tools) as well as summarizations of sample quality, copy number variation, somatic variants, and HLA-type/neoantigen predictions.",
|
|
333
|
+
"filter_group": "report",
|
|
334
|
+
},
|
|
335
|
+
{
|
|
336
|
+
"file_path": "analysis/report/somatic_variants/05_tumor_germline_overlap.tsv",
|
|
337
|
+
"short_descr": "somatic variants: report file of tumor vs germline variants overlap",
|
|
338
|
+
"long_descr": "This file derived from the somatic and germline variants comparison results generated by vcf-compare (http://vcftools.sourceforge.net/perl_module.html#vcf-compare) and is formatted to be human readable. The file reports the number of somatic/tumor only variants (unfiltered), germline/normal only variants (unfiltered), the number of shared variants, and the percent overlap (using the total number of somatic variants as the denominator).",
|
|
339
|
+
"filter_group": "somatic",
|
|
340
|
+
"tumor_only_assay": False,
|
|
341
|
+
},
|
|
342
|
+
# DEPRECATED! b/c we have 3 other HLA files!
|
|
343
|
+
# {
|
|
344
|
+
# "file_path": "analysis/report/neoantigens/01_HLA_Results.tsv", # HLA
|
|
345
|
+
# "short_descr": "hla: report file of combined MHC class I and II results",
|
|
346
|
+
# "long_descr": "This file reports the MHC class I and II results. The class I alleles are derived from the OptiType results and the class II alleles come from the HLA-HD results. ",
|
|
347
|
+
# "filter_group": "HLA",
|
|
348
|
+
# },
|
|
349
|
+
{
|
|
350
|
+
"file_path": "analysis/report/WES_Meta/02_WES_Run_Version.tsv",
|
|
351
|
+
"short_descr": "wes pipeline version- INTERNAL ONLY- for reproducibility",
|
|
352
|
+
"long_descr": "wes pipeline version- INTERNAL ONLY- for reproducibility",
|
|
353
|
+
"filter_group": "report",
|
|
354
|
+
"file_purpose": "Miscellaneous",
|
|
355
|
+
},
|
|
356
|
+
{
|
|
357
|
+
"file_path": "analysis/report/config.yaml",
|
|
358
|
+
"short_descr": "wes pipeline config file- INTERNAL ONLY- for reproducibility",
|
|
359
|
+
"long_descr": "wes pipeline config file- INTERNAL ONLY- for reproducibility",
|
|
360
|
+
"filter_group": "report",
|
|
361
|
+
"file_purpose": "Miscellaneous",
|
|
362
|
+
},
|
|
363
|
+
{
|
|
364
|
+
"file_path": "analysis/report/metasheet.csv",
|
|
365
|
+
"short_descr": "wes pipeline metasheet file- INTERNAL ONLY- for reproducibility",
|
|
366
|
+
"long_descr": "wes pipeline metasheet file- INTERNAL ONLY- for reproducibility",
|
|
367
|
+
"filter_group": "report",
|
|
368
|
+
"file_purpose": "Miscellaneous",
|
|
369
|
+
},
|
|
370
|
+
{
|
|
371
|
+
"file_path": "analysis/report/json/{run}.wes.json",
|
|
372
|
+
"short_descr": "wes sample json for cohort report generation-INTERNAL ONLY",
|
|
373
|
+
"long_descr": "wes sample json for cohort report generation-INTERNAL ONLY",
|
|
374
|
+
"filter_group": "report",
|
|
375
|
+
"file_purpose": "Miscellaneous",
|
|
376
|
+
},
|
|
377
|
+
############################## rna ##############################
|
|
378
|
+
{
|
|
379
|
+
"file_path": "analysis/rna/{run}/{run}.haplotyper.rna.vcf.gz", # RNA
|
|
380
|
+
"short_descr": "rna: Variants called from RNA-seq data",
|
|
381
|
+
"long_descr": "RNA-seq variants called using the Sentieon RNA Variant Calling pipeline(https://support.sentieon.com/manual/RNA_call/rna/). Sentieon's Haplotyper algorithm was used for the variant calling.",
|
|
382
|
+
"filter_group": "rna",
|
|
383
|
+
"optional": True, # optional
|
|
384
|
+
},
|
|
385
|
+
{
|
|
386
|
+
"file_path": "analysis/rna/{run}/{run}_{caller}.output.twist.neoantigen.vep.rna.vcf",
|
|
387
|
+
"short_descr": "rna: Shared RNA and WES variants that is used for neoantigen prediction when RNA-seq data is provided with the WES run",
|
|
388
|
+
"long_descr": "Variants file representing the common variants between RNA (haplotyper.rna.vcf.gz) and WES data (output.twist.neoantigen.vep.vcf).",
|
|
389
|
+
"filter_group": "rna",
|
|
390
|
+
"optional": True, # optional
|
|
391
|
+
},
|
|
392
|
+
############################## somatic ##############################
|
|
393
|
+
{
|
|
394
|
+
"file_path": "analysis/somatic/{run}/{run}_{caller}.output.vcf.gz",
|
|
395
|
+
"short_descr": "somatic variants: vcf file of somatic variants",
|
|
396
|
+
"long_descr": """VCF file of somatic variants using one of the following the Sentieon somatic callers {tnscope (default), tnhaplotyper2, tnsnv}.
|
|
397
|
+
|
|
398
|
+
TNscope algorithm- https://support.sentieon.com/manual/usages/general/#tnscope-algorithm
|
|
399
|
+
TNhaplotyper2- https://support.sentieon.com/manual/usages/general/#tnhaplotyper2-algorithm
|
|
400
|
+
TNsnv - https://support.sentieon.com/manual/usages/general/#tnsnv-algorithm""",
|
|
401
|
+
"filter_group": "somatic",
|
|
402
|
+
},
|
|
403
|
+
# Alternative (inactive) wording for the TWIST vcf/maf entries; the active entries follow below
|
|
404
|
+
# {
|
|
405
|
+
# "file_path": "analysis/somatic/{run}/{run}_{caller}.output.twist.vcf",
|
|
406
|
+
# "short_descr": "somatic variants: vcf file of somatic variants in TWIST targeted capture regions",
|
|
407
|
+
# "long_descr": "VCF file of somatic variants that are within the TWIST exome capture regions. bcftools is used to filter reads in output.vcf.gz that intersect with the TWIST capture regions.",
|
|
408
|
+
# "filter_group": "somatic",
|
|
409
|
+
# },
|
|
410
|
+
# {
|
|
411
|
+
# "file_path": "analysis/somatic/{run}/{run}_{caller}.twist.maf",
|
|
412
|
+
# "short_descr": "somatic variants: maf file of somatic variants in TWIST targeted capture regions",
|
|
413
|
+
# "long_descr": "MAF file of TWIST variants using vcf2maf tool (https://github.com/mskcc/vcf2maf). The vep annotated vcf of the TWIST variants (output.twist.vcf) was converted to maf using vcf2maf.",
|
|
414
|
+
# "filter_group": "somatic",
|
|
415
|
+
# },
|
|
416
|
+
{
|
|
417
|
+
"file_path": "analysis/somatic/{run}/{run}_{caller}.output.twist.vcf",
|
|
418
|
+
"short_descr": "somatic variants: vcf file of somatic variants in TWIST targed capture region",
|
|
419
|
+
"long_descr": "VCF file of variants that fall within the TWIST excome capture regions. bcftools is used to filter reads in output.vcf.gz that intersect with the TWIST capture regions.",
|
|
420
|
+
"filter_group": "somatic",
|
|
421
|
+
},
|
|
422
|
+
{
|
|
423
|
+
"file_path": "analysis/somatic/{run}/{run}_{caller}.output.twist.maf",
|
|
424
|
+
"short_descr": "somatic variants: maf file of somatic variants in TWIST targed capture region",
|
|
425
|
+
"long_descr": "MAF file of variants that fall within the TWIST excome capture regions generated using vcf2maf tool (https://github.com/mskcc/vcf2maf). VEP was used to annotate twist.vcf file, which was then used as input to vcf2maf. NOTE: Some columns in this maf file may be affected by the ExACdb assembly compatibility issue discussed in the WES pipeline overview page (https://cidc.nci.nih.gov/pipelines/wes).",
|
|
426
|
+
"filter_group": "somatic",
|
|
427
|
+
},
|
|
428
|
+
{
|
|
429
|
+
"file_path": "analysis/somatic/{run}/{run}_{caller}.output.twist.filtered.vcf",
|
|
430
|
+
"short_descr": "somatic variants: vcf file of somatic variants in TWIST targed capture region filtered by PASS column",
|
|
431
|
+
"long_descr": "VCF file of variants that fall within the TWIST excome capture regions filtered to remove vairants where the PASS column contained one of the following- germline-risk, low_t_alt_frac, t_lod_fstar, or triallelic_site",
|
|
432
|
+
"filter_group": "somatic",
|
|
433
|
+
},
|
|
434
|
+
{
|
|
435
|
+
"file_path": "analysis/somatic/{run}/{run}_{caller}.output.twist.filtered.maf",
|
|
436
|
+
"short_descr": "somatic variants: maf file of somatic variants in TWIST targed capture region filtered by PASS column",
|
|
437
|
+
"long_descr": "MAF file generated by converting twist.filtered.vcf to maf using VEP to annotate variants and vcf2maf to do the conversion. NOTE: Some columns in this maf file may be affected by the ExACdb assembly compatibility issue discussed in the WES pipeline overview page (https://cidc.nci.nih.gov/pipelines/wes).",
|
|
438
|
+
"filter_group": "somatic",
|
|
439
|
+
},
|
|
440
|
+
############################## tcellextrect ##############################
|
|
441
|
+
{
|
|
442
|
+
"file_path": "analysis/tcellextrect/{run}/{run}_tcellextrect.txt",
|
|
443
|
+
"short_descr": "tcell: TCell fraction estimates generated by TcellExTRECT",
|
|
444
|
+
"long_descr": "TCell fraction estimates generated by the TcellExTRECT software (https://github.com/McGranahanLab/TcellExTRECT)",
|
|
445
|
+
"filter_group": "tcell",
|
|
446
|
+
},
|
|
447
|
+
# DEPRECATED by storing output.twist.vcf
|
|
448
|
+
# {
|
|
449
|
+
# "file_path": "analysis/somatic/{run}/{run}_{caller}.filter.exons.center_targets.vcf.gz",
|
|
450
|
+
# "short_descr": "somatic variants: vcf file of filtered somatic variants from center target regions",
|
|
451
|
+
# "long_descr": "VCF file of filtered somatic variants from center target regions using bcftools (http://samtools.github.io/bcftools/bcftools.html).",
|
|
452
|
+
# "filter_group": "somatic",
|
|
453
|
+
# },
|
|
454
|
+
# DEPRECATED b/c we already have report/somatic_variants/05_tumor_germline_overlap.tsv",
|
|
455
|
+
# {
|
|
456
|
+
# "file_path": "analysis/germline/{run}/{run}_vcfcompare.txt", # GERMLINE
|
|
457
|
+
# "short_descr": "somatic variants: overlap of somatic and germline variants",
|
|
458
|
+
# "long_descr": "VCFtool's vcf-compare (http://vcftools.sourceforge.net/perl_module.html#vcf-compare) is used to compare somatic and germline variants. The file shows the number of common variants, somatic only, and germline only variants.",
|
|
459
|
+
# "filter_group": "somatic",
|
|
460
|
+
# "tumor_only_assay": False,
|
|
461
|
+
# },
|
|
462
|
+
# DEPRECATED b/c sentieon cnv caller is removed
|
|
463
|
+
# {
|
|
464
|
+
# "file_path": "analysis/copynumber/{run}/{run}_cnvcalls.txt", # CNV
|
|
465
|
+
# "short_descr": "copynumber: copynumber analysis results",
|
|
466
|
+
# "long_descr": "Copy number variation analysis results using Sentieon CNV algorithm (https://support.sentieon.com/appnotes/cnv/)",
|
|
467
|
+
# "filter_group": "copynumber",
|
|
468
|
+
# },
|
|
469
|
+
# {
|
|
470
|
+
# "file_path": "analysis/copynumber/{run}/{run}_cnvcalls.txt.tn.tsv",
|
|
471
|
+
# "short_descr": "copynumber: copynumber analysis results",
|
|
472
|
+
# "long_descr": "Segmented copy number variation file using Sentieon CNV algorithm (https://support.sentieon.com/appnotes/cnv/)",
|
|
473
|
+
# "filter_group": "copynumber",
|
|
474
|
+
# },
|
|
475
|
+
]
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def main():
    """Write the WES output-API JSON description next to this module.

    Expands the wildcard templates in the module-level ``run_files`` and
    ``sample_files`` lists via ``evalWildcards``, wraps each result in a
    ``Wesfile``, and groups them under the keys ``"run id"``,
    ``"normal cimac id"`` and ``"tumor cimac id"``. The grouping is then
    serialized as JSON (using ``dumper`` for non-JSON-native objects) into
    the directory containing this file.

    Command-line options:
        -t / --tumor_only: write ``wes_tumor_only_output_API.json`` instead
            of ``wes_output_API.json``. The ``"normal cimac id"`` group is
            dropped and only entries whose ``tumor_only_assay`` attribute is
            truthy are kept in the remaining groups.
    """
    usage = "USAGE: %prog -t [tumor_only assay files (default: False--prints tumor/normal assay files)]"
    optparser = OptionParser(usage=usage)
    optparser.add_option(
        "-t",
        "--tumor_only",
        help="print files for tumor_only assay (default: False)",
        default=False,
        action="store_true",
    )
    # parse_args() reads sys.argv[1:] by default, so the script name does not
    # end up among the (unused) positional arguments.
    options, _ = optparser.parse_args()

    # Run-level files: substitute the run id first, then pin the caller.
    run_templates = [evalWildcards(f, "{run}", "{run id}") for f in run_files]
    run_id_files = [
        Wesfile(evalWildcards(f, "{caller}", "tnscope")) for f in run_templates
    ]

    # NOTE: the trailing True is the is_optional argument of evalWildcards;
    # it is passed for normal samples only.
    normal_files = [
        Wesfile(evalWildcards(f, "{sample}", "{normal cimac id}", True))
        for f in sample_files
    ]
    tumor_files = [
        Wesfile(evalWildcards(f, "{sample}", "{tumor cimac id}"))
        for f in sample_files
    ]

    tmp = {
        "run id": run_id_files,
        "normal cimac id": normal_files,
        "tumor cimac id": tumor_files,
    }

    if options.tumor_only:
        # Tumor-only assay: there are no normal samples at all ...
        del tmp["normal cimac id"]
        # ... and only files flagged tumor_only_assay are kept.
        for group in ("run id", "tumor cimac id"):
            tmp[group] = [f for f in tmp[group] if f.tumor_only_assay]
        output_f = "wes_tumor_only_output_API.json"
    else:
        output_f = "wes_output_API.json"

    # Write the file; the context manager guarantees the handle is flushed
    # and closed even if serialization raises.
    output_path = os.path.join(os.path.dirname(__file__), output_f)
    with open(output_path, "w") as out_fh:
        json.dump(tmp, out_fh, default=dumper, indent=4)


# Script entry point: generate the JSON only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|