nci-cidc-schemas 0.28.0__py2.py3-none-any.whl → 0.28.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. cidc_schemas/__init__.py +1 -1
  2. cidc_schemas/ngs_pipeline_api/__init__.py +29 -0
  3. cidc_schemas/ngs_pipeline_api/atacseq/atacseq.md +55 -0
  4. cidc_schemas/ngs_pipeline_api/atacseq/atacseq_output_API.json +39 -0
  5. cidc_schemas/ngs_pipeline_api/atacseq/imgs/atacseq.png +0 -0
  6. cidc_schemas/ngs_pipeline_api/output_API.schema.json +45 -0
  7. cidc_schemas/ngs_pipeline_api/rna/imgs/RIMA.png +0 -0
  8. cidc_schemas/ngs_pipeline_api/rna/rna.md +54 -0
  9. cidc_schemas/ngs_pipeline_api/rna/rna_config.schema.json +39 -0
  10. cidc_schemas/ngs_pipeline_api/rna/rna_output_API.json +195 -0
  11. cidc_schemas/ngs_pipeline_api/tcr/imgs/TCRseq.png +0 -0
  12. cidc_schemas/ngs_pipeline_api/tcr/tcr.md +101 -0
  13. cidc_schemas/ngs_pipeline_api/wes/imgs/wes.png +0 -0
  14. cidc_schemas/ngs_pipeline_api/wes/wes.md +46 -0
  15. cidc_schemas/ngs_pipeline_api/wes/wes_config.schema.json +82 -0
  16. cidc_schemas/ngs_pipeline_api/wes/wes_output_API.json +503 -0
  17. cidc_schemas/ngs_pipeline_api/wes/wes_output_API.py +548 -0
  18. cidc_schemas/ngs_pipeline_api/wes/wes_tumor_only_output_API.json +213 -0
  19. {nci_cidc_schemas-0.28.0.dist-info → nci_cidc_schemas-0.28.2.dist-info}/METADATA +2 -2
  20. {nci_cidc_schemas-0.28.0.dist-info → nci_cidc_schemas-0.28.2.dist-info}/RECORD +24 -7
  21. {nci_cidc_schemas-0.28.0.dist-info → nci_cidc_schemas-0.28.2.dist-info}/WHEEL +0 -0
  22. {nci_cidc_schemas-0.28.0.dist-info → nci_cidc_schemas-0.28.2.dist-info}/entry_points.txt +0 -0
  23. {nci_cidc_schemas-0.28.0.dist-info → nci_cidc_schemas-0.28.2.dist-info}/licenses/LICENSE +0 -0
  24. {nci_cidc_schemas-0.28.0.dist-info → nci_cidc_schemas-0.28.2.dist-info}/top_level.txt +0 -0
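The bulk of the new material is the cidc_schemas/ngs_pipeline_api package, whose per-assay *_output_API.json files describe the expected pipeline outputs. As a minimal sketch (assuming only that these JSON files are installed with the wheel at the paths listed above; the reader-side code below is illustrative and not part of the package), one way to load one of them:

# Minimal sketch: read one of the packaged output-API JSON files.
# Assumes the wheel is installed and the data files ship at the listed paths.
import json
from importlib import resources

with resources.files("cidc_schemas.ngs_pipeline_api").joinpath(
    "wes/wes_output_API.json"
).open() as fh:
    wes_api = json.load(fh)

print(sorted(wes_api))  # e.g. ["normal cimac id", "run id", "tumor cimac id"]

The diff below is for cidc_schemas/ngs_pipeline_api/wes/wes_output_API.py, the script that generates those WES output-API JSON files.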
cidc_schemas/ngs_pipeline_api/wes/wes_output_API.py
@@ -0,0 +1,548 @@
+ #!/usr/bin/env python
+ """Len Taing 2020 (TGBTG)"""
+
+ import os
+ import sys
+ import json
+ from optparse import OptionParser
+
+
+ class Wesfile:
+     """General wes file object that will handle outputting to the appropriate
+     json API format"""
+
+     def __init__(self, file_dict):
+         """Given a file path, initializes this object to that path
+         NOTE: filepath may include snakemake wildcards, e.g.
+         analysis/germline/{run id}/{run id}_vcfcompare.txt"""
+         # print(file_tuple)
+         self.file_path_template = file_dict["file_path"]
+         self.short_description = file_dict["short_descr"]
+         self.long_description = file_dict["long_descr"]
+         self.filter_group = file_dict["filter_group"]
+         self.file_purpose = file_dict.get("file_purpose", "Analysis view")
+         self.optional = file_dict.get("optional", False)
+         self.tumor_only_assay = file_dict.get(
+             "tumor_only_assay", True
+         )  # default: everything is part of the tumor_only assay; below I mark this field False for normal-sample files
+
+     def __str__(self):
+         return self.__dict__.__str__()
+
+
+ def dumper(obj):
+     # ref: https://www.semicolonworld.com/question/42934/how-to-make-a-class-json-serializable
+     try:
+         return obj.toJSON()
+     except:
+         return obj.__dict__
+
+
+ def evalWildcards(file_tuple, wildcard, s, is_optional=False):
+     # file_tuple[0] = file_tuple[0].replace(wildcard, s)
+     # Non-destructive replacement
+     # print(file_tuple)
+     ret = file_tuple.copy()
+     ret["file_path"] = ret["file_path"].replace(wildcard, s)
+     return ret
+
+
+ sample_files = [
+     ############################## ALIGN ##############################
+     {
+         "file_path": "analysis/align/{sample}/{sample}.sorted.dedup.bam",
+         "short_descr": "alignment: bam file with deduplicated reads",
+         "long_descr": "Aligned reads were sorted and marked duplicates were removed using the Sentieon Dedup tool (https://support.sentieon.com/manual/usages/general/#dedup-algorithm)",
+         "filter_group": "alignment",
+         "file_purpose": "Source view",
+     },
+     {
+         "file_path": "analysis/align/{sample}/{sample}.sorted.dedup.bam.bai",
+         "short_descr": "alignment: index file for deduplicated bam",
+         "long_descr": "Bam index file for the deduplicated bam file generated by the Sentieon Dedup tool (https://support.sentieon.com/manual/usages/general/#dedup-algorithm)",
+         "filter_group": "alignment",
+         "file_purpose": "Source view",
+     },
+     {
+         "file_path": "analysis/align/{sample}/{sample}_recalibrated.bam",
+         "short_descr": "alignment: Base Quality Score Recalibration (BQSR) bam file",
+         "long_descr": "The Sentieon QualCal tool (https://support.sentieon.com/manual/usages/general/#qualcal-algorithm) is used to perform BQSR and remove any technical artifacts in the base quality scores.",
+         "filter_group": "alignment",
+         "file_purpose": "Source view",
+     },
+     {
+         "file_path": "analysis/align/{sample}/{sample}_recalibrated.bam.bai",
+         "short_descr": "alignment: index file for the Base Quality Score Recalibration (BQSR) bam file",
+         "long_descr": "Index file for the BQSR bam file",
+         "filter_group": "alignment",
+         "file_purpose": "Source view",
+     },
+     ############################## germline ##############################
+     # NOTE: we're ingesting both the tumor and the normal sample's germline calls,
+     # but the only one that counts is the normal sample's germline call
+     {
+         "file_path": "analysis/germline/{sample}/{sample}_haplotyper.output.vcf",
+         "short_descr": "germline: germline variants",
+         "long_descr": "Haplotype variants called using the Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
+         "filter_group": "germline",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/germline/{sample}/{sample}_haplotyper.targets.vcf.gz",
+         "short_descr": "germline: vcf of haplotype variants in targeted regions",
+         "long_descr": "Haplotype variants within targeted capture regions called using the Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
+         "filter_group": "germline",
+         "tumor_only_assay": False,
+     },
+     ###################################################################
+     # NOTE: FOR all hla callers we're ingesting both the tumor and normal
+     # sample, but the one that counts is the tumor sample's
+     #
+     # ALSO: for hla, we're not stashing the chr6 fastqs (which are used
+     # as input for the HLA callers) b/c those can be quickly re-derived
+     # from sorted.dedup.bam -- under 30secs per sample
+     ############################## hlahd ##############################
+     {
+         "file_path": "analysis/hlahd/{sample}/result/{sample}_final.result.txt",
+         "short_descr": "hla: MHC Class I and II results (using HLA-HD)",
+         "long_descr": "Predicted MHC Class I and II results using the HLA-HD software (https://www.genome.med.kyoto-u.ac.jp/HLA-HD/). Chromosome 6 reads from the deduplicated bam file were extracted and fed into the HLA-HD prediction algorithm.",
+         "filter_group": "HLA",
+     },
+     ############################## optitype #######################
+     {
+         "file_path": "analysis/optitype/{sample}/{sample}_result.tsv",
+         "short_descr": "hla: MHC Class I results (using OptiType)",
+         "long_descr": "Predicted MHC Class I alleles using the OptiType software (https://github.com/FRED-2/OptiType). Chromosome 6 reads from the deduplicated bam file were extracted and fed into the OptiType prediction algorithm.",
+         "filter_group": "HLA",
+     },
+     ############################## xhla #######################
+     {
+         "file_path": "analysis/xhla/{sample}/report-{sample}-hla.json",
+         "short_descr": "hla: MHC Class I and II results (using xhla)",
+         "long_descr": "Predicted MHC Class I and II results using the xHLA software (https://github.com/humanlongevity/HLA). Chromosome 6 reads from the deduplicated bam file were extracted and fed into the xHLA prediction algorithm.",
+         "filter_group": "HLA",
+     },
+     #####################################################################
+     # 2022-06-14 DEPRECATING ALL metrics for ingestion for the following
+     # reasons:
+     # 1. They are BIG and USELESS (we have never looked at them)
+     # 2. the sample's summary coverage, e.g. total_reads, mean_depth,
+     #    percent_bases_gt_50, etc. are sufficient in 99.9% of the cases!
+     # 3. since we are stashing the sorted.dedup.bam file, we can always
+     #    re-derive them IF we need!
+     ############################## metrics ##############################
+     # {
+     #     "file_path": "analysis/metrics/{sample}/{sample}_coverage_metrics.txt",
+     #     "short_descr": "coverage: global coverage file",
+     #     "long_descr": "Genome wide coverage file generated using the Sentieon CoverageMetrics algorithm (https://support.sentieon.com/manual/usages/general/#coveragemetrics-algorithm) with a coverage threshold (cov_thresh) set to 50.",
+     #     "filter_group": "coverage",
+     # },
+     # {
+     #     "file_path": "analysis/metrics/{sample}/{sample}_target_metrics.txt",
+     #     "short_descr": "coverage: target region coverage file",
+     #     "long_descr": "Targeted exome regions coverage file generated using the Sentieon CoverageMetrics algorithm (https://support.sentieon.com/manual/usages/general/#coveragemetrics-algorithm) with a coverage threshold (cov_thresh) set to 50.",
+     #     "filter_group": "coverage",
+     # },
+     # {
+     #     "file_path": "analysis/metrics/{sample}/{sample}_coverage_metrics.sample_summary.txt",
+     #     "short_descr": "coverage: global coverage summary file",
+     #     "long_descr": "Genome wide coverage summary file generated by the Sentieon CoverageMetrics algorithm (https://support.sentieon.com/manual/usages/general/#coveragemetrics-algorithm).",
+     #     "filter_group": "coverage",
+     # },
+ ]
+
+ run_files = [
+     ############################## MISC ##############################
+     {
+         "file_path": "analysis/{run}_error.yaml",
+         "short_descr": "yaml file that specifies error codes for files",
+         "long_descr": "Explanation of all files which are expected to be empty due to a failed/missing module.",
+         "optional": True,  # optional
+         "filter_group": "",
+     },
+     ############################## clonality ##############################
+     # sequenza files
+     {
+         "file_path": "analysis/clonality/{run}/{run}_segments.txt",
+         "short_descr": "copynumber: Sequenza CNV segments file",
+         "long_descr": "Copy number variation segments file called by the Sequenza software package. The column descriptions for the segments file can be found here (https://cran.r-project.org/web/packages/sequenza/vignettes/sequenza.html#plots-and-results)",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/clonality/{run}/{run}_genome_view.pdf",
+         "short_descr": "copynumber: Sequenza genome-wide plot of depth.ratio and B-allele frequency.",
+         "long_descr": "Genome-wide plot (generated by Sequenza) showing depth.ratio and B-allele frequency.",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/clonality/{run}/{run}_chromosome_view.pdf",
+         "short_descr": "copynumber: Sequenza plot of depth.ratio and B-allele frequency, chromosome by chromosome.",
+         "long_descr": "Chromosome-by-chromosome plot (generated by Sequenza) showing depth.ratio and B-allele frequency.",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/clonality/{run}/{run}_sequenza_gainLoss.bed",
+         "short_descr": "copynumber: Sequenza CNV segments file filtered with hard cut-offs to call regions of GAIN/LOSS",
+         "long_descr": "Filtered Sequenza segments file after applying a hard cut-off to call regions of GAIN (total copy number >= 3) and regions of LOSS (total copy number <= 1.5).",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     ######################################################################
+     # NOTE: stashing the final.seqz file because it 1. is SMALL (under 100MB)
+     # and 2. is very hard to re-derive from recalibrated.bam
+     ######################################################################
+     {
+         "file_path": "analysis/clonality/{run}/{run}.bin50.final.seqz.txt.gz",
+         "short_descr": "copynumber: Sequenza post-processed seqz file used as input to the Sequenza CNV caller",
+         "long_descr": "Sequenza seqz file generated by the bam2seqz software using a GC wiggle track with a window size of 50 (-w 50).",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     # purity files
+     {
+         "file_path": "analysis/clonality/{run}/{run}_alternative_solutions.txt",
+         "short_descr": "purity: Sequenza Cellularity and Ploidy estimate file",
+         "long_descr": "Cellularity and ploidy estimates of the tumor sample using the Sequenza software package. The columns of the file are as follows: Cellularity, Ploidy, and SLPP (Scaled Log Posterior Probability).",
+         "filter_group": "purity",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/clonality/{run}/{run}_CP_contours.pdf",
+         "short_descr": "purity: Sequenza plot of likelihood densities for all cellularity/ploidy solutions.",
+         "long_descr": "Sequenza-generated plot showing the likelihood densities for each cellularity/ploidy solution (https://cran.r-project.org/web/packages/sequenza/vignettes/sequenza.html#plots-and-results).",
+         "filter_group": "purity",
+         "tumor_only_assay": False,
+     },
+     # pyclone files
+     {
+         "file_path": "analysis/clonality/{run}/{run}_pyclone6.input.tsv",
+         "short_descr": "tumor clonality: PyClone-VI input file generated by the sequenza library (https://cran.r-project.org/web/packages/sequenza/index.html)",
+         "long_descr": "Input file generated for PyClone-VI analysis. Sequenza was used to generate the expected file format (https://github.com/Roth-Lab/pyclone-vi#input-format).",
+         "filter_group": "clonality",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/clonality/{run}/{run}_pyclone6.results.tsv",
+         "short_descr": "tumor clonality: PyClone-VI tumor clonality results file",
+         "long_descr": "Tumor clone/cluster prevalence estimations generated by the PyClone-VI software package. The format of the results file is described here (https://github.com/Roth-Lab/pyclone-vi#output-format).",
+         "filter_group": "clonality",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/clonality/{run}/{run}_pyclone6.results.summary.tsv",
+         "short_descr": "tumor clonality: PyClone-VI tumor clonality results summary file",
+         "long_descr": "Summary of the PyClone-VI results file, condensed to only show the cluster_id, cellular_prevalence, and cellular_prevalence_std columns.",
+         "filter_group": "clonality",
+         "tumor_only_assay": False,
+     },
+     ############################## CNVkit ##############################
+     {
+         "file_path": "analysis/cnvkit/{run}/{run}.call.cns",
+         "short_descr": "copynumber: CNVkit segments file",
+         "long_descr": "CNVkit's segmented log2 ratios file. The 'cn' column represents the total copy number of the segment. The other columns of the results file are described here (https://cnvkit.readthedocs.io/en/stable/fileformats.html#segmented-log2-ratios-cns)",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/cnvkit/{run}/{run}.call.enhanced.cns",
+         "short_descr": "copynumber: Enhanced CNVkit segments file with BAF and major/minor allele information",
+         "long_descr": "The enhanced CNVkit segments file incorporates somatic SNP and tumor purity information (called by the pipeline) to add B-allele frequencies and major and minor alleles (cn1 and cn2, respectively), and to correct for the tumor sample's purity level.",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/cnvkit/{run}/{run}.scatter.png",
+         "short_descr": "copynumber: scatter plot of log2 coverage and segmentation call information",
+         "long_descr": "Genome-wide scatter plot of log2 coverage ratios and called CNV segments",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/cnvkit/{run}/{run}_cnvkit_gainLoss.bed",
+         "short_descr": "copynumber: CNVkit segments file filtered with hard cut-offs to call regions of GAIN/LOSS",
+         "long_descr": "Filtered CNVkit segments file after applying a hard cut-off to call regions of GAIN (total copy number >= 3) and regions of LOSS (total copy number <= 1.5).",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     ############################## Copynumber ##############################
+     {
+         "file_path": "analysis/copynumber/{run}/{run}_consensus.bed",
+         "short_descr": "copynumber: Consensus CNV segments file",
+         "long_descr": "Consensus CNV regions that are called by at least 2 of the 3 callers (CNVkit, Sequenza, or FACETS). CNV callers must agree on both the region (intersection of overlapped regions) and the call (GAIN or LOSS).",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/copynumber/{run}/{run}_consensus_merged_GAIN.bed",
+         "short_descr": "copynumber: Consensus CNV segments file of only GAIN regions",
+         "long_descr": "GAIN-only CNV regions derived from the consensus CNV file. Regions are also merged if they overlap by at least 1bp.",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/copynumber/{run}/{run}_consensus_merged_LOSS.bed",
+         "short_descr": "copynumber: Consensus CNV segments file of only LOSS regions",
+         "long_descr": "LOSS-only CNV regions derived from the consensus CNV file. Regions are also merged if they overlap by at least 1bp.",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     ############################## msisensor2 ##############################
+     {
+         "file_path": "analysis/msisensor2/{run}/{run}_msisensor2.txt",
+         "short_descr": "msisensor2: microsatellite instability calculation",
+         "long_descr": "Microsatellite instability calculation using msisensor2 (https://github.com/niu-lab/msisensor2)",
+         "filter_group": "msisensor2",
+     },
+     ############################## neoantigen ##############################
+     {
+         "file_path": "analysis/neoantigen/{run}/combined/{run}.filtered.tsv",
+         "short_descr": "neoantigen: list of predicted neoantigens",
+         "long_descr": "The combined MHC class I and II predicted neoantigens using the pVACseq software. The column definitions are given here (ref: https://pvactools.readthedocs.io/en/latest/pvacseq/output_files.html)",
+         "filter_group": "neoantigen",
+     },
+     ############################## purity ##############################
+     {
+         "file_path": "analysis/purity/{run}/{run}.optimalpurityvalue.txt",
+         "short_descr": "tumor purity: tumor purity estimates using the FACETS software package",
+         "long_descr": "Tumor purity estimates using the FACETS software (https://github.com/mskcc/facets).",
+         "filter_group": "purity",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/purity/{run}/{run}.cncf",
+         "short_descr": "copynumber: FACETS CNV segments file",
+         "long_descr": "Copy number variation segments file called by the FACETS software (https://github.com/mskcc/facets).",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     {
+         "file_path": "analysis/purity/{run}/{run}_facets_gainLoss.bed",
+         "short_descr": "copynumber: FACETS CNV segments file filtered with hard cut-offs to call regions of GAIN/LOSS",
+         "long_descr": "Filtered FACETS segments file after applying a hard cut-off to call regions of GAIN (total copy number >= 3) and regions of LOSS (total copy number <= 1.5).",
+         "filter_group": "copynumber",
+         "tumor_only_assay": False,
+     },
+     ############################## report ##############################
+     {
+         "file_path": "analysis/report.tar.gz",
+         "short_descr": "wes report: wes summary html report",
+         "long_descr": "This is a gzipped file of the report directory, which contains the report.html file. After unzipping the file, the user can load report/report.html into any browser to view the WES Summary Report. The report contains run information (i.e. the wes software version used to run the analysis as well as the software versions of the major tools) and summarizations of sample quality, copy number variation, somatic variants, and HLA-type/neoantigen predictions.",
+         "filter_group": "report",
+     },
+     {
+         "file_path": "analysis/report/somatic_variants/05_tumor_germline_overlap.tsv",
+         "short_descr": "somatic variants: report file of tumor vs germline variants overlap",
+         "long_descr": "This file is derived from the somatic and germline variants comparison results generated by vcf-compare (http://vcftools.sourceforge.net/perl_module.html#vcf-compare) and is formatted to be human readable. The file reports the number of somatic/tumor-only variants (unfiltered), germline/normal-only variants (unfiltered), the number of shared variants, and the percent overlap (using the total number of somatic variants as the denominator).",
+         "filter_group": "somatic",
+         "tumor_only_assay": False,
+     },
+     # DEPRECATED! b/c we have 3 other HLA files!
+     # {
+     #     "file_path": "analysis/report/neoantigens/01_HLA_Results.tsv",  # HLA
+     #     "short_descr": "hla: report file of combined MHC class I and II results",
+     #     "long_descr": "This file reports the MHC class I and II results. The class I alleles are derived from the OptiType results and the class II alleles come from the HLA-HD results.",
+     #     "filter_group": "HLA",
+     # },
+     {
+         "file_path": "analysis/report/WES_Meta/02_WES_Run_Version.tsv",
+         "short_descr": "wes pipeline version- INTERNAL ONLY- for reproducibility",
+         "long_descr": "wes pipeline version- INTERNAL ONLY- for reproducibility",
+         "filter_group": "report",
+         "file_purpose": "Miscellaneous",
+     },
+     {
+         "file_path": "analysis/report/config.yaml",
+         "short_descr": "wes pipeline config file- INTERNAL ONLY- for reproducibility",
+         "long_descr": "wes pipeline config file- INTERNAL ONLY- for reproducibility",
+         "filter_group": "report",
+         "file_purpose": "Miscellaneous",
+     },
+     {
+         "file_path": "analysis/report/metasheet.csv",
+         "short_descr": "wes pipeline metasheet file- INTERNAL ONLY- for reproducibility",
+         "long_descr": "wes pipeline metasheet file- INTERNAL ONLY- for reproducibility",
+         "filter_group": "report",
+         "file_purpose": "Miscellaneous",
+     },
+     {
+         "file_path": "analysis/report/json/{run}.wes.json",
+         "short_descr": "wes sample json for cohort report generation-INTERNAL ONLY",
+         "long_descr": "wes sample json for cohort report generation-INTERNAL ONLY",
+         "filter_group": "report",
+         "file_purpose": "Miscellaneous",
+     },
+     ############################## rna ##############################
+     {
+         "file_path": "analysis/rna/{run}/{run}.haplotyper.rna.vcf.gz",  # RNA
+         "short_descr": "rna: Variants called from RNA-seq data",
+         "long_descr": "RNA-seq variants called using the Sentieon RNA Variant Calling pipeline (https://support.sentieon.com/manual/RNA_call/rna/). Sentieon's Haplotyper algorithm was used for the variant calling.",
+         "filter_group": "rna",
+         "optional": True,  # optional
+     },
+     {
+         "file_path": "analysis/rna/{run}/{run}_{caller}.output.twist.neoantigen.vep.rna.vcf",
+         "short_descr": "rna: Shared RNA and WES variants that are used for neoantigen prediction when RNA-seq data is provided with the WES run",
+         "long_descr": "Variants file representing the common variants between RNA (haplotyper.rna.vcf.gz) and WES data (output.twist.neoantigen.vep.vcf).",
+         "filter_group": "rna",
+         "optional": True,  # optional
+     },
+     ############################## somatic ##############################
+     {
+         "file_path": "analysis/somatic/{run}/{run}_{caller}.output.vcf.gz",
+         "short_descr": "somatic variants: vcf file of somatic variants",
+         "long_descr": """VCF file of somatic variants called using one of the following Sentieon somatic callers {tnscope (default), tnhaplotyper2, tnsnv}.
+
+ TNscope algorithm- https://support.sentieon.com/manual/usages/general/#tnscope-algorithm
+ TNhaplotyper2- https://support.sentieon.com/manual/usages/general/#tnhaplotyper2-algorithm
+ TNsnv - https://support.sentieon.com/manual/usages/general/#tnsnv-algorithm""",
+         "filter_group": "somatic",
+     },
+     # another way of describing it / doing it
+     # {
+     #     "file_path": "analysis/somatic/{run}/{run}_{caller}.output.twist.vcf",
+     #     "short_descr": "somatic variants: vcf file of somatic variants in TWIST targeted capture regions",
+     #     "long_descr": "VCF file of somatic variants that are within the TWIST exome capture regions. bcftools is used to filter reads in output.vcf.gz that intersect with the TWIST capture regions.",
+     #     "filter_group": "somatic",
+     # },
+     # {
+     #     "file_path": "analysis/somatic/{run}/{run}_{caller}.twist.maf",
+     #     "short_descr": "somatic variants: maf file of somatic variants in TWIST targeted capture regions",
+     #     "long_descr": "MAF file of TWIST variants using the vcf2maf tool (https://github.com/mskcc/vcf2maf). The vep-annotated vcf of the TWIST variants (output.twist.vcf) was converted to maf using vcf2maf.",
+     #     "filter_group": "somatic",
+     # },
+     {
+         "file_path": "analysis/somatic/{run}/{run}_{caller}.output.twist.vcf",
+         "short_descr": "somatic variants: vcf file of somatic variants in TWIST targeted capture regions",
+         "long_descr": "VCF file of variants that fall within the TWIST exome capture regions. bcftools is used to filter reads in output.vcf.gz that intersect with the TWIST capture regions.",
+         "filter_group": "somatic",
+     },
+     {
+         "file_path": "analysis/somatic/{run}/{run}_{caller}.output.twist.maf",
+         "short_descr": "somatic variants: maf file of somatic variants in TWIST targeted capture regions",
+         "long_descr": "MAF file of variants that fall within the TWIST exome capture regions, generated using the vcf2maf tool (https://github.com/mskcc/vcf2maf). VEP was used to annotate the twist.vcf file, which was then used as input to vcf2maf. NOTE: Some columns in this maf file may be affected by the ExACdb assembly compatibility issue discussed in the WES pipeline overview page (https://cidc.nci.nih.gov/pipelines/wes).",
+         "filter_group": "somatic",
+     },
+     {
+         "file_path": "analysis/somatic/{run}/{run}_{caller}.output.twist.filtered.vcf",
+         "short_descr": "somatic variants: vcf file of somatic variants in TWIST targeted capture regions filtered by PASS column",
+         "long_descr": "VCF file of variants that fall within the TWIST exome capture regions, filtered to remove variants where the PASS column contained one of the following- germline-risk, low_t_alt_frac, t_lod_fstar, or triallelic_site",
+         "filter_group": "somatic",
+     },
+     {
+         "file_path": "analysis/somatic/{run}/{run}_{caller}.output.twist.filtered.maf",
+         "short_descr": "somatic variants: maf file of somatic variants in TWIST targeted capture regions filtered by PASS column",
+         "long_descr": "MAF file generated by converting twist.filtered.vcf to maf, using VEP to annotate variants and vcf2maf to do the conversion. NOTE: Some columns in this maf file may be affected by the ExACdb assembly compatibility issue discussed in the WES pipeline overview page (https://cidc.nci.nih.gov/pipelines/wes).",
+         "filter_group": "somatic",
+     },
+     ############################## tcellextrect ##############################
+     {
+         "file_path": "analysis/tcellextrect/{run}/{run}_tcellextrect.txt",
+         "short_descr": "tcell: T cell fraction estimates generated by TcellExTRECT",
+         "long_descr": "T cell fraction estimates generated by the TcellExTRECT software (https://github.com/McGranahanLab/TcellExTRECT)",
+         "filter_group": "tcell",
+     },
+     # DEPRECATED by storing output.twist.vcf
+     # {
+     #     "file_path": "analysis/somatic/{run}/{run}_{caller}.filter.exons.center_targets.vcf.gz",
+     #     "short_descr": "somatic variants: vcf file of filtered somatic variants from center target regions",
+     #     "long_descr": "VCF file of filtered somatic variants from center target regions using bcftools (http://samtools.github.io/bcftools/bcftools.html).",
+     #     "filter_group": "somatic",
+     # },
+     # DEPRECATED b/c we already have report/somatic_variants/05_tumor_germline_overlap.tsv
+     # {
+     #     "file_path": "analysis/germline/{run}/{run}_vcfcompare.txt",  # GERMLINE
+     #     "short_descr": "somatic variants: overlap of somatic and germline variants",
+     #     "long_descr": "VCFtools' vcf-compare (http://vcftools.sourceforge.net/perl_module.html#vcf-compare) is used to compare somatic and germline variants. The file shows the number of common variants, somatic-only variants, and germline-only variants.",
+     #     "filter_group": "somatic",
+     #     "tumor_only_assay": False,
+     # },
+     # DEPRECATED b/c the sentieon cnv caller is removed
+     # {
+     #     "file_path": "analysis/copynumber/{run}/{run}_cnvcalls.txt",  # CNV
+     #     "short_descr": "copynumber: copynumber analysis results",
+     #     "long_descr": "Copy number variation analysis results using the Sentieon CNV algorithm (https://support.sentieon.com/appnotes/cnv/)",
+     #     "filter_group": "copynumber",
+     # },
+     # {
+     #     "file_path": "analysis/copynumber/{run}/{run}_cnvcalls.txt.tn.tsv",
+     #     "short_descr": "copynumber: copynumber analysis results",
+     #     "long_descr": "Segmented copy number variation file using the Sentieon CNV algorithm (https://support.sentieon.com/appnotes/cnv/)",
+     #     "filter_group": "copynumber",
+     # },
+ ]
+
+
+ def main():
+     usage = "USAGE: %prog -t [print tumor_only assay files (default: False -- prints tumor/normal assay files)]"
+     optparser = OptionParser(usage=usage)
+     optparser.add_option(
+         "-t",
+         "--tumor_only",
+         help="print files for tumor_only assay (default: False)",
+         default=False,
+         action="store_true",
+     )
+     (options, args) = optparser.parse_args(sys.argv)
+
+     run_id_files = [
+         r for r in map(lambda x: evalWildcards(x, "{run}", "{run id}"), run_files)
+     ]
+     run_id_files = [
+         Wesfile(r)
+         for r in map(lambda x: evalWildcards(x, "{caller}", "tnscope"), run_id_files)
+     ]
+
+     normal_files = [
+         Wesfile(s)
+         # NOTE: sending in the is_optional param: True for evalWildcards for normal samples
+         for s in map(
+             lambda x: evalWildcards(x, "{sample}", "{normal cimac id}", True),
+             sample_files,
+         )
+     ]
+     # Will remove normals below IF options.tumor_only is True
+     # # remove normal files from tumor_only_assay
+     # for nf in normal_files:
+     #     nf.tumor_only_assay = False
+
+     tumor_files = [
+         Wesfile(s)
+         for s in map(
+             lambda x: evalWildcards(x, "{sample}", "{tumor cimac id}"), sample_files
+         )
+     ]
+
+     tmp = {
+         "run id": run_id_files,
+         "normal cimac id": normal_files,
+         "tumor cimac id": tumor_files,
+     }
+
+     if options.tumor_only:  # REMOVE normal files for the tumor_only assay
+         del tmp["normal cimac id"]
+         # also remove any item that is not marked tumor_only_assay
+         tmp["run id"] = list(
+             filter(lambda x: getattr(x, "tumor_only_assay"), tmp["run id"])
+         )
+         tmp["tumor cimac id"] = list(
+             filter(lambda x: getattr(x, "tumor_only_assay"), tmp["tumor cimac id"])
+         )
+         output_f = "wes_tumor_only_output_API.json"
+     else:
+         output_f = "wes_output_API.json"
+
+     # DUMP the file
+     json.dump(
+         tmp,
+         open(os.path.join(os.path.dirname(__file__), output_f), "w"),
+         default=dumper,
+         indent=4,
+     )
+     # print(json.dumps(tmp, default=dumper, indent=4))
+
+
+ if __name__ == "__main__":
+     main()
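The script above is the generator for the wes_output_API.json and wes_tumor_only_output_API.json files that ship next to it in this wheel. A rough usage sketch, assuming the script is invoked from its own directory (cidc_schemas/ngs_pipeline_api/wes/); the regeneration commands follow directly from the OptionParser setup above, while the inspection step is illustrative only:

# Illustrative only: regenerate both manifests and summarize them by filter group.
# Assumes the working directory is cidc_schemas/ngs_pipeline_api/wes/.
import json
import subprocess
import sys

subprocess.run([sys.executable, "wes_output_API.py"], check=True)        # tumor/normal manifest
subprocess.run([sys.executable, "wes_output_API.py", "-t"], check=True)  # tumor-only manifest

with open("wes_output_API.json") as fh:
    manifest = json.load(fh)

# Top-level keys are "run id", "normal cimac id", and "tumor cimac id"; each maps
# to a list of serialized Wesfile objects.
for key, entries in manifest.items():
    groups = sorted({e["filter_group"] for e in entries if e["filter_group"]})
    print(f"{key}: {len(entries)} files; filter groups: {', '.join(groups)}")

Because dumper() falls back to obj.__dict__, each manifest entry carries the Wesfile attributes (file_path_template, short_description, long_description, filter_group, file_purpose, optional, tumor_only_assay), which is what the summary above groups on.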