biopipen 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.py +0 -5
- biopipen/core/config.toml +4 -4
- biopipen/core/defaults.py +3 -3
- biopipen/core/filters.py +1 -0
- biopipen/core/proc.py +1 -3
- biopipen/core/testing.py +1 -2
- biopipen/ns/bam.py +10 -14
- biopipen/ns/bcftools.py +37 -7
- biopipen/ns/bed.py +9 -16
- biopipen/ns/cnv.py +8 -11
- biopipen/ns/cnvkit.py +32 -59
- biopipen/ns/cnvkit_pipeline.py +266 -310
- biopipen/ns/csv.py +0 -2
- biopipen/ns/gene.py +0 -1
- biopipen/ns/gsea.py +4 -10
- biopipen/ns/misc.py +0 -5
- biopipen/ns/plot.py +2 -4
- biopipen/ns/rnaseq.py +0 -1
- biopipen/ns/scrna.py +78 -120
- biopipen/ns/scrna_metabolic_landscape.py +306 -348
- biopipen/ns/tcgamaf.py +52 -0
- biopipen/ns/tcr.py +5 -15
- biopipen/ns/vcf.py +52 -34
- biopipen/ns/web.py +8 -19
- biopipen/reports/bam/CNAClinic.svelte +1 -1
- biopipen/reports/bam/CNVpytor.svelte +2 -2
- biopipen/reports/bam/ControlFREEC.svelte +1 -1
- biopipen/reports/cnv/AneuploidyScore.svelte +2 -2
- biopipen/reports/cnv/AneuploidyScoreSummary.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
- biopipen/reports/gsea/FGSEA.svelte +1 -1
- biopipen/reports/gsea/GSEA.svelte +2 -2
- biopipen/reports/scrna/CellsDistribution.svelte +1 -1
- biopipen/reports/scrna/DimPlots.svelte +1 -1
- biopipen/reports/scrna/GeneExpressionInvistigation.svelte +1 -1
- biopipen/reports/scrna/MarkersFinder.svelte +42 -39
- biopipen/reports/scrna/ScFGSEA.svelte +3 -3
- biopipen/reports/scrna/SeuratClusterStats.svelte +3 -3
- biopipen/reports/scrna/SeuratPreparing.svelte +2 -2
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubsets.svelte +2 -2
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +1 -1
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +1 -1
- biopipen/reports/tcr/CloneResidency.svelte +4 -4
- biopipen/reports/tcr/Immunarch.svelte +2 -2
- biopipen/reports/tcr/SampleDiversity.svelte +2 -2
- biopipen/reports/tcr/TCRClusteringStats.svelte +3 -3
- biopipen/reports/tcr/VJUsage.svelte +1 -1
- biopipen/reports/utils/gsea.liq +1 -1
- biopipen/reports/utils/misc.liq +1 -1
- biopipen/reports/vcf/TruvariBenchSummary.svelte +1 -1
- biopipen/reports/vcf/TruvariConsistency.svelte +3 -3
- biopipen/scripts/bcftools/BcftoolsSort.py +19 -0
- biopipen/scripts/scrna/MarkersFinder.R +73 -35
- biopipen/scripts/tcgamaf/Maf2Vcf.py +22 -0
- biopipen/scripts/tcgamaf/MafAddChr.py +14 -0
- biopipen/scripts/tcgamaf/maf2vcf.pl +427 -0
- biopipen/scripts/vcf/VcfAnno.py +26 -0
- biopipen/scripts/vcf/VcfFix_utils.py +3 -2
- {biopipen-0.7.0.dist-info → biopipen-0.8.0.dist-info}/METADATA +7 -8
- {biopipen-0.7.0.dist-info → biopipen-0.8.0.dist-info}/RECORD +65 -59
- {biopipen-0.7.0.dist-info → biopipen-0.8.0.dist-info}/WHEEL +1 -1
- {biopipen-0.7.0.dist-info → biopipen-0.8.0.dist-info}/entry_points.txt +2 -1
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
#!/usr/bin/env perl
|
|
2
|
+
|
|
3
|
+
# maf2vcf - Reformat variants in a given MAF into generic VCFs with GT:AD:DP data if available
|
|
4
|
+
|
|
5
|
+
# Modified version of the original script by @mskcc, which can be found here:
|
|
6
|
+
# https://github.com/mskcc/vcf2maf
|
|
7
|
+
# This is modified to:
|
|
8
|
+
# - Add path to samtools to arguments
|
|
9
|
+
# - Add Variant_Classification and Variant_Type to INFO field
|
|
10
|
+
# - Fix https://github.com/mskcc/vcf2maf/issues/234
|
|
11
|
+
# - Adding logs
|
|
12
|
+
|
|
13
|
+
use strict;
|
|
14
|
+
use warnings;
|
|
15
|
+
use IO::File;
|
|
16
|
+
use Getopt::Long qw( GetOptions );
|
|
17
|
+
use Pod::Usage qw( pod2usage );
|
|
18
|
+
|
|
19
|
+
# Set any default paths and constants
|
|
20
|
+
my $ref_fasta = "$ENV{HOME}/.vep/homo_sapiens/102_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa.gz";
|
|
21
|
+
my ( $tum_depth_col, $tum_rad_col, $tum_vad_col ) = qw( t_depth t_ref_count t_alt_count );
|
|
22
|
+
my ( $nrm_depth_col, $nrm_rad_col, $nrm_vad_col ) = qw( n_depth n_ref_count n_alt_count );
|
|
23
|
+
|
|
24
|
+
# Find out if samtools is properly installed, and warn the user if it's not
|
|
25
|
+
my ( $samtools ) = map{chomp; $_}`which samtools`;
|
|
26
|
+
# ( $samtools and -e $samtools ) or die "ERROR: Please install samtools, and make sure it's in your PATH\n";
|
|
27
|
+
|
|
28
|
+
# Check for missing or crappy arguments
|
|
29
|
+
unless( @ARGV and $ARGV[0]=~m/^-/ ) {
|
|
30
|
+
pod2usage( -verbose => 0, -message => "$0: Missing or invalid arguments!\n", -exitval => 2 );
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
# Parse options and print usage syntax on a syntax error, or if help was explicitly requested
|
|
34
|
+
my ( $man, $help, $per_tn_vcfs ) = ( 0, 0, 0 );
|
|
35
|
+
my ( $input_maf, $output_dir, $output_vcf );
|
|
36
|
+
GetOptions(
|
|
37
|
+
'help!' => \$help,
|
|
38
|
+
'man!' => \$man,
|
|
39
|
+
'samtools=s' => \$samtools,
|
|
40
|
+
'input-maf=s' => \$input_maf,
|
|
41
|
+
'output-dir=s' => \$output_dir,
|
|
42
|
+
'output-vcf=s' => \$output_vcf,
|
|
43
|
+
'ref-fasta=s' => \$ref_fasta,
|
|
44
|
+
'per-tn-vcfs!' => \$per_tn_vcfs,
|
|
45
|
+
'tum-depth-col=s' => \$tum_depth_col,
|
|
46
|
+
'tum-rad-col=s' => \$tum_rad_col,
|
|
47
|
+
'tum-vad-col=s' => \$tum_vad_col,
|
|
48
|
+
'nrm-depth-col=s' => \$nrm_depth_col,
|
|
49
|
+
'nrm-rad-col=s' => \$nrm_rad_col,
|
|
50
|
+
'nrm-vad-col=s' => \$nrm_vad_col
|
|
51
|
+
) or pod2usage( -verbose => 1, -input => \*DATA, -exitval => 2 );
|
|
52
|
+
pod2usage( -verbose => 1, -input => \*DATA, -exitval => 0 ) if( $help );
|
|
53
|
+
pod2usage( -verbose => 2, -input => \*DATA, -exitval => 0 ) if( $man );
|
|
54
|
+
|
|
55
|
+
# Check if required arguments are missing or problematic, fix as needed
|
|
56
|
+
( defined $input_maf and defined $output_dir ) or die "ERROR: --input-maf and --output-dir must be defined!\n";
|
|
57
|
+
( -s $ref_fasta ) or die "ERROR: Provided Reference FASTA is missing or empty! Path: $ref_fasta\n";
|
|
58
|
+
unless( defined $output_vcf ) {
|
|
59
|
+
$output_vcf = "$output_dir/" . substr( $input_maf, rindex( $input_maf, '/' ) + 1 );
|
|
60
|
+
$output_vcf =~ s/(\.)?(maf|tsv|txt)?$/.vcf/;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
# Before anything, let's parse the headers of this supposed "MAF-like" file and do some checks
|
|
64
|
+
my $maf_fh = IO::File->new( $input_maf ) or die "ERROR: Couldn't open input MAF: $input_maf!\n";
|
|
65
|
+
my ( %uniq_regions, %filter_tags, %flanking_bps, @tn_pair, %col_idx, $header_line );
|
|
66
|
+
my $i = 0;
|
|
67
|
+
print STDOUT "INFO: Parsing maf file to gather information for vcf header\n";
|
|
68
|
+
while( my $line = $maf_fh->getline ) {
|
|
69
|
+
++$i;
|
|
70
|
+
if ( $i % 1000 == 0 ) {
|
|
71
|
+
print STDOUT "INFO: Parsing line $i of input maf\n";
|
|
72
|
+
}
|
|
73
|
+
# If the file uses Mac OS 9 newlines, quit with an error
|
|
74
|
+
( $line !~ m/\r$/ ) or die "ERROR: Your MAF uses CR line breaks, which we can't support. Please use LF or CRLF.\n";
|
|
75
|
+
|
|
76
|
+
# Skip comment lines
|
|
77
|
+
next if( $line =~ m/^#/ );
|
|
78
|
+
|
|
79
|
+
# Instead of a chomp, do a thorough removal of carriage returns, line feeds, and prefixed/suffixed whitespace
|
|
80
|
+
my @cols = map{s/^\s+|\s+$|\r|\n//g; $_} split( /\t/, $line );
|
|
81
|
+
|
|
82
|
+
# Parse the header line to map column names to their indexes
|
|
83
|
+
if( $line =~ m/^(Hugo_Symbol|Chromosome|Tumor_Sample_Barcode)/i ) {
|
|
84
|
+
|
|
85
|
+
# Fetch the column names and do some sanity checks (don't be case-sensitive)
|
|
86
|
+
my $idx = 0;
|
|
87
|
+
$header_line = $line;
|
|
88
|
+
map{ my $c = lc; $col_idx{$c} = $idx; ++$idx; } @cols;
|
|
89
|
+
map{ my $c = lc; ( defined $col_idx{$c} ) or die "ERROR: $_ is a required MAF column!\n" } qw( Chromosome Start_Position Reference_Allele Tumor_Sample_Barcode );
|
|
90
|
+
( defined $col_idx{tumor_seq_allele1} or defined $col_idx{tumor_seq_allele2} ) or die "ERROR: At least one MAF column for Tumor_Seq_Allele must be defined!\n";
|
|
91
|
+
|
|
92
|
+
# Fetch all tumor-normal paired IDs from the MAF, doing some whitespace cleanup in the same step
|
|
93
|
+
my $tn_idx = $col_idx{tumor_sample_barcode} + 1;
|
|
94
|
+
$tn_idx .= ( "," . ( $col_idx{matched_norm_sample_barcode} + 1 )) if( defined $col_idx{matched_norm_sample_barcode} );
|
|
95
|
+
@tn_pair = map{s/^\s+|\s+$|\r|\n//g; s/\s*\t\s*/\t/; $_}`grep -aEiv "^#|^Hugo_Symbol|^Chromosome|^Tumor_Sample_Barcode" '$input_maf' | cut -f $tn_idx | sort -u`;
|
|
96
|
+
|
|
97
|
+
# Quit if any of the TN barcodes is missing, or contains characters not allowed in Unix filenames
|
|
98
|
+
map{ ( !m/^\s*$|^#|\0|\// ) or die "ERROR: Invalid Tumor_Sample_Barcode in MAF: \"$_\"\n"} @tn_pair;
|
|
99
|
+
next; # Code below is only for lines with variants
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
# Print an error if we got to this point without parsing a header line
|
|
103
|
+
( %col_idx ) or die "ERROR: Couldn't find a header line (must start with Hugo_Symbol, Chromosome, or Tumor_Sample_Barcode): $input_maf\n";
|
|
104
|
+
|
|
105
|
+
# For each variant in the MAF, parse out the locus for running samtools faidx later
|
|
106
|
+
my ( $chr, $pos, $ref, $filter ) = map{ my $c = lc; ( defined $col_idx{$c} ? $cols[$col_idx{$c}] : "" )} qw( Chromosome Start_Position Reference_Allele FILTER );
|
|
107
|
+
$ref =~ s/^(\?|-|0)+$//; # Blank out the dashes (or other weird chars) used with indels
|
|
108
|
+
my $region = "$chr:" . ( $pos - 1 ) . "-" . ( $pos + length( $ref ));
|
|
109
|
+
$uniq_regions{$region} = 1;
|
|
110
|
+
# Also track the unique FILTER tags seen, so we can construct VCF header lines for each
|
|
111
|
+
map{ $filter_tags{$_} = 1 unless( $_ eq "PASS" or $_ eq "." )} split( /,|;/, $filter );
|
|
112
|
+
}
|
|
113
|
+
$maf_fh->close;
|
|
114
|
+
|
|
115
|
+
# samtools runs faster when passed many loci at a time, but limited to around 125k args at least on
|
|
116
|
+
# CentOS6. If there are too many loci, let's split them into smaller chunks and run separately
|
|
117
|
+
my ( @regions_split, $lines );
|
|
118
|
+
my @regions = keys %uniq_regions;
|
|
119
|
+
# https://github.com/mskcc/vcf2maf/issues/234
|
|
120
|
+
push( @regions_split, [ splice( @regions, 0, 1 ) ] ) while @regions;
|
|
121
|
+
print STDOUT "INFO: Fetching flanking sequences for the regions using samtools faidx\n";
|
|
122
|
+
map{ my $loci = join( " ", @{$_} ); $lines .= `'$samtools' faidx '$ref_fasta' $loci` } @regions_split;
|
|
123
|
+
|
|
124
|
+
my $k = 0;
|
|
125
|
+
foreach my $line ( grep( length, split( ">", $lines ))) {
|
|
126
|
+
$k ++;
|
|
127
|
+
if ( $k % 1000 == 0 ) {
|
|
128
|
+
print STDOUT "INFO: Handling line $k\n";
|
|
129
|
+
}
|
|
130
|
+
# Carefully split this FASTA entry, properly chomping newlines for long indels
|
|
131
|
+
my ( $locus, $bps ) = split( "\n", $line, 2 );
|
|
132
|
+
$bps =~ s/\r|\n//g;
|
|
133
|
+
if( $bps ){
|
|
134
|
+
$bps = uc( $bps );
|
|
135
|
+
$flanking_bps{$locus} = $bps;
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
# If flanking_bps is entirely empty, then it's most likely that the user chose the wrong ref-fasta
|
|
140
|
+
( %flanking_bps ) or die "ERROR: Make sure that ref-fasta is the same genome build as your MAF: $ref_fasta\n";
|
|
141
|
+
|
|
142
|
+
# Create VCF header lines for the reference FASTA, its contigs, and their lengths
|
|
143
|
+
my $ref_fai = $ref_fasta . ".fai";
|
|
144
|
+
print STDOUT "INFO: Creating index if not exists for the reference FASTA\n";
|
|
145
|
+
`'$samtools' faidx '$ref_fasta'` unless( -s $ref_fai );
|
|
146
|
+
print STDOUT "INFO: Fetching contig lengths from the reference FASTA\n";
|
|
147
|
+
my @ref_contigs = map { chomp; my ($chr, $len)=split("\t"); "##contig=<ID=$chr,length=$len>\n" } `cut -f1,2 '$ref_fai' | sort -k1,1V`;
|
|
148
|
+
my $ref_header = "##reference=file://$ref_fasta\n" . join( "", @ref_contigs );
|
|
149
|
+
|
|
150
|
+
# Parse through each variant in the MAF, and fill up the respective per-sample VCFs
|
|
151
|
+
$maf_fh = IO::File->new( $input_maf ) or die "ERROR: Couldn't open file: $input_maf\n";
|
|
152
|
+
my %tn_vcf = (); # In-memory cache to speed up writing per-TN pair VCFs
|
|
153
|
+
my $skipped_fh; # If any variants have ref mismatch issues, skip and store them separately
|
|
154
|
+
my ( @var_key, %var_frmt, %var_fltr, %var_id, %var_qual ); # Retain variant info for printing later
|
|
155
|
+
my %vcf_col_idx = (); # Tracks the position of genotype columns for each sample
|
|
156
|
+
my $line_count = 0;
|
|
157
|
+
|
|
158
|
+
my $j = 0;
|
|
159
|
+
print STDOUT "INFO: Converting each line from maf file\n";
|
|
160
|
+
while( my $line = $maf_fh->getline ) {
|
|
161
|
+
$j ++;
|
|
162
|
+
if ( $j % 1000 == 0 ) {
|
|
163
|
+
print STDOUT "INFO: Converting line $j\n";
|
|
164
|
+
}
|
|
165
|
+
# Skip comment lines
|
|
166
|
+
next if( $line =~ m/^#/ );
|
|
167
|
+
|
|
168
|
+
# Instead of a chomp, do a thorough removal of carriage returns, line feeds, and prefixed/suffixed whitespace
|
|
169
|
+
my @cols = map{s/^\s+|\s+$|\r|\n//g; $_} split( /\t/, $line );
|
|
170
|
+
|
|
171
|
+
if( $line =~ m/^(Hugo_Symbol|Chromosome|Tumor_Sample_Barcode)/i ) {
|
|
172
|
+
|
|
173
|
+
unless( -e $output_dir ) { mkdir $output_dir or die "ERROR: Couldn't create directory $output_dir! $!"; }
|
|
174
|
+
|
|
175
|
+
# Create a T-N pairing TSV file, since it's lost in translation to multi-sample VCF
|
|
176
|
+
my $tsv_file = "$output_dir/" . substr( $input_maf, rindex( $input_maf, '/' ) + 1 );
|
|
177
|
+
$tsv_file =~ s/(\.)?(maf|tsv|txt)?$/.pairs.tsv/;
|
|
178
|
+
my $tsv_fh = IO::File->new( $tsv_file, ">" ) or die "ERROR: Failed to create file $tsv_file\n";
|
|
179
|
+
$tsv_fh->print( "#Tumor_Sample_Barcode\tMatched_Norm_Sample_Barcode\n" );
|
|
180
|
+
|
|
181
|
+
# For each TN-pair in the MAF, initialize a blank VCF with proper VCF headers in output directory
|
|
182
|
+
my $idx = 0;
|
|
183
|
+
foreach my $pair ( @tn_pair ) {
|
|
184
|
+
my ( $t_id, $n_id ) = split( /\t/, $pair );
|
|
185
|
+
$n_id = "NORMAL" unless( defined $n_id ); # Use a placeholder name for normal if its undefined
|
|
186
|
+
if( $per_tn_vcfs ) {
|
|
187
|
+
my $vcf_file = "$output_dir/$t_id\_vs_$n_id.vcf";
|
|
188
|
+
$tn_vcf{$vcf_file} .= "##fileformat=VCFv4.2\n";
|
|
189
|
+
$tn_vcf{$vcf_file} .= $ref_header;
|
|
190
|
+
$tn_vcf{$vcf_file} .= "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
|
|
191
|
+
$tn_vcf{$vcf_file} .= "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths of REF and ALT(s) in the order listed\">\n";
|
|
192
|
+
$tn_vcf{$vcf_file} .= "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Total read depth across this site\">\n";
|
|
193
|
+
$tn_vcf{$vcf_file} .= "##INFO=<ID=VC,Number=1,Type=String,Description=\"Variant_Classification\">\n";
|
|
194
|
+
$tn_vcf{$vcf_file} .= "##INFO=<ID=VT,Number=1,Type=String,Description=\"Variant_Type\">\n";
|
|
195
|
+
$tn_vcf{$vcf_file} .= "##FILTER=<ID=$_,Description=\"$_\">\n" foreach ( sort keys %filter_tags );
|
|
196
|
+
$tn_vcf{$vcf_file} .= "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n";
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
# Set genotype column indexes for the multi-sample VCF, and keep the pairing info
|
|
200
|
+
$vcf_col_idx{ $t_id } = $idx++ if ( !exists $vcf_col_idx{ $t_id } );
|
|
201
|
+
$vcf_col_idx{ $n_id } = $idx++ if ( !exists $vcf_col_idx{ $n_id } );
|
|
202
|
+
$tsv_fh->print( "$t_id\t$n_id\n" );
|
|
203
|
+
}
|
|
204
|
+
$tsv_fh->close;
|
|
205
|
+
next;
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
# For each variant in the MAF, parse out data that can go into the output VCF
|
|
209
|
+
my ( $chr, $pos, $ref, $al1, $al2, $t_id, $n_id, $n_al1, $n_al2, $id, $qual, $filter, $vc, $vt ) = map{ my $c = lc; ( defined $col_idx{$c} ? $cols[$col_idx{$c}] : "" )} qw( Chromosome Start_Position Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 variant_id variant_qual FILTER Variant_Classification Variant_Type );
|
|
210
|
+
$filter =~ s/,/;/g;
|
|
211
|
+
++$line_count;
|
|
212
|
+
|
|
213
|
+
# Handle a situation in Oncotator MAFs where alleles of DNPs are pipe "|" delimited:
|
|
214
|
+
( $ref, $al1, $al2, $n_al1, $n_al2 ) = map{s/\|//g; $_} ( $ref, $al1, $al2, $n_al1, $n_al2 );
|
|
215
|
+
|
|
216
|
+
# Make sure that our minimum required columns contain proper data
|
|
217
|
+
foreach my $col ( qw( Chromosome Start_Position Reference_Allele Tumor_Sample_Barcode )) { my $val = $cols[ $col_idx{ lc( $col ) } ]; ( defined $val and $val !~ m/^\s*$/ ) or die "ERROR: $col is empty in MAF line $line_count!\n"; } # Test each column's value on this line, not the column name itself
|
|
218
|
+
(( defined $al1 and $al1 !~ m/^\s*$/ ) or ( defined $al2 and $al2 !~ m/^\s*$/ )) or die "ERROR: At least one of the Tumor_Seq_Allele columns must be non-empty in MAF line $line_count!\n"; # Test the allele values; %col_idx only holds column positions
|
|
219
|
+
|
|
220
|
+
# Parse out read counts for ref/var alleles, if available
|
|
221
|
+
my ( $t_dp, $t_rad, $t_vad, $n_dp, $n_rad, $n_vad ) = map{ my $c = lc; (( defined $col_idx{$c} and defined $cols[$col_idx{$c}] and $cols[$col_idx{$c}] =~ m/^\d+/ ) ? sprintf( "%.0f", $cols[$col_idx{$c}] ) : '.' )} ( $tum_depth_col, $tum_rad_col, $tum_vad_col, $nrm_depth_col, $nrm_rad_col, $nrm_vad_col );
|
|
222
|
+
|
|
223
|
+
# Normal sample ID could be undefined for legit reasons, but we need a placeholder name
|
|
224
|
+
$n_id = "NORMAL" if( !defined $n_id or $n_id eq "" );
|
|
225
|
+
|
|
226
|
+
# If VCF ID, QUAL, or FILTER are undefined or empty, set them to "." per proper VCF specs
|
|
227
|
+
$id = "." if( !defined $id or $id eq "" );
|
|
228
|
+
$qual = "." if( !defined $qual or $qual eq "" );
|
|
229
|
+
$filter = "." if( !defined $filter or $filter eq "" );
|
|
230
|
+
|
|
231
|
+
# Make sure we have at least one variant allele. If not, die with an error
|
|
232
|
+
if( $al1 eq "" and $al2 eq "" ) {
|
|
233
|
+
die "ERROR: MAF line $line_count has no variant allele specified at $chr:$pos!\n";
|
|
234
|
+
}
|
|
235
|
+
# If one of the variant alleles is unset, assume that it's the same as the reference allele
|
|
236
|
+
$al1 = $ref if( $al1 eq "" );
|
|
237
|
+
$al2 = $ref if( $al2 eq "" );
|
|
238
|
+
|
|
239
|
+
# When variant alleles are a SNP and a "-", warn user of misusing "-" to denote REF
|
|
240
|
+
if( $al1 ne $ref and $al2 ne $ref and $al1 ne $al2 and ( $al1 eq "-" or $al2 eq "-" ) and
|
|
241
|
+
length( $al1 ) == 1 and length( $al2 ) == 1 and length( $ref ) == 1 ) {
|
|
242
|
+
$al1 = $ref if( $al1 eq "-" );
|
|
243
|
+
$al2 = $ref if( $al2 eq "-" );
|
|
244
|
+
warn "WARNING: Replacing '-' with reference allele in: $line";
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
# Blank out the dashes (or other weird chars) used with indels
|
|
248
|
+
( $ref, $al1, $al2, $n_al1, $n_al2 ) = map{s/^(\?|-|0)+$//; $_} ( $ref, $al1, $al2, $n_al1, $n_al2 );
|
|
249
|
+
|
|
250
|
+
# If normal alleles are unset in the MAF (quite common), assume homozygous reference
|
|
251
|
+
$n_al1 = $ref if( $n_al1 eq "" );
|
|
252
|
+
$n_al2 = $ref if( $n_al2 eq "" );
|
|
253
|
+
|
|
254
|
+
# Do a sanity check on all the alleles
|
|
255
|
+
unless( $al1=~m/^[ACGT-]*$/ and $al2=~m/^[ACGT-]*$/ and $n_al1=~m/^[ACGT-]*$/ and $n_al2=~m/^[ACGT-]*$/ ) {
|
|
256
|
+
die "ERROR: MAF line $line_count (at $chr:$pos) contains invalid alleles in Tumor_Seq_Allele or Match_Norm_Seq_Allele columns!\n";
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
# To simplify code coming up below, ensure that $al2 is always non-REF
|
|
260
|
+
( $al1, $al2 ) = ( $al2, $al1 ) if( $al2 eq $ref );
|
|
261
|
+
# Do the same for the normal alleles, though it makes no difference if both are REF
|
|
262
|
+
( $n_al1, $n_al2 ) = ( $n_al2, $n_al1 ) if( $n_al2 eq $ref );
|
|
263
|
+
|
|
264
|
+
# Except for MAF-format simple insertions, check ref alleles, and skip lines that mismatch
|
|
265
|
+
my $locus = "$chr:" . ( $pos - 1 ) . "-" . ( $pos + length( $ref ));
|
|
266
|
+
if( $ref ne "" or !defined $flanking_bps{$locus} ) {
|
|
267
|
+
my $ref_from_fasta = ( defined $flanking_bps{$locus} ? substr( $flanking_bps{$locus}, 1, -1 ) : "" );
|
|
268
|
+
if( $ref ne $ref_from_fasta or !defined $flanking_bps{$locus} ) {
|
|
269
|
+
# Create the file for skipped variants, if it wasn't already
|
|
270
|
+
unless( $skipped_fh ) {
|
|
271
|
+
my $skip_file = "$output_dir/" . substr( $input_maf, rindex( $input_maf, '/' ) + 1 );
|
|
272
|
+
$skip_file =~ s/(\.)?(maf|tsv|txt)?$/.skipped.tsv/;
|
|
273
|
+
$skipped_fh = IO::File->new( $skip_file, ">" ) or die "ERROR: Failed to create file $skip_file\n";
|
|
274
|
+
warn "WARNING: Reference allele mismatches found. Storing them here for debugging: $skip_file\n";
|
|
275
|
+
$skipped_fh->print( $header_line );
|
|
276
|
+
}
|
|
277
|
+
$skipped_fh->print( $line );
|
|
278
|
+
next;
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
# To represent indels in VCF format, we need the preceding bp in the reference FASTA
|
|
283
|
+
my ( $ref_len, $al1_len, $al2_len ) = map{ length( $_ ) } ( $ref, $al1, $al2 );
|
|
284
|
+
if( $ref_len == 0 or $al1_len == 0 or $al2_len == 0 or ( $ref_len ne $al2_len and substr( $ref, 0, 1 ) ne substr( $al2, 0, 1 ))) {
|
|
285
|
+
my $prefix_bp = substr( $flanking_bps{$locus}, 0, 1 );
|
|
286
|
+
# For MAF-format simple insertions, $pos is already the locus of the preceding bp
|
|
287
|
+
$prefix_bp = substr( $flanking_bps{$locus}, 1, 1 ) if( $ref eq "" );
|
|
288
|
+
# If this is not a MAF-format simple insertion, decrement $pos
|
|
289
|
+
--$pos unless( $ref eq "" );
|
|
290
|
+
# Prefix the fetched reference bp to all the alleles
|
|
291
|
+
( $ref, $al1, $al2, $n_al1, $n_al2 ) = map{$prefix_bp.$_} ( $ref, $al1, $al2, $n_al1, $n_al2 );
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
# Fill an array with all unique REF/ALT alleles, and set their 0-based indexes like in a VCF
|
|
295
|
+
# Notice how we ensure that $alleles[0] is REF and $alleles[1] is the major ALT allele in tumor
|
|
296
|
+
my ( @alleles, %al_idx );
|
|
297
|
+
my $idx = 0;
|
|
298
|
+
foreach my $al ( $ref, $al2, $al1, $n_al2, $n_al1 ) {
|
|
299
|
+
unless( defined $al_idx{$al} ) {
|
|
300
|
+
push( @alleles, $al );
|
|
301
|
+
$al_idx{$al} = $idx++;
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
# Set tumor and normal genotypes (FORMAT tag GT in VCF)
|
|
306
|
+
my ( $t_gt, $n_gt ) = ( "0/1", "0/0" ); # Set defaults
|
|
307
|
+
$t_gt = join( "/", $al_idx{$al2}, $al_idx{$al1} ) if( $al_idx{$al1} ne "0" );
|
|
308
|
+
$n_gt = join( "/", $al_idx{$n_al2}, $al_idx{$n_al1} ) if( $al_idx{$n_al1} ne "0" );
|
|
309
|
+
$n_gt = join( "/", $al_idx{$n_al1}, $al_idx{$n_al2} ) if( $al_idx{$n_al2} ne "0" );
|
|
310
|
+
|
|
311
|
+
# Create the VCF's comma-delimited ALT field that must list all non-REF (variant) alleles
|
|
312
|
+
my $alt = join( ",", @alleles[1..$#alleles] );
|
|
313
|
+
|
|
314
|
+
# If there are >1 variant alleles, assume that depths in $t_vad and $n_vad are for $al2
|
|
315
|
+
if( scalar( @alleles ) > 2 ) {
|
|
316
|
+
$t_vad = join( ",", $t_vad, map{"."}@alleles[2..$#alleles] );
|
|
317
|
+
$n_vad = join( ",", $n_vad, map{"."}@alleles[2..$#alleles] );
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
# Construct genotype fields for FORMAT tags GT:AD:DP
|
|
321
|
+
my $t_fmt = "$t_gt:$t_rad,$t_vad:$t_dp";
|
|
322
|
+
my $n_fmt = "$n_gt:$n_rad,$n_vad:$n_dp";
|
|
323
|
+
|
|
324
|
+
# Construct a VCF formatted line and append it to the respective VCF
|
|
325
|
+
if( $per_tn_vcfs ) {
|
|
326
|
+
my $vcf_file = "$output_dir/$t_id\_vs_$n_id.vcf";
|
|
327
|
+
my $vcf_line = join( "\t", $chr, $pos, $id, $ref, $alt, $qual, $filter, "VC=$vc;VT=$vt", "GT:AD:DP", $t_fmt, $n_fmt ); # INFO entries are ';'-delimited, matching the multi-sample VCF writer
|
|
328
|
+
$tn_vcf{$vcf_file} .= "$vcf_line\n";
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
# Store VCF formatted data for the multi-sample VCF
|
|
332
|
+
my $key = join( "\t", $chr, $pos, $ref, $alt, $vc, $vt );
|
|
333
|
+
push( @var_key, $key ) unless( exists $var_frmt{ $key } );
|
|
334
|
+
$var_frmt{ $key }{ $vcf_col_idx{ $t_id }} = $t_fmt;
|
|
335
|
+
$var_frmt{ $key }{ $vcf_col_idx{ $n_id }} = $n_fmt;
|
|
336
|
+
# ::NOTE:: Samples shouldn't have different ID, QUAL, or FILTERs for the same loci+alleles
|
|
337
|
+
$var_fltr{ $key } = $filter;
|
|
338
|
+
$var_id{ $key } = $id;
|
|
339
|
+
$var_qual{ $key } = $qual;
|
|
340
|
+
}
|
|
341
|
+
$maf_fh->close;
|
|
342
|
+
$skipped_fh->close if( $skipped_fh );
|
|
343
|
+
|
|
344
|
+
# Write the cached contents of per-TN VCFs into files
|
|
345
|
+
if( $per_tn_vcfs ) {
|
|
346
|
+
print STDOUT "INFO: Writing per-TN VCFs to output directory\n";
|
|
347
|
+
foreach my $vcf_file ( keys %tn_vcf ) {
|
|
348
|
+
my $tn_vcf_fh = IO::File->new( $vcf_file, ">" ) or die "ERROR: Failed to create file $vcf_file\n";
|
|
349
|
+
$tn_vcf_fh->print( $tn_vcf{$vcf_file} );
|
|
350
|
+
$tn_vcf_fh->close;
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
# Initialize header lines for the multi-sample VCF
|
|
355
|
+
my @vcf_cols = sort { $vcf_col_idx{$a} <=> $vcf_col_idx{$b} } keys %vcf_col_idx;
|
|
356
|
+
my $vcf_fh = IO::File->new( $output_vcf, ">" ) or die "ERROR: Fail to create file $output_vcf\n";
|
|
357
|
+
$vcf_fh->print( "##fileformat=VCFv4.2\n" );
|
|
358
|
+
$vcf_fh->print( $ref_header );
|
|
359
|
+
$vcf_fh->print( "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n" );
|
|
360
|
+
$vcf_fh->print( "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic Depths of REF and ALT(s) in the order listed\">\n" );
|
|
361
|
+
$vcf_fh->print( "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n" );
|
|
362
|
+
$vcf_fh->print( "##INFO=<ID=VC,Number=1,Type=String,Description=\"Variant_Classification\">\n" );
|
|
363
|
+
$vcf_fh->print( "##INFO=<ID=VT,Number=1,Type=String,Description=\"Variant_Type\">\n" );
|
|
364
|
+
$vcf_fh->print( "##FILTER=<ID=$_,Description=\"$_\">\n" ) foreach ( sort keys %filter_tags );
|
|
365
|
+
$vcf_fh->print( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" . join("\t", @vcf_cols) . "\n" );
|
|
366
|
+
|
|
367
|
+
# Write each variant into the multi-sample VCF
|
|
368
|
+
print STDOUT "INFO: Writing multi-sample VCF\n";
|
|
369
|
+
foreach my $key ( @var_key ) {
|
|
370
|
+
my ( $chr, $pos, $ref, $alt, $vc, $vt ) = split( "\t", $key );
|
|
371
|
+
$vcf_fh->print( join( "\t", $chr, $pos, $var_id{ $key }, $ref, $alt, $var_qual{ $key }, $var_fltr{ $key }, "VC=$vc;VT=$vt", "GT:AD:DP" ));
|
|
372
|
+
map{ $vcf_fh->print( "\t" . (( exists $var_frmt{$key}{$_} ) ? $var_frmt{$key}{$_} : './.:.:.' ))}( 0..$#vcf_cols );
|
|
373
|
+
$vcf_fh->print( "\n" );
|
|
374
|
+
}
|
|
375
|
+
$vcf_fh->close;
|
|
376
|
+
|
|
377
|
+
# Make sure that we handled a positive non-zero number of lines in the MAF
|
|
378
|
+
( $line_count > 0 ) or die "ERROR: No variant lines in the input MAF!\n";
|
|
379
|
+
|
|
380
|
+
__DATA__
|
|
381
|
+
|
|
382
|
+
=head1 NAME
|
|
383
|
+
|
|
384
|
+
maf2vcf.pl - Reformat variants in a MAF into a multisample VCF with GT:AD:DP data if available
|
|
385
|
+
|
|
386
|
+
=head1 SYNOPSIS
|
|
387
|
+
|
|
388
|
+
perl maf2vcf.pl --help
|
|
389
|
+
perl maf2vcf.pl --input-maf test.maf --output-dir vcfs
|
|
390
|
+
|
|
391
|
+
=head1 OPTIONS
|
|
392
|
+
|
|
393
|
+
--input-maf Path to input file in MAF format
|
|
394
|
+
--output-dir Path to output directory where VCFs will be stored, one per TN-pair
|
|
395
|
+
--output-vcf Path to output multi-sample VCF containing all TN-pairs [<output-dir>/<input-maf-name>.vcf]
|
|
396
|
+
--ref-fasta Path to reference Fasta file [~/.vep/homo_sapiens/102_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa.gz]
|
|
397
|
+
--samtools Path to samtools binary [samtools]
|
|
398
|
+
--per-tn-vcfs Specify this to generate VCFs per-TN pair, in addition to the multi-sample VCF
|
|
399
|
+
--tum-depth-col Name of MAF column for read depth in tumor BAM [t_depth]
|
|
400
|
+
--tum-rad-col Name of MAF column for reference allele depth in tumor BAM [t_ref_count]
|
|
401
|
+
--tum-vad-col Name of MAF column for variant allele depth in tumor BAM [t_alt_count]
|
|
402
|
+
--nrm-depth-col Name of MAF column for read depth in normal BAM [n_depth]
|
|
403
|
+
--nrm-rad-col Name of MAF column for reference allele depth in normal BAM [n_ref_count]
|
|
404
|
+
--nrm-vad-col Name of MAF column for variant allele depth in normal BAM [n_alt_count]
|
|
405
|
+
--help Print a brief help message and quit
|
|
406
|
+
--man Print the detailed manual
|
|
407
|
+
|
|
408
|
+
=head1 DESCRIPTION
|
|
409
|
+
|
|
410
|
+
This script breaks down variants in a MAF into a multi-sample VCF, in preparation for annotation with VEP. Can also create VCFs per-TN pair.
|
|
411
|
+
|
|
412
|
+
=head2 Relevant links:
|
|
413
|
+
|
|
414
|
+
Homepage: https://github.com/ckandoth/vcf2maf
|
|
415
|
+
VCF format: http://samtools.github.io/hts-specs/
|
|
416
|
+
MAF format: https://wiki.nci.nih.gov/x/eJaPAQ
|
|
417
|
+
|
|
418
|
+
=head1 AUTHORS
|
|
419
|
+
|
|
420
|
+
Cyriac Kandoth (ckandoth@gmail.com)
|
|
421
|
+
Qingguo Wang (josephw10000@gmail.com)
|
|
422
|
+
|
|
423
|
+
=head1 LICENSE
|
|
424
|
+
|
|
425
|
+
Apache-2.0 | Apache License, Version 2.0 | https://www.apache.org/licenses/LICENSE-2.0
|
|
426
|
+
|
|
427
|
+
=cut
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from os import path
|
|
2
|
+
import cmdy
|
|
3
|
+
|
|
4
|
+
infile = {{in.infile | quote}} # pyright: ignore
|
|
5
|
+
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
6
|
+
joboutdir = {{job.outdir | quote}} # pyright: ignore
|
|
7
|
+
vcfanno = {{envs.vcfanno | quote}} # pyright: ignore
|
|
8
|
+
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
9
|
+
args = {{envs.args | repr}} # pyright: ignore
|
|
10
|
+
|
|
11
|
+
{% set conf = envs.conffile or in.conffile %}
|
|
12
|
+
{% if conf | isinstance: dict %}
|
|
13
|
+
conffile = path.join(joboutdir, "config.toml")
|
|
14
|
+
conf = {{ conf | toml | quote }}
|
|
15
|
+
with open(conffile, "w") as f:
|
|
16
|
+
f.write(conf)
|
|
17
|
+
{% else %}
|
|
18
|
+
conffile = {{conf | quote}}
|
|
19
|
+
{% endif %}
|
|
20
|
+
|
|
21
|
+
args["p"] = ncores
|
|
22
|
+
args["_"] = [conffile, infile]
|
|
23
|
+
args["_exe"] = vcfanno
|
|
24
|
+
args["_prefix"] = "-"
|
|
25
|
+
|
|
26
|
+
cmdy.vcfanno(**args).r() > outfile
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import re
|
|
2
|
-
|
|
2
|
+
import gzip
|
|
3
3
|
from biopipen.utils.vcf import * # noqa: F401, F403
|
|
4
4
|
|
|
5
5
|
|
|
@@ -63,7 +63,8 @@ def fix_vcffile(vcffile, outfile, fixes):
|
|
|
63
63
|
else:
|
|
64
64
|
modify_fixes.append(fix)
|
|
65
65
|
|
|
66
|
-
|
|
66
|
+
inopen = gzip.open if vcffile.endswith(".gz") else open
|
|
67
|
+
with inopen(vcffile, "rt") as fin, open(outfile, "w") as fout:
|
|
67
68
|
for line in fin:
|
|
68
69
|
obj = line_to_obj(line)
|
|
69
70
|
out = handle_obj(obj, modify_fixes)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: biopipen
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: Bioinformatics processes/pipelines that can be run from `pipen run`
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: pwwang
|
|
@@ -12,12 +12,11 @@ Classifier: Programming Language :: Python :: 3.8
|
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.9
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.10
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
-
Provides-Extra: test
|
|
16
15
|
Requires-Dist: cmdy (>=0.5,<0.6)
|
|
17
16
|
Requires-Dist: datar[pandas] (>=0.11,<0.12)
|
|
18
|
-
Requires-Dist: pipen (>=0.
|
|
19
|
-
Requires-Dist: pipen-args (>=0.
|
|
20
|
-
Requires-Dist: pipen-cli-run (>=0.
|
|
21
|
-
Requires-Dist: pipen-filters (>=0.
|
|
22
|
-
Requires-Dist: pipen-report (>=0.
|
|
23
|
-
Requires-Dist: pipen-verbose (>=0.
|
|
17
|
+
Requires-Dist: pipen (>=0.5,<0.6)
|
|
18
|
+
Requires-Dist: pipen-args (>=0.6,<0.7)
|
|
19
|
+
Requires-Dist: pipen-cli-run (>=0.5,<0.6)
|
|
20
|
+
Requires-Dist: pipen-filters (>=0.4,<0.5)
|
|
21
|
+
Requires-Dist: pipen-report (>=0.6,<0.7)
|
|
22
|
+
Requires-Dist: pipen-verbose (>=0.3,<0.4)
|