biopipen 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (65) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.py +0 -5
  3. biopipen/core/config.toml +4 -4
  4. biopipen/core/defaults.py +3 -3
  5. biopipen/core/filters.py +1 -0
  6. biopipen/core/proc.py +1 -3
  7. biopipen/core/testing.py +1 -2
  8. biopipen/ns/bam.py +10 -14
  9. biopipen/ns/bcftools.py +37 -7
  10. biopipen/ns/bed.py +9 -16
  11. biopipen/ns/cnv.py +8 -11
  12. biopipen/ns/cnvkit.py +32 -59
  13. biopipen/ns/cnvkit_pipeline.py +266 -310
  14. biopipen/ns/csv.py +0 -2
  15. biopipen/ns/gene.py +0 -1
  16. biopipen/ns/gsea.py +4 -10
  17. biopipen/ns/misc.py +0 -5
  18. biopipen/ns/plot.py +2 -4
  19. biopipen/ns/rnaseq.py +0 -1
  20. biopipen/ns/scrna.py +78 -120
  21. biopipen/ns/scrna_metabolic_landscape.py +306 -348
  22. biopipen/ns/tcgamaf.py +52 -0
  23. biopipen/ns/tcr.py +5 -15
  24. biopipen/ns/vcf.py +52 -34
  25. biopipen/ns/web.py +8 -19
  26. biopipen/reports/bam/CNAClinic.svelte +1 -1
  27. biopipen/reports/bam/CNVpytor.svelte +2 -2
  28. biopipen/reports/bam/ControlFREEC.svelte +1 -1
  29. biopipen/reports/cnv/AneuploidyScore.svelte +2 -2
  30. biopipen/reports/cnv/AneuploidyScoreSummary.svelte +1 -1
  31. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  32. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  33. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  34. biopipen/reports/gsea/FGSEA.svelte +1 -1
  35. biopipen/reports/gsea/GSEA.svelte +2 -2
  36. biopipen/reports/scrna/CellsDistribution.svelte +1 -1
  37. biopipen/reports/scrna/DimPlots.svelte +1 -1
  38. biopipen/reports/scrna/GeneExpressionInvistigation.svelte +1 -1
  39. biopipen/reports/scrna/MarkersFinder.svelte +42 -39
  40. biopipen/reports/scrna/ScFGSEA.svelte +3 -3
  41. biopipen/reports/scrna/SeuratClusterStats.svelte +3 -3
  42. biopipen/reports/scrna/SeuratPreparing.svelte +2 -2
  43. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubsets.svelte +2 -2
  44. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +1 -1
  45. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +1 -1
  46. biopipen/reports/tcr/CloneResidency.svelte +4 -4
  47. biopipen/reports/tcr/Immunarch.svelte +2 -2
  48. biopipen/reports/tcr/SampleDiversity.svelte +2 -2
  49. biopipen/reports/tcr/TCRClusteringStats.svelte +3 -3
  50. biopipen/reports/tcr/VJUsage.svelte +1 -1
  51. biopipen/reports/utils/gsea.liq +1 -1
  52. biopipen/reports/utils/misc.liq +1 -1
  53. biopipen/reports/vcf/TruvariBenchSummary.svelte +1 -1
  54. biopipen/reports/vcf/TruvariConsistency.svelte +3 -3
  55. biopipen/scripts/bcftools/BcftoolsSort.py +19 -0
  56. biopipen/scripts/scrna/MarkersFinder.R +73 -35
  57. biopipen/scripts/tcgamaf/Maf2Vcf.py +22 -0
  58. biopipen/scripts/tcgamaf/MafAddChr.py +14 -0
  59. biopipen/scripts/tcgamaf/maf2vcf.pl +427 -0
  60. biopipen/scripts/vcf/VcfAnno.py +26 -0
  61. biopipen/scripts/vcf/VcfFix_utils.py +3 -2
  62. {biopipen-0.7.0.dist-info → biopipen-0.8.0.dist-info}/METADATA +7 -8
  63. {biopipen-0.7.0.dist-info → biopipen-0.8.0.dist-info}/RECORD +65 -59
  64. {biopipen-0.7.0.dist-info → biopipen-0.8.0.dist-info}/WHEEL +1 -1
  65. {biopipen-0.7.0.dist-info → biopipen-0.8.0.dist-info}/entry_points.txt +2 -1
@@ -0,0 +1,427 @@
1
+ #!/usr/bin/env perl
2
+
3
+ # maf2vcf - Reformat variants in a given MAF into generic VCFs with GT:AD:DP data if available
4
+
5
+ # Modified version of the original script by @mskcc, which can be found here:
6
+ # https://github.com/mskcc/vcf2maf
7
+ # This is modified to:
8
+ # - Add path to samtools to arguments
9
+ # - Add Variant_Classification and Variant_Type to INFO field
10
+ # - Fix https://github.com/mskcc/vcf2maf/issues/234
11
+ # - Add progress logs
12
+
13
+ use strict;
14
+ use warnings;
15
+ use IO::File;
16
+ use Getopt::Long qw( GetOptions );
17
+ use Pod::Usage qw( pod2usage );
18
+
19
+ # Set any default paths and constants
20
+ my $ref_fasta = "$ENV{HOME}/.vep/homo_sapiens/102_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa.gz";
21
+ my ( $tum_depth_col, $tum_rad_col, $tum_vad_col ) = qw( t_depth t_ref_count t_alt_count );
22
+ my ( $nrm_depth_col, $nrm_rad_col, $nrm_vad_col ) = qw( n_depth n_ref_count n_alt_count );
23
+
24
+ # Find out if samtools is properly installed, and warn the user if it's not
25
+ my ( $samtools ) = map{chomp; $_}`which samtools`;
26
+ # ( $samtools and -e $samtools ) or die "ERROR: Please install samtools, and make sure it's in your PATH\n";
27
+
28
+ # Check for missing or crappy arguments
29
+ unless( @ARGV and $ARGV[0]=~m/^-/ ) {
30
+ pod2usage( -verbose => 0, -message => "$0: Missing or invalid arguments!\n", -exitval => 2 );
31
+ }
32
+
33
+ # Parse options and print usage syntax on a syntax error, or if help was explicitly requested
34
+ my ( $man, $help, $per_tn_vcfs ) = ( 0, 0, 0 );
35
+ my ( $input_maf, $output_dir, $output_vcf );
36
+ GetOptions(
37
+ 'help!' => \$help,
38
+ 'man!' => \$man,
39
+ 'samtools=s' => \$samtools,
40
+ 'input-maf=s' => \$input_maf,
41
+ 'output-dir=s' => \$output_dir,
42
+ 'output-vcf=s' => \$output_vcf,
43
+ 'ref-fasta=s' => \$ref_fasta,
44
+ 'per-tn-vcfs!' => \$per_tn_vcfs,
45
+ 'tum-depth-col=s' => \$tum_depth_col,
46
+ 'tum-rad-col=s' => \$tum_rad_col,
47
+ 'tum-vad-col=s' => \$tum_vad_col,
48
+ 'nrm-depth-col=s' => \$nrm_depth_col,
49
+ 'nrm-rad-col=s' => \$nrm_rad_col,
50
+ 'nrm-vad-col=s' => \$nrm_vad_col
51
+ ) or pod2usage( -verbose => 1, -input => \*DATA, -exitval => 2 );
52
+ pod2usage( -verbose => 1, -input => \*DATA, -exitval => 0 ) if( $help );
53
+ pod2usage( -verbose => 2, -input => \*DATA, -exitval => 0 ) if( $man );
54
+
55
+ # Check if required arguments are missing or problematic, fix as needed
56
+ ( defined $input_maf and defined $output_dir ) or die "ERROR: --input-maf and --output-dir must be defined!\n";
57
+ ( -s $ref_fasta ) or die "ERROR: Provided Reference FASTA is missing or empty! Path: $ref_fasta\n";
58
+ unless( defined $output_vcf ) {
59
+ $output_vcf = "$output_dir/" . substr( $input_maf, rindex( $input_maf, '/' ) + 1 );
60
+ $output_vcf =~ s/(\.)?(maf|tsv|txt)?$/.vcf/;
61
+ }
62
+
63
+ # Before anything, let's parse the headers of this supposed "MAF-like" file and do some checks
64
+ my $maf_fh = IO::File->new( $input_maf ) or die "ERROR: Couldn't open input MAF: $input_maf!\n";
65
+ my ( %uniq_regions, %filter_tags, %flanking_bps, @tn_pair, %col_idx, $header_line );
66
+ my $i = 0;
67
+ print STDOUT "INFO: Parsing maf file to gather information for vcf header\n";
68
+ while( my $line = $maf_fh->getline ) {
69
+ ++$i;
70
+ if ( $i % 1000 == 0 ) {
71
+ print STDOUT "INFO: Parsing line $i of input maf\n";
72
+ }
73
+ # If the file uses Mac OS 9 newlines, quit with an error
74
+ ( $line !~ m/\r$/ ) or die "ERROR: Your MAF uses CR line breaks, which we can't support. Please use LF or CRLF.\n";
75
+
76
+ # Skip comment lines
77
+ next if( $line =~ m/^#/ );
78
+
79
+ # Instead of a chomp, do a thorough removal of carriage returns, line feeds, and prefixed/suffixed whitespace
80
+ my @cols = map{s/^\s+|\s+$|\r|\n//g; $_} split( /\t/, $line );
81
+
82
+ # Parse the header line to map column names to their indexes
83
+ if( $line =~ m/^(Hugo_Symbol|Chromosome|Tumor_Sample_Barcode)/i ) {
84
+
85
+ # Fetch the column names and do some sanity checks (don't be case-sensitive)
86
+ my $idx = 0;
87
+ $header_line = $line;
88
+ map{ my $c = lc; $col_idx{$c} = $idx; ++$idx; } @cols;
89
+ map{ my $c = lc; ( defined $col_idx{$c} ) or die "ERROR: $_ is a required MAF column!\n" } qw( Chromosome Start_Position Reference_Allele Tumor_Sample_Barcode );
90
+ ( defined $col_idx{tumor_seq_allele1} or defined $col_idx{tumor_seq_allele2} ) or die "ERROR: At least one MAF column for Tumor_Seq_Allele must be defined!\n";
91
+
92
+ # Fetch all tumor-normal paired IDs from the MAF, doing some whitespace cleanup in the same step
93
+ my $tn_idx = $col_idx{tumor_sample_barcode} + 1;
94
+ $tn_idx .= ( "," . ( $col_idx{matched_norm_sample_barcode} + 1 )) if( defined $col_idx{matched_norm_sample_barcode} );
95
+ @tn_pair = map{s/^\s+|\s+$|\r|\n//g; s/\s*\t\s*/\t/; $_}`grep -aEiv "^#|^Hugo_Symbol|^Chromosome|^Tumor_Sample_Barcode" '$input_maf' | cut -f $tn_idx | sort -u`;
96
+
97
+ # Quit if any of the TN barcodes is missing, or contains characters not allowed in Unix filenames
98
+ map{ ( !m/^\s*$|^#|\0|\// ) or die "ERROR: Invalid Tumor_Sample_Barcode in MAF: \"$_\"\n"} @tn_pair;
99
+ next; # Code below is only for lines with variants
100
+ }
101
+
102
+ # Print an error if we got to this point without parsing a header line
103
+ ( %col_idx ) or die "ERROR: Couldn't find a header line (must start with Hugo_Symbol, Chromosome, or Tumor_Sample_Barcode): $input_maf\n";
104
+
105
+ # For each variant in the MAF, parse out the locus for running samtools faidx later
106
+ my ( $chr, $pos, $ref, $filter ) = map{ my $c = lc; ( defined $col_idx{$c} ? $cols[$col_idx{$c}] : "" )} qw( Chromosome Start_Position Reference_Allele FILTER );
107
+ $ref =~ s/^(\?|-|0)+$//; # Blank out the dashes (or other weird chars) used with indels
108
+ my $region = "$chr:" . ( $pos - 1 ) . "-" . ( $pos + length( $ref ));
109
+ $uniq_regions{$region} = 1;
110
+ # Also track the unique FILTER tags seen, so we can construct VCF header lines for each
111
+ map{ $filter_tags{$_} = 1 unless( $_ eq "PASS" or $_ eq "." )} split( /,|;/, $filter );
112
+ }
113
+ $maf_fh->close;
114
+
115
+ # samtools runs faster when passed many loci at a time, but limited to around 125k args at least on
116
+ # CentOS6. If there are too many loci, let's split them into smaller chunks and run separately
117
+ my ( @regions_split, $lines );
118
+ my @regions = keys %uniq_regions;
119
+ # Use a chunk size of 1, i.e. one region per samtools faidx call; see https://github.com/mskcc/vcf2maf/issues/234
120
+ push( @regions_split, [ splice( @regions, 0, 1 ) ] ) while @regions;
121
+ print STDOUT "INFO: Fetching flanking sequences for the regions using samtools faidx\n";
122
+ map{ my $loci = join( " ", @{$_} ); $lines .= `'$samtools' faidx '$ref_fasta' $loci` } @regions_split;
123
+
124
+ my $k = 0;
125
+ foreach my $line ( grep( length, split( ">", $lines ))) {
126
+ $k ++;
127
+ if ( $k % 1000 == 0 ) {
128
+ print STDOUT "INFO: Handling line $k\n";
129
+ }
130
+ # Carefully split this FASTA entry, properly chomping newlines for long indels
131
+ my ( $locus, $bps ) = split( "\n", $line, 2 );
132
+ $bps =~ s/\r|\n//g;
133
+ if( $bps ){
134
+ $bps = uc( $bps );
135
+ $flanking_bps{$locus} = $bps;
136
+ }
137
+ }
138
+
139
+ # If flanking_bps is entirely empty, then it's most likely that the user chose the wrong ref-fasta
140
+ ( %flanking_bps ) or die "ERROR: Make sure that ref-fasta is the same genome build as your MAF: $ref_fasta\n";
141
+
142
+ # Create VCF header lines for the reference FASTA, its contigs, and their lengths
143
+ my $ref_fai = $ref_fasta . ".fai";
144
+ print STDOUT "INFO: Creating index if not exists for the reference FASTA\n";
145
+ `'$samtools' faidx '$ref_fasta'` unless( -s $ref_fai );
146
+ print STDOUT "INFO: Fetching contig lengths from the reference FASTA\n";
147
+ my @ref_contigs = map { chomp; my ($chr, $len)=split("\t"); "##contig=<ID=$chr,length=$len>\n" } `cut -f1,2 '$ref_fai' | sort -k1,1V`;
148
+ my $ref_header = "##reference=file://$ref_fasta\n" . join( "", @ref_contigs );
149
+
150
+ # Parse through each variant in the MAF, and fill up the respective per-sample VCFs
151
+ $maf_fh = IO::File->new( $input_maf ) or die "ERROR: Couldn't open file: $input_maf\n";
152
+ my %tn_vcf = (); # In-memory cache to speed up writing per-TN pair VCFs
153
+ my $skipped_fh; # If any variants have ref mismatch issues, skip and store them separately
154
+ my ( @var_key, %var_frmt, %var_fltr, %var_id, %var_qual ); # Retain variant info for printing later
155
+ my %vcf_col_idx = (); # Tracks the position of genotype columns for each sample
156
+ my $line_count = 0;
157
+
158
+ my $j = 0;
159
+ print STDOUT "INFO: Converting each line from maf file\n";
160
+ while( my $line = $maf_fh->getline ) {
161
+ $j ++;
162
+ if ( $j % 1000 == 0 ) {
163
+ print STDOUT "INFO: Converting line $j\n";
164
+ }
165
+ # Skip comment lines
166
+ next if( $line =~ m/^#/ );
167
+
168
+ # Instead of a chomp, do a thorough removal of carriage returns, line feeds, and prefixed/suffixed whitespace
169
+ my @cols = map{s/^\s+|\s+$|\r|\n//g; $_} split( /\t/, $line );
170
+
171
+ if( $line =~ m/^(Hugo_Symbol|Chromosome|Tumor_Sample_Barcode)/i ) {
172
+
173
+ unless( -e $output_dir ) { mkdir $output_dir or die "ERROR: Couldn't create directory $output_dir! $!"; }
174
+
175
+ # Create a T-N pairing TSV file, since it's lost in translation to multi-sample VCF
176
+ my $tsv_file = "$output_dir/" . substr( $input_maf, rindex( $input_maf, '/' ) + 1 );
177
+ $tsv_file =~ s/(\.)?(maf|tsv|txt)?$/.pairs.tsv/;
178
+ my $tsv_fh = IO::File->new( $tsv_file, ">" ) or die "ERROR: Failed to create file $tsv_file\n";
179
+ $tsv_fh->print( "#Tumor_Sample_Barcode\tMatched_Norm_Sample_Barcode\n" );
180
+
181
+ # For each TN-pair in the MAF, initialize a blank VCF with proper VCF headers in output directory
182
+ my $idx = 0;
183
+ foreach my $pair ( @tn_pair ) {
184
+ my ( $t_id, $n_id ) = split( /\t/, $pair );
185
+ $n_id = "NORMAL" unless( defined $n_id ); # Use a placeholder name for normal if its undefined
186
+ if( $per_tn_vcfs ) {
187
+ my $vcf_file = "$output_dir/$t_id\_vs_$n_id.vcf";
188
+ $tn_vcf{$vcf_file} .= "##fileformat=VCFv4.2\n";
189
+ $tn_vcf{$vcf_file} .= $ref_header;
190
+ $tn_vcf{$vcf_file} .= "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
191
+ $tn_vcf{$vcf_file} .= "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths of REF and ALT(s) in the order listed\">\n";
192
+ $tn_vcf{$vcf_file} .= "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Total read depth across this site\">\n";
193
+ $tn_vcf{$vcf_file} .= "##INFO=<ID=VC,Number=1,Type=String,Description=\"Variant_Classification\">\n";
194
+ $tn_vcf{$vcf_file} .= "##INFO=<ID=VT,Number=1,Type=String,Description=\"Variant_Type\">\n";
195
+ $tn_vcf{$vcf_file} .= "##FILTER=<ID=$_,Description=\"$_\">\n" foreach ( sort keys %filter_tags );
196
+ $tn_vcf{$vcf_file} .= "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n";
197
+ }
198
+
199
+ # Set genotype column indexes for the multi-sample VCF, and keep the pairing info
200
+ $vcf_col_idx{ $t_id } = $idx++ if ( !exists $vcf_col_idx{ $t_id } );
201
+ $vcf_col_idx{ $n_id } = $idx++ if ( !exists $vcf_col_idx{ $n_id } );
202
+ $tsv_fh->print( "$t_id\t$n_id\n" );
203
+ }
204
+ $tsv_fh->close;
205
+ next;
206
+ }
207
+
208
+ # For each variant in the MAF, parse out data that can go into the output VCF
209
+ my ( $chr, $pos, $ref, $al1, $al2, $t_id, $n_id, $n_al1, $n_al2, $id, $qual, $filter, $vc, $vt ) = map{ my $c = lc; ( defined $col_idx{$c} ? $cols[$col_idx{$c}] : "" )} qw( Chromosome Start_Position Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 variant_id variant_qual FILTER Variant_Classification Variant_Type );
210
+ $filter =~ s/,/;/g;
211
+ ++$line_count;
212
+
213
+ # Handle a situation in Oncotator MAFs where alleles of DNPs are pipe "|" delimited:
214
+ ( $ref, $al1, $al2, $n_al1, $n_al2 ) = map{s/\|//g; $_} ( $ref, $al1, $al2, $n_al1, $n_al2 );
215
+
216
+ # Make sure that our minimum required columns contain proper data
217
+ map{( !m/^\s*$/ ) or die "ERROR: $_ is empty in MAF line $line_count!\n" } qw( Chromosome Start_Position Reference_Allele Tumor_Sample_Barcode );
218
+ (( $col_idx{tumor_seq_allele1} and $col_idx{tumor_seq_allele1} !~ m/^\s*$/ ) or ( $col_idx{tumor_seq_allele2} and $col_idx{tumor_seq_allele2} !~ m/^\s*$/ )) or die "ERROR: At least one of the Tumor_Seq_Allele columns must be non-empty in MAF line $line_count!\n";
219
+
220
+ # Parse out read counts for ref/var alleles, if available
221
+ my ( $t_dp, $t_rad, $t_vad, $n_dp, $n_rad, $n_vad ) = map{ my $c = lc; (( defined $col_idx{$c} and defined $cols[$col_idx{$c}] and $cols[$col_idx{$c}] =~ m/^\d+/ ) ? sprintf( "%.0f", $cols[$col_idx{$c}] ) : '.' )} ( $tum_depth_col, $tum_rad_col, $tum_vad_col, $nrm_depth_col, $nrm_rad_col, $nrm_vad_col );
222
+
223
+ # Normal sample ID could be undefined for legit reasons, but we need a placeholder name
224
+ $n_id = "NORMAL" if( !defined $n_id or $n_id eq "" );
225
+
226
+ # If VCF ID, QUAL, or FILTER are undefined or empty, set them to "." per proper VCF specs
227
+ $id = "." if( !defined $id or $id eq "" );
228
+ $qual = "." if( !defined $qual or $qual eq "" );
229
+ $filter = "." if( !defined $filter or $filter eq "" );
230
+
231
+ # Make sure we have at least one variant allele. If not, die with an error
232
+ if( $al1 eq "" and $al2 eq "" ) {
233
+ die "ERROR: MAF line $line_count has no variant allele specified at $chr:$pos!\n";
234
+ }
235
+ # If one of the variant alleles is unset, assume that it's the same as the reference allele
236
+ $al1 = $ref if( $al1 eq "" );
237
+ $al2 = $ref if( $al2 eq "" );
238
+
239
+ # When variant alleles are a SNP and a "-", warn user of misusing "-" to denote REF
240
+ if( $al1 ne $ref and $al2 ne $ref and $al1 ne $al2 and ( $al1 eq "-" or $al2 eq "-" ) and
241
+ length( $al1 ) == 1 and length( $al2 ) == 1 and length( $ref ) == 1 ) {
242
+ $al1 = $ref if( $al1 eq "-" );
243
+ $al2 = $ref if( $al2 eq "-" );
244
+ warn "WARNING: Replacing '-' with reference allele in: $line";
245
+ }
246
+
247
+ # Blank out the dashes (or other weird chars) used with indels
248
+ ( $ref, $al1, $al2, $n_al1, $n_al2 ) = map{s/^(\?|-|0)+$//; $_} ( $ref, $al1, $al2, $n_al1, $n_al2 );
249
+
250
+ # If normal alleles are unset in the MAF (quite common), assume homozygous reference
251
+ $n_al1 = $ref if( $n_al1 eq "" );
252
+ $n_al2 = $ref if( $n_al2 eq "" );
253
+
254
+ # Do a sanity check on all the alleles
255
+ unless( $al1=~m/^[ACGT-]*$/ and $al2=~m/^[ACGT-]*$/ and $n_al1=~m/^[ACGT-]*$/ and $n_al2=~m/^[ACGT-]*$/ ) {
256
+ die "ERROR: MAF line $line_count (at $chr:$pos) contains invalid alleles in Tumor_Seq_Allele or Match_Norm_Seq_Allele columns!\n";
257
+ }
258
+
259
+ # To simplify code coming up below, ensure that $al2 is always non-REF
260
+ ( $al1, $al2 ) = ( $al2, $al1 ) if( $al2 eq $ref );
261
+ # Do the same for the normal alleles, though it makes no difference if both are REF
262
+ ( $n_al1, $n_al2 ) = ( $n_al2, $n_al1 ) if( $n_al2 eq $ref );
263
+
264
+ # Except for MAF-format simple insertions, check ref alleles, and skip lines that mismatch
265
+ my $locus = "$chr:" . ( $pos - 1 ) . "-" . ( $pos + length( $ref ));
266
+ if( $ref ne "" or !defined $flanking_bps{$locus} ) {
267
+ my $ref_from_fasta = ( defined $flanking_bps{$locus} ? substr( $flanking_bps{$locus}, 1, -1 ) : "" );
268
+ if( $ref ne $ref_from_fasta or !defined $flanking_bps{$locus} ) {
269
+ # Create the file for skipped variants, if it wasn't already
270
+ unless( $skipped_fh ) {
271
+ my $skip_file = "$output_dir/" . substr( $input_maf, rindex( $input_maf, '/' ) + 1 );
272
+ $skip_file =~ s/(\.)?(maf|tsv|txt)?$/.skipped.tsv/;
273
+ $skipped_fh = IO::File->new( $skip_file, ">" ) or die "ERROR: Failed to create file $skip_file\n";
274
+ warn "WARNING: Reference allele mismatches found. Storing them here for debugging: $skip_file\n";
275
+ $skipped_fh->print( $header_line );
276
+ }
277
+ $skipped_fh->print( $line );
278
+ next;
279
+ }
280
+ }
281
+
282
+ # To represent indels in VCF format, we need the preceding bp in the reference FASTA
283
+ my ( $ref_len, $al1_len, $al2_len ) = map{ length( $_ ) } ( $ref, $al1, $al2 );
284
+ if( $ref_len == 0 or $al1_len == 0 or $al2_len == 0 or ( $ref_len ne $al2_len and substr( $ref, 0, 1 ) ne substr( $al2, 0, 1 ))) {
285
+ my $prefix_bp = substr( $flanking_bps{$locus}, 0, 1 );
286
+ # For MAF-format simple insertions, $pos is already the locus of the preceding bp
287
+ $prefix_bp = substr( $flanking_bps{$locus}, 1, 1 ) if( $ref eq "" );
288
+ # If this is not a MAF-format simple insertion, decrement $pos
289
+ --$pos unless( $ref eq "" );
290
+ # Prefix the fetched reference bp to all the alleles
291
+ ( $ref, $al1, $al2, $n_al1, $n_al2 ) = map{$prefix_bp.$_} ( $ref, $al1, $al2, $n_al1, $n_al2 );
292
+ }
293
+
294
+ # Fill an array with all unique REF/ALT alleles, and set their 0-based indexes like in a VCF
295
+ # Notice how we ensure that $alleles[0] is REF and $alleles[1] is the major ALT allele in tumor
296
+ my ( @alleles, %al_idx );
297
+ my $idx = 0;
298
+ foreach my $al ( $ref, $al2, $al1, $n_al2, $n_al1 ) {
299
+ unless( defined $al_idx{$al} ) {
300
+ push( @alleles, $al );
301
+ $al_idx{$al} = $idx++;
302
+ }
303
+ }
304
+
305
+ # Set tumor and normal genotypes (FORMAT tag GT in VCF)
306
+ my ( $t_gt, $n_gt ) = ( "0/1", "0/0" ); # Set defaults
307
+ $t_gt = join( "/", $al_idx{$al2}, $al_idx{$al1} ) if( $al_idx{$al1} ne "0" );
308
+ $n_gt = join( "/", $al_idx{$n_al2}, $al_idx{$n_al1} ) if( $al_idx{$n_al1} ne "0" );
309
+ $n_gt = join( "/", $al_idx{$n_al1}, $al_idx{$n_al2} ) if( $al_idx{$n_al2} ne "0" );
310
+
311
+ # Create the VCF's comma-delimited ALT field that must list all non-REF (variant) alleles
312
+ my $alt = join( ",", @alleles[1..$#alleles] );
313
+
314
+ # If there are >1 variant alleles, assume that depths in $t_vad and $n_vad are for $al2
315
+ if( scalar( @alleles ) > 2 ) {
316
+ $t_vad = join( ",", $t_vad, map{"."}@alleles[2..$#alleles] );
317
+ $n_vad = join( ",", $n_vad, map{"."}@alleles[2..$#alleles] );
318
+ }
319
+
320
+ # Construct genotype fields for FORMAT tags GT:AD:DP
321
+ my $t_fmt = "$t_gt:$t_rad,$t_vad:$t_dp";
322
+ my $n_fmt = "$n_gt:$n_rad,$n_vad:$n_dp";
323
+
324
+ # Construct a VCF formatted line and append it to the respective VCF
325
+ if( $per_tn_vcfs ) {
326
+ my $vcf_file = "$output_dir/$t_id\_vs_$n_id.vcf";
327
+ my $vcf_line = join( "\t", $chr, $pos, $id, $ref, $alt, $qual, $filter, "VC=$vc;VT=$vt", "GT:AD:DP", $t_fmt, $n_fmt ); # INFO entries must be ';'-delimited (':' would fold VT into the VC value); matches the multi-sample VCF
328
+ $tn_vcf{$vcf_file} .= "$vcf_line\n";
329
+ }
330
+
331
+ # Store VCF formatted data for the multi-sample VCF
332
+ my $key = join( "\t", $chr, $pos, $ref, $alt, $vc, $vt );
333
+ push( @var_key, $key ) unless( exists $var_frmt{ $key } );
334
+ $var_frmt{ $key }{ $vcf_col_idx{ $t_id }} = $t_fmt;
335
+ $var_frmt{ $key }{ $vcf_col_idx{ $n_id }} = $n_fmt;
336
+ # ::NOTE:: Samples shouldn't have different ID, QUAL, or FILTERs for the same loci+alleles
337
+ $var_fltr{ $key } = $filter;
338
+ $var_id{ $key } = $id;
339
+ $var_qual{ $key } = $qual;
340
+ }
341
+ $maf_fh->close;
342
+ $skipped_fh->close if( $skipped_fh );
343
+
344
+ # Write the cached contents of per-TN VCFs into files
345
+ if( $per_tn_vcfs ) {
346
+ print STDOUT "INFO: Writing per-TN VCFs to output directory\n";
347
+ foreach my $vcf_file ( keys %tn_vcf ) {
348
+ my $tn_vcf_fh = IO::File->new( $vcf_file, ">" ) or die "ERROR: Failed to create file $vcf_file\n";
349
+ $tn_vcf_fh->print( $tn_vcf{$vcf_file} );
350
+ $tn_vcf_fh->close;
351
+ }
352
+ }
353
+
354
+ # Initialize header lines for the multi-sample VCF
355
+ my @vcf_cols = sort { $vcf_col_idx{$a} <=> $vcf_col_idx{$b} } keys %vcf_col_idx;
356
+ my $vcf_fh = IO::File->new( $output_vcf, ">" ) or die "ERROR: Fail to create file $output_vcf\n";
357
+ $vcf_fh->print( "##fileformat=VCFv4.2\n" );
358
+ $vcf_fh->print( $ref_header );
359
+ $vcf_fh->print( "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n" );
360
+ $vcf_fh->print( "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic Depths of REF and ALT(s) in the order listed\">\n" );
361
+ $vcf_fh->print( "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n" );
362
+ $vcf_fh->print( "##INFO=<ID=VC,Number=1,Type=String,Description=\"Variant_Classification\">\n" );
363
+ $vcf_fh->print( "##INFO=<ID=VT,Number=1,Type=String,Description=\"Variant_Type\">\n" );
364
+ $vcf_fh->print( "##FILTER=<ID=$_,Description=\"$_\">\n" ) foreach ( sort keys %filter_tags );
365
+ $vcf_fh->print( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" . join("\t", @vcf_cols) . "\n" );
366
+
367
+ # Write each variant into the multi-sample VCF
368
+ print STDOUT "INFO: Writing multi-sample VCF\n";
369
+ foreach my $key ( @var_key ) {
370
+ my ( $chr, $pos, $ref, $alt, $vc, $vt ) = split( "\t", $key );
371
+ $vcf_fh->print( join( "\t", $chr, $pos, $var_id{ $key }, $ref, $alt, $var_qual{ $key }, $var_fltr{ $key }, "VC=$vc;VT=$vt", "GT:AD:DP" ));
372
+ map{ $vcf_fh->print( "\t" . (( exists $var_frmt{$key}{$_} ) ? $var_frmt{$key}{$_} : './.:.:.' ))}( 0..$#vcf_cols );
373
+ $vcf_fh->print( "\n" );
374
+ }
375
+ $vcf_fh->close;
376
+
377
+ # Make sure that we handled a positive non-zero number of lines in the MAF
378
+ ( $line_count > 0 ) or die "ERROR: No variant lines in the input MAF!\n";
379
+
380
+ __DATA__
381
+
382
+ =head1 NAME
383
+
384
+ maf2vcf.pl - Reformat variants in a MAF into a multisample VCF with GT:AD:DP data if available
385
+
386
+ =head1 SYNOPSIS
387
+
388
+ perl maf2vcf.pl --help
389
+ perl maf2vcf.pl --input-maf test.maf --output-dir vcfs
390
+
391
+ =head1 OPTIONS
392
+
393
+ --input-maf Path to input file in MAF format
394
+ --output-dir Path to output directory where VCFs will be stored, one per TN-pair
395
+ --output-vcf Path to output multi-sample VCF containing all TN-pairs [<output-dir>/<input-maf-name>.vcf]
396
+ --ref-fasta Path to reference Fasta file [~/.vep/homo_sapiens/102_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa.gz]
397
+ --samtools Path to samtools binary [samtools]
398
+ --per-tn-vcfs Specify this to generate VCFs per-TN pair, in addition to the multi-sample VCF
399
+ --tum-depth-col Name of MAF column for read depth in tumor BAM [t_depth]
400
+ --tum-rad-col Name of MAF column for reference allele depth in tumor BAM [t_ref_count]
401
+ --tum-vad-col Name of MAF column for variant allele depth in tumor BAM [t_alt_count]
402
+ --nrm-depth-col Name of MAF column for read depth in normal BAM [n_depth]
403
+ --nrm-rad-col Name of MAF column for reference allele depth in normal BAM [n_ref_count]
404
+ --nrm-vad-col Name of MAF column for variant allele depth in normal BAM [n_alt_count]
405
+ --help Print a brief help message and quit
406
+ --man Print the detailed manual
407
+
408
+ =head1 DESCRIPTION
409
+
410
+ This script breaks down variants in a MAF into a multi-sample VCF, in preparation for annotation with VEP. Can also create VCFs per-TN pair.
411
+
412
+ =head2 Relevant links:
413
+
414
+ Homepage: https://github.com/ckandoth/vcf2maf
415
+ VCF format: http://samtools.github.io/hts-specs/
416
+ MAF format: https://wiki.nci.nih.gov/x/eJaPAQ
417
+
418
+ =head1 AUTHORS
419
+
420
+ Cyriac Kandoth (ckandoth@gmail.com)
421
+ Qingguo Wang (josephw10000@gmail.com)
422
+
423
+ =head1 LICENSE
424
+
425
+ Apache-2.0 | Apache License, Version 2.0 | https://www.apache.org/licenses/LICENSE-2.0
426
+
427
+ =cut
@@ -0,0 +1,26 @@
1
+ from os import path
2
+ import cmdy
3
+
4
+ infile = {{in.infile | quote}} # pyright: ignore
5
+ outfile = {{out.outfile | quote}} # pyright: ignore
6
+ joboutdir = {{job.outdir | quote}} # pyright: ignore
7
+ vcfanno = {{envs.vcfanno | quote}} # pyright: ignore
8
+ ncores = {{envs.ncores | repr}} # pyright: ignore
9
+ args = {{envs.args | repr}} # pyright: ignore
10
+
11
+ {% set conf = envs.conffile or in.conffile %}
12
+ {% if conf | isinstance: dict %}
13
+ conffile = path.join(joboutdir, "config.toml")
14
+ conf = {{ conf | toml | quote }}
15
+ with open(conffile, "w") as f:
16
+ f.write(conf)
17
+ {% else %}
18
+ conffile = {{conf | quote}}
19
+ {% endif %}
20
+
21
+ args["p"] = ncores
22
+ args["_"] = [conffile, infile]
23
+ args["_exe"] = vcfanno
24
+ args["_prefix"] = "-"
25
+
26
+ cmdy.vcfanno(**args).r() > outfile
@@ -1,5 +1,5 @@
1
1
  import re
2
-
2
+ import gzip
3
3
  from biopipen.utils.vcf import * # noqa: F401, F403
4
4
 
5
5
 
@@ -63,7 +63,8 @@ def fix_vcffile(vcffile, outfile, fixes):
63
63
  else:
64
64
  modify_fixes.append(fix)
65
65
 
66
- with open(vcffile, "r") as fin, open(outfile, "w") as fout:
66
+ inopen = gzip.open if vcffile.endswith(".gz") else open
67
+ with inopen(vcffile, "rt") as fin, open(outfile, "w") as fout:
67
68
  for line in fin:
68
69
  obj = line_to_obj(line)
69
70
  out = handle_obj(obj, modify_fixes)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: biopipen
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: Bioinformatics processes/pipelines that can be run from `pipen run`
5
5
  License: MIT
6
6
  Author: pwwang
@@ -12,12 +12,11 @@ Classifier: Programming Language :: Python :: 3.8
12
12
  Classifier: Programming Language :: Python :: 3.9
13
13
  Classifier: Programming Language :: Python :: 3.10
14
14
  Classifier: Programming Language :: Python :: 3.11
15
- Provides-Extra: test
16
15
  Requires-Dist: cmdy (>=0.5,<0.6)
17
16
  Requires-Dist: datar[pandas] (>=0.11,<0.12)
18
- Requires-Dist: pipen (>=0.3,<0.4)
19
- Requires-Dist: pipen-args (>=0.3,<0.4) ; extra == "test"
20
- Requires-Dist: pipen-cli-run (>=0.4,<0.5)
21
- Requires-Dist: pipen-filters (>=0.1,<0.2)
22
- Requires-Dist: pipen-report (>=0.4,<0.5)
23
- Requires-Dist: pipen-verbose (>=0.1,<0.2) ; extra == "test"
17
+ Requires-Dist: pipen (>=0.5,<0.6)
18
+ Requires-Dist: pipen-args (>=0.6,<0.7)
19
+ Requires-Dist: pipen-cli-run (>=0.5,<0.6)
20
+ Requires-Dist: pipen-filters (>=0.4,<0.5)
21
+ Requires-Dist: pipen-report (>=0.6,<0.7)
22
+ Requires-Dist: pipen-verbose (>=0.3,<0.4)