ngs_server 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/ngs_server +72 -50
- data/ext/bamtools/extconf.rb +3 -3
- data/ext/vcftools/Makefile +28 -0
- data/ext/vcftools/README.txt +36 -0
- data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
- data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
- data/ext/vcftools/cpp/.svn/entries +708 -0
- data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
- data/ext/vcftools/cpp/Makefile +46 -0
- data/ext/vcftools/cpp/dgeev.cpp +146 -0
- data/ext/vcftools/cpp/dgeev.h +43 -0
- data/ext/vcftools/cpp/output_log.cpp +79 -0
- data/ext/vcftools/cpp/output_log.h +34 -0
- data/ext/vcftools/cpp/parameters.cpp +535 -0
- data/ext/vcftools/cpp/parameters.h +154 -0
- data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
- data/ext/vcftools/cpp/vcf_entry.h +190 -0
- data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
- data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
- data/ext/vcftools/cpp/vcf_file.cpp +495 -0
- data/ext/vcftools/cpp/vcf_file.h +184 -0
- data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
- data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
- data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
- data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
- data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
- data/ext/vcftools/cpp/vcftools.cpp +107 -0
- data/ext/vcftools/cpp/vcftools.h +25 -0
- data/ext/vcftools/examples/.svn/all-wcprops +185 -0
- data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
- data/ext/vcftools/examples/.svn/entries +1048 -0
- data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
- data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
- data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
- data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
- data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
- data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
- data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
- data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
- data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
- data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
- data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
- data/ext/vcftools/examples/annotate-test.vcf +37 -0
- data/ext/vcftools/examples/annotate.out +23 -0
- data/ext/vcftools/examples/annotate.txt +7 -0
- data/ext/vcftools/examples/annotate2.out +52 -0
- data/ext/vcftools/examples/annotate3.out +23 -0
- data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
- data/ext/vcftools/examples/cmp-test.out +53 -0
- data/ext/vcftools/examples/concat-a.vcf +21 -0
- data/ext/vcftools/examples/concat-b.vcf +13 -0
- data/ext/vcftools/examples/concat-c.vcf +19 -0
- data/ext/vcftools/examples/concat.out +39 -0
- data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
- data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
- data/ext/vcftools/examples/merge-test-a.vcf +17 -0
- data/ext/vcftools/examples/merge-test-b.vcf +17 -0
- data/ext/vcftools/examples/merge-test-c.vcf +15 -0
- data/ext/vcftools/examples/merge-test.vcf.out +31 -0
- data/ext/vcftools/examples/perl-api-1.pl +46 -0
- data/ext/vcftools/examples/query-test.out +6 -0
- data/ext/vcftools/examples/shuffle-test.vcf +12 -0
- data/ext/vcftools/examples/subset.SNPs.out +10 -0
- data/ext/vcftools/examples/subset.indels.out +18 -0
- data/ext/vcftools/examples/subset.vcf +21 -0
- data/ext/vcftools/examples/valid-3.3.vcf +30 -0
- data/ext/vcftools/examples/valid-4.0.vcf +34 -0
- data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
- data/ext/vcftools/examples/valid-4.1.vcf +37 -0
- data/ext/vcftools/extconf.rb +2 -0
- data/ext/vcftools/perl/.svn/all-wcprops +149 -0
- data/ext/vcftools/perl/.svn/entries +844 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
- data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
- data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
- data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
- data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
- data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
- data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
- data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
- data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
- data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
- data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
- data/ext/vcftools/perl/ChangeLog +84 -0
- data/ext/vcftools/perl/FaSlice.pm +214 -0
- data/ext/vcftools/perl/Makefile +12 -0
- data/ext/vcftools/perl/Vcf.pm +2853 -0
- data/ext/vcftools/perl/VcfStats.pm +681 -0
- data/ext/vcftools/perl/fill-aa +103 -0
- data/ext/vcftools/perl/fill-an-ac +56 -0
- data/ext/vcftools/perl/fill-ref-md5 +204 -0
- data/ext/vcftools/perl/tab-to-vcf +92 -0
- data/ext/vcftools/perl/test.t +376 -0
- data/ext/vcftools/perl/vcf-annotate +1099 -0
- data/ext/vcftools/perl/vcf-compare +1193 -0
- data/ext/vcftools/perl/vcf-concat +310 -0
- data/ext/vcftools/perl/vcf-convert +180 -0
- data/ext/vcftools/perl/vcf-fix-newlines +97 -0
- data/ext/vcftools/perl/vcf-isec +660 -0
- data/ext/vcftools/perl/vcf-merge +577 -0
- data/ext/vcftools/perl/vcf-query +286 -0
- data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
- data/ext/vcftools/perl/vcf-sort +79 -0
- data/ext/vcftools/perl/vcf-stats +160 -0
- data/ext/vcftools/perl/vcf-subset +206 -0
- data/ext/vcftools/perl/vcf-to-tab +112 -0
- data/ext/vcftools/perl/vcf-validator +145 -0
- data/ext/vcftools/website/.svn/all-wcprops +41 -0
- data/ext/vcftools/website/.svn/entries +238 -0
- data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
- data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
- data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
- data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
- data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
- data/ext/vcftools/website/Makefile +6 -0
- data/ext/vcftools/website/README +2 -0
- data/ext/vcftools/website/VCF-poster.pdf +0 -0
- data/ext/vcftools/website/default.css +250 -0
- data/ext/vcftools/website/favicon.ico +0 -0
- data/ext/vcftools/website/favicon.png +0 -0
- data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/img/.svn/entries +300 -0
- data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
- data/ext/vcftools/website/img/bg.gif +0 -0
- data/ext/vcftools/website/img/bgcode.gif +0 -0
- data/ext/vcftools/website/img/bgcontainer.gif +0 -0
- data/ext/vcftools/website/img/bgul.gif +0 -0
- data/ext/vcftools/website/img/header.gif +0 -0
- data/ext/vcftools/website/img/li.gif +0 -0
- data/ext/vcftools/website/img/quote.gif +0 -0
- data/ext/vcftools/website/img/search.gif +0 -0
- data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/src/.svn/entries +300 -0
- data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
- data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
- data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
- data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
- data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
- data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
- data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
- data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
- data/ext/vcftools/website/src/docs.inc +202 -0
- data/ext/vcftools/website/src/index.inc +52 -0
- data/ext/vcftools/website/src/index.php +80 -0
- data/ext/vcftools/website/src/license.inc +27 -0
- data/ext/vcftools/website/src/links.inc +13 -0
- data/ext/vcftools/website/src/options.inc +654 -0
- data/ext/vcftools/website/src/perl_module.inc +249 -0
- data/ext/vcftools/website/src/specs.inc +18 -0
- data/lib/config.ru +9 -0
- data/lib/ngs_server/add.rb +9 -0
- data/lib/ngs_server/version.rb +1 -1
- data/lib/ngs_server.rb +55 -3
- data/ngs_server.gemspec +5 -2
- metadata +296 -6
@@ -0,0 +1,2853 @@
|
|
1
|
+
package Vcf;
|
2
|
+
|
3
|
+
# http://vcftools.sourceforge.net/specs.html
|
4
|
+
# http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
|
5
|
+
# http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:variant_call_format
|
6
|
+
# http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf4.0
|
7
|
+
# http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf_4.0_sv
|
8
|
+
# http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3
|
9
|
+
# http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcfv3.2
|
10
|
+
#
|
11
|
+
# Authors: petr.danecek@sanger
|
12
|
+
# for VCF v3.2, v3.3, v4.0, v4.1
|
13
|
+
#
|
14
|
+
|
15
|
+
=head1 NAME
|
16
|
+
|
17
|
+
Vcf.pm. Module for validation, parsing and creating VCF files.
|
18
|
+
Supported versions: 3.2, 3.3, 4.0, 4.1
|
19
|
+
|
20
|
+
=head1 SYNOPSIS
|
21
|
+
|
22
|
+
From the command line:
|
23
|
+
perl -MVcf -e validate example.vcf
|
24
|
+
perl -I/path/to/the/module/ -MVcf -e validate_v32 example.vcf
|
25
|
+
|
26
|
+
From a script:
|
27
|
+
use Vcf;
|
28
|
+
|
29
|
+
my $vcf = Vcf->new(file=>'example.vcf.gz',region=>'1:1000-2000');
|
30
|
+
$vcf->parse_header();
|
31
|
+
|
32
|
+
# Do some simple parsing. Most thorough but slowest way how to get the data.
|
33
|
+
while (my $x=$vcf->next_data_hash())
|
34
|
+
{
|
35
|
+
for my $gt (keys %{$$x{gtypes}})
|
36
|
+
{
|
37
|
+
my ($al1,$sep,$al2) = $vcf->parse_alleles($x,$gt);
|
38
|
+
print "\t$gt: $al1$sep$al2\n";
|
39
|
+
}
|
40
|
+
print "\n";
|
41
|
+
}
|
42
|
+
|
43
|
+
# This will split the fields and print a list of CHR:POS
|
44
|
+
while (my $x=$vcf->next_data_array())
|
45
|
+
{
|
46
|
+
print "$$x[0]:$$x[1]\n";
|
47
|
+
}
|
48
|
+
|
49
|
+
# This will return the lines as they were read, including the newline at the end
|
50
|
+
while (my $x=$vcf->next_line())
|
51
|
+
{
|
52
|
+
print $x;
|
53
|
+
}
|
54
|
+
|
55
|
+
# Only the columns NA00001, NA00002 and NA00003 will be printed.
|
56
|
+
my @columns = qw(NA00001 NA00002 NA00003);
|
57
|
+
print $vcf->format_header(\@columns);
|
58
|
+
while (my $x=$vcf->next_data_array())
|
59
|
+
{
|
60
|
+
# this will recalculate AC and AN counts, unless $vcf->recalc_ac_an was set to 0
|
61
|
+
print $vcf->format_line($x,\@columns);
|
62
|
+
}
|
63
|
+
|
64
|
+
$vcf->close();
|
65
|
+
|
66
|
+
=cut
|
67
|
+
|
68
|
+
|
69
|
+
use strict;
|
70
|
+
use warnings;
|
71
|
+
use Carp;
|
72
|
+
use Exporter;
|
73
|
+
use Data::Dumper;
|
74
|
+
use POSIX ":sys_wait_h";
|
75
|
+
|
76
|
+
use vars qw/@ISA @EXPORT/;
|
77
|
+
@ISA = qw/Exporter/;
|
78
|
+
@EXPORT = qw/validate validate_v32/;
|
79
|
+
|
80
|
+
=head2 validate
|
81
|
+
|
82
|
+
About : Validates the VCF file.
|
83
|
+
Usage : perl -MVcf -e validate example.vcf.gz # (from the command line)
|
84
|
+
validate('example.vcf.gz'); # (from a script)
|
85
|
+
validate(\*STDIN);
|
86
|
+
Args : File name or file handle. When no argument given, the first command line
|
87
|
+
argument is interpreted as the file name.
|
88
|
+
|
89
|
+
=cut
|
90
|
+
|
91
|
+
sub validate
|
92
|
+
{
|
93
|
+
my ($fh) = @_;
|
94
|
+
|
95
|
+
if ( !$fh && @ARGV ) { $fh = $ARGV[0]; }
|
96
|
+
|
97
|
+
my $vcf;
|
98
|
+
if ( $fh ) { $vcf = fileno($fh) ? Vcf->new(fh=>$fh) : Vcf->new(file=>$fh); }
|
99
|
+
else { $vcf = Vcf->new(fh=>\*STDIN); }
|
100
|
+
|
101
|
+
$vcf->run_validation();
|
102
|
+
}
|
103
|
+
|
104
|
+
|
105
|
+
=head2 validate_v32
|
106
|
+
|
107
|
+
About : Same as validate, but assumes v3.2 VCF version.
|
108
|
+
Usage : perl -MVcf -e validate_v32 example.vcf.gz # (from the command line)
|
109
|
+
Args : File name or file handle. When no argument given, the first command line
|
110
|
+
argument is interpreted as the file name.
|
111
|
+
|
112
|
+
=cut
|
113
|
+
|
114
|
+
sub validate_v32
|
115
|
+
{
|
116
|
+
my ($fh) = @_;
|
117
|
+
|
118
|
+
if ( !$fh && @ARGV && -e $ARGV[0] ) { $fh = $ARGV[0]; }
|
119
|
+
|
120
|
+
my %params = ( version=>'3.2' );
|
121
|
+
|
122
|
+
my $vcf;
|
123
|
+
if ( $fh ) { $vcf = fileno($fh) ? Vcf->new(%params, fh=>$fh) : Vcf->new(%params, file=>$fh); }
|
124
|
+
else { $vcf = Vcf->new(%params, fh=>\*STDIN); }
|
125
|
+
|
126
|
+
$vcf->run_validation();
|
127
|
+
}
|
128
|
+
|
129
|
+
|
130
|
+
=head2 new
|
131
|
+
|
132
|
+
About : Creates new VCF reader/writer.
|
133
|
+
Usage : my $vcf = Vcf->new(file=>'my.vcf', version=>'3.2');
|
134
|
+
Args :
|
135
|
+
fh .. Open file handle. If neither file nor fh is given, open in write mode.
|
136
|
+
file .. The file name. If neither file nor fh is given, open in write mode.
|
137
|
+
region .. Optional region to parse (requires tabix indexed VCF file)
|
138
|
+
silent .. Unless set to 0, warning messages may be printed.
|
139
|
+
strict .. Unless set to 0, the reader will die when the file violates the specification.
|
140
|
+
version .. If not given, '4.0' is assumed. The header information overrides this setting.
|
141
|
+
|
142
|
+
=cut
|
143
|
+
|
144
|
+
sub new
|
145
|
+
{
|
146
|
+
my ($class,@args) = @_;
|
147
|
+
my $self = {@args};
|
148
|
+
bless $self, ref($class) || $class;
|
149
|
+
|
150
|
+
$$self{silent} = 0 unless exists($$self{silent});
|
151
|
+
$$self{strict} = 0 unless exists($$self{strict});
|
152
|
+
$$self{buffer} = []; # buffer stores the lines in the reverse order
|
153
|
+
$$self{columns} = undef; # column names
|
154
|
+
$$self{mandatory} = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO'] unless exists($$self{mandatory});
|
155
|
+
$$self{reserved}{cols} = {CHROM=>1,POS=>1,ID=>1,REF=>1,ALT=>1,QUAL=>1,FILTER=>1,INFO=>1,FORMAT=>1} unless exists($$self{reserved_cols});
|
156
|
+
$$self{recalc_ac_an} = 1;
|
157
|
+
$$self{has_header} = 0;
|
158
|
+
$$self{default_version} = '4.1';
|
159
|
+
$$self{versions} = [ qw(Vcf3_2 Vcf3_3 Vcf4_0 Vcf4_1) ];
|
160
|
+
if ( !exists($$self{max_line_len}) && exists($ENV{MAX_VCF_LINE_LEN}) ) { $$self{max_line_len} = $ENV{MAX_VCF_LINE_LEN} }
|
161
|
+
$$self{fix_v40_AGtags} = $ENV{DONT_FIX_VCF40_AG_TAGS} ? 0 : 1;
|
162
|
+
my %open_args = ();
|
163
|
+
if ( exists($$self{region}) ) { $open_args{region}=$$self{region}; }
|
164
|
+
if ( exists($$self{print_header}) ) { $open_args{print_header}=$$self{print_header}; }
|
165
|
+
return $self->_open(%open_args);
|
166
|
+
}
|
167
|
+
|
168
|
+
sub throw
|
169
|
+
{
|
170
|
+
my ($self,@msg) = @_;
|
171
|
+
confess @msg,"\n";
|
172
|
+
}
|
173
|
+
|
174
|
+
sub warn
|
175
|
+
{
|
176
|
+
my ($self,@msg) = @_;
|
177
|
+
if ( $$self{silent} ) { return; }
|
178
|
+
if ( $$self{strict} ) { $self->throw(@msg); }
|
179
|
+
warn @msg;
|
180
|
+
}
|
181
|
+
|
182
|
+
sub _open
|
183
|
+
{
|
184
|
+
my ($self,%args) = @_;
|
185
|
+
|
186
|
+
if ( !exists($$self{fh}) && !exists($$self{file}) )
|
187
|
+
{
|
188
|
+
# Write mode, the version must be supplied by the user
|
189
|
+
return $self->_set_version(exists($$self{version}) ? $$self{version} : $$self{default_version});
|
190
|
+
}
|
191
|
+
|
192
|
+
# Open the file unless filehandle is provided
|
193
|
+
if ( !exists($$self{fh}) )
|
194
|
+
{
|
195
|
+
my $cmd = "<$$self{file}";
|
196
|
+
|
197
|
+
my $tabix_args = '';
|
198
|
+
if ( exists($args{print_header}) && $args{print_header} ) { $tabix_args .= ' -h '; }
|
199
|
+
$tabix_args .= $$self{file};
|
200
|
+
if ( exists($args{region}) && defined($args{region}) ) { $tabix_args .= ' '.$args{region}; }
|
201
|
+
|
202
|
+
if ( -e $$self{file} && $$self{file}=~/\.gz/i )
|
203
|
+
{
|
204
|
+
if ( exists($args{region}) && defined($args{region}) )
|
205
|
+
{
|
206
|
+
$cmd = "tabix $tabix_args |";
|
207
|
+
}
|
208
|
+
else { $cmd = "gunzip -c $$self{file} |"; }
|
209
|
+
$$self{check_exit_status} = 1;
|
210
|
+
}
|
211
|
+
elsif ( $$self{file}=~m{^(?:http|ftp)://} )
|
212
|
+
{
|
213
|
+
if ( !exists($args{region}) ) { $tabix_args .= ' .'; }
|
214
|
+
$cmd = "tabix $tabix_args |";
|
215
|
+
$$self{check_exit_status} = 1;
|
216
|
+
}
|
217
|
+
open($$self{fh},$cmd) or $self->throw("$cmd: $!");
|
218
|
+
}
|
219
|
+
|
220
|
+
# Set the correct VCF version, but only when called for the first time
|
221
|
+
my $vcf = $self;
|
222
|
+
if ( !$$self{_version_set} )
|
223
|
+
{
|
224
|
+
my $first_line = $self->next_line();
|
225
|
+
$vcf = $self->_set_version($first_line);
|
226
|
+
$self->_unread_line($first_line);
|
227
|
+
}
|
228
|
+
return $vcf;
|
229
|
+
}
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
=head2 open
|
234
|
+
|
235
|
+
About : (Re)Open file. No need to call this explicitly unless reading from a different
|
236
|
+
region is requested.
|
237
|
+
Usage : $vcf->open(); # Read from the start
|
238
|
+
$vcf->open(region=>'1:12345-92345');
|
239
|
+
Args : region .. Supported only for tabix indexed files
|
240
|
+
|
241
|
+
=cut
|
242
|
+
|
243
|
+
sub open
|
244
|
+
{
|
245
|
+
my ($self,%args) = @_;
|
246
|
+
$self->close();
|
247
|
+
$self->_open(%args);
|
248
|
+
}
|
249
|
+
|
250
|
+
|
251
|
+
=head2 close
|
252
|
+
|
253
|
+
About : Close the filehandle
|
254
|
+
Usage : $vcf->close();
|
255
|
+
Args : none
|
256
|
+
|
257
|
+
=cut
|
258
|
+
|
259
|
+
sub close
|
260
|
+
{
|
261
|
+
my ($self) = @_;
|
262
|
+
if ( !$$self{fh} ) { return; }
|
263
|
+
close($$self{fh});
|
264
|
+
delete($$self{fh});
|
265
|
+
}
|
266
|
+
|
267
|
+
|
268
|
+
=head2 next_line
|
269
|
+
|
270
|
+
About : Reads next VCF line.
|
271
|
+
Usage : my $vcf = Vcf->new();
|
272
|
+
my $x = $vcf->next_line();
|
273
|
+
Args : none
|
274
|
+
|
275
|
+
=cut
|
276
|
+
|
277
|
+
sub next_line
|
278
|
+
{
|
279
|
+
my ($self) = @_;
|
280
|
+
if ( @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); }
|
281
|
+
|
282
|
+
my $line;
|
283
|
+
if ( !exists($$self{max_line_len}) )
|
284
|
+
{
|
285
|
+
$line = readline($$self{fh});
|
286
|
+
}
|
287
|
+
else
|
288
|
+
{
|
289
|
+
while (1)
|
290
|
+
{
|
291
|
+
$line = readline($$self{fh});
|
292
|
+
if ( !defined $line ) { last; }
|
293
|
+
|
294
|
+
my $len = length($line);
|
295
|
+
if ( $len>$$self{max_line_len} && !($line=~/^#/) )
|
296
|
+
{
|
297
|
+
if ( !($line=~/^([^\t]+)\t([^\t]+)/) ) { $self->throw("Could not parse the line: $line"); }
|
298
|
+
$self->warn("The VCF line too long, ignoring: $1 $2 .. len=$len\n");
|
299
|
+
next;
|
300
|
+
}
|
301
|
+
last;
|
302
|
+
}
|
303
|
+
}
|
304
|
+
if ( !defined $line && $$self{check_exit_status} )
|
305
|
+
{
|
306
|
+
my $pid = waitpid(-1, WNOHANG);
|
307
|
+
if ( $pid!=0 && $pid!=-1 && $? !=0 )
|
308
|
+
{
|
309
|
+
$self->throw("Error reading VCF file.\n");
|
310
|
+
}
|
311
|
+
}
|
312
|
+
return $line;
|
313
|
+
}
|
314
|
+
|
315
|
+
sub _unread_line
|
316
|
+
{
|
317
|
+
my ($self,$line) = @_;
|
318
|
+
unshift @{$$self{buffer}}, $line;
|
319
|
+
return;
|
320
|
+
}
|
321
|
+
|
322
|
+
|
323
|
+
=head2 next_data_array
|
324
|
+
|
325
|
+
About : Reads next VCF line and splits it into an array. The last element is chomped.
|
326
|
+
Usage : my $vcf = Vcf->new();
|
327
|
+
$vcf->parse_header();
|
328
|
+
my $x = $vcf->next_data_array();
|
329
|
+
Args : Optional line to parse
|
330
|
+
|
331
|
+
=cut
|
332
|
+
|
333
|
+
sub next_data_array
|
334
|
+
{
|
335
|
+
my ($self,$line) = @_;
|
336
|
+
if ( !$line ) { $line = $self->next_line(); }
|
337
|
+
if ( !$line ) { return undef; }
|
338
|
+
my @items = split(/\t/,$line);
|
339
|
+
chomp($items[-1]);
|
340
|
+
return \@items;
|
341
|
+
}
|
342
|
+
|
343
|
+
|
344
|
+
=head2 set_samples
|
345
|
+
|
346
|
+
About : Parsing big VCF files with many sample columns is slow, not parsing unwanted samples may speed things a bit.
|
347
|
+
Usage : my $vcf = Vcf->new();
|
348
|
+
$vcf->set_samples(include=>['NA0001']); # Exclude all but this sample. When the array is empty, all samples will be excluded.
|
349
|
+
$vcf->set_samples(exclude=>['NA0003']); # Include only this sample. When the array is empty, all samples will be included.
|
350
|
+
my $x = $vcf->next_data_hash();
|
351
|
+
Args : Optional line to parse
|
352
|
+
|
353
|
+
=cut
|
354
|
+
|
355
|
+
sub set_samples
|
356
|
+
{
|
357
|
+
my ($self,%args) = @_;
|
358
|
+
|
359
|
+
if ( exists($args{include}) )
|
360
|
+
{
|
361
|
+
for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 0; }
|
362
|
+
for my $sample (@{$args{include}})
|
363
|
+
{
|
364
|
+
if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); }
|
365
|
+
my $idx = $$self{has_column}{$sample} - 1;
|
366
|
+
$$self{samples_to_parse}[$idx] = 1;
|
367
|
+
}
|
368
|
+
}
|
369
|
+
|
370
|
+
if ( exists($args{exclude}) )
|
371
|
+
{
|
372
|
+
for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 1; }
|
373
|
+
for my $sample (@{$args{exclude}})
|
374
|
+
{
|
375
|
+
if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); }
|
376
|
+
my $idx = $$self{has_column}{$sample} - 1;
|
377
|
+
$$self{samples_to_parse}[$idx] = 0;
|
378
|
+
}
|
379
|
+
}
|
380
|
+
}
|
381
|
+
|
382
|
+
|
383
|
+
sub _set_version
|
384
|
+
{
|
385
|
+
my ($self,$version_line) = @_;
|
386
|
+
|
387
|
+
if ( $$self{_version_set} ) { return $self; }
|
388
|
+
$$self{_version_set} = 1;
|
389
|
+
|
390
|
+
$$self{version} = $$self{default_version};
|
391
|
+
if ( $version_line )
|
392
|
+
{
|
393
|
+
if ( $version_line=~/^(\d+(?:\.\d+)?)$/ )
|
394
|
+
{
|
395
|
+
$$self{version} = $1;
|
396
|
+
undef $version_line;
|
397
|
+
}
|
398
|
+
elsif ( !($version_line=~/^##fileformat=/i) or !($version_line=~/(\d+(?:\.\d+)?)\s*$/i) )
|
399
|
+
{
|
400
|
+
$self->warn("Could not parse the fileformat version string [$version_line], assuming VCFv$$self{default_version}\n");
|
401
|
+
undef $version_line;
|
402
|
+
}
|
403
|
+
else
|
404
|
+
{
|
405
|
+
$$self{version} = $1;
|
406
|
+
}
|
407
|
+
}
|
408
|
+
|
409
|
+
my $reader;
|
410
|
+
if ( $$self{version} eq '3.2' ) { $reader=Vcf3_2->new(%$self); }
|
411
|
+
elsif ( $$self{version} eq '3.3' ) { $reader=Vcf3_3->new(%$self); }
|
412
|
+
elsif ( $$self{version} eq '4.0' ) { $reader=Vcf4_0->new(%$self); }
|
413
|
+
elsif ( $$self{version} eq '4.1' ) { $reader=Vcf4_1->new(%$self); }
|
414
|
+
else
|
415
|
+
{
|
416
|
+
$self->warn(qq[The version "$$self{version}" not supported, assuming VCFv$$self{default_version}\n]);
|
417
|
+
$$self{version} = '4.1';
|
418
|
+
$reader = Vcf4_1->new(%$self);
|
419
|
+
}
|
420
|
+
|
421
|
+
$self = $reader;
|
422
|
+
# When changing version, change also the fileformat header line
|
423
|
+
if ( exists($$self{header_lines}) && exists($$self{header_lines}[0]{key}) && $$self{header_lines}[0]{key} eq 'fileformat' )
|
424
|
+
{
|
425
|
+
shift(@{$$self{header_lines}});
|
426
|
+
}
|
427
|
+
|
428
|
+
return $self;
|
429
|
+
}
|
430
|
+
|
431
|
+
|
432
|
+
#---------------------------------------
|
433
|
+
|
434
|
+
package VcfReader;
|
435
|
+
use base qw(Vcf);
|
436
|
+
use strict;
|
437
|
+
use warnings;
|
438
|
+
use Carp;
|
439
|
+
use Data::Dumper;
|
440
|
+
|
441
|
+
sub new
|
442
|
+
{
|
443
|
+
my ($class,@args) = @_;
|
444
|
+
my $self = {@args};
|
445
|
+
bless $self, ref($class) || $class;
|
446
|
+
return $self;
|
447
|
+
}
|
448
|
+
|
449
|
+
|
450
|
+
=head2 next_data_hash
|
451
|
+
|
452
|
+
About : Reads next VCF line and splits it into a hash. This is the slowest way to obtain the data.
|
453
|
+
Usage : my $vcf = Vcf->new();
|
454
|
+
$vcf->parse_header();
|
455
|
+
my $x = $vcf->next_data_hash();
|
456
|
+
|
457
|
+
# Or having a VCF data line $line
|
458
|
+
my $x = $vcf->next_data_hash($line);
|
459
|
+
|
460
|
+
Args : Optional line to parse.
|
461
|
+
|
462
|
+
=cut
|
463
|
+
|
464
|
+
sub next_data_hash
|
465
|
+
{
|
466
|
+
my ($self,$line) = @_;
|
467
|
+
if ( !$line ) { $line = $self->next_line(); }
|
468
|
+
if ( !$line ) { return undef; }
|
469
|
+
my @items;
|
470
|
+
if ( ref($line) eq 'ARRAY' ) { @items = @$line; }
|
471
|
+
else { @items = split(/\t/,$line); }
|
472
|
+
chomp($items[-1]);
|
473
|
+
|
474
|
+
my $cols = $$self{columns};
|
475
|
+
if ( !$cols )
|
476
|
+
{
|
477
|
+
$self->_fake_column_names(scalar @items - 9);
|
478
|
+
$cols = $$self{columns};
|
479
|
+
}
|
480
|
+
|
481
|
+
# Check the number of columns
|
482
|
+
if ( scalar @items != scalar @$cols )
|
483
|
+
{
|
484
|
+
if ( $line=~/^\s*$/ ) { $self->throw("Sorry, empty lines not allowed.\n"); }
|
485
|
+
if ( $line=~/^#/ ) { $self->throw("FIXME: parse_header must be called before next_data_hash.\n"); }
|
486
|
+
|
487
|
+
$self->warn("Different number of columns at $items[0]:$items[1] (expected ".scalar @$cols.", got ".scalar @items.")\n");
|
488
|
+
while ( $items[-1] eq '' ) { pop(@items); }
|
489
|
+
if ( scalar @items != scalar @$cols )
|
490
|
+
{
|
491
|
+
my @test = split(/\s+/,$line);
|
492
|
+
if ( scalar @test == scalar @$cols ) { $self->warn("(Have spaces been used instead of tabs?)\n\n"); }
|
493
|
+
else { $self->throw("Error not recoverable, exiting.\n"); }
|
494
|
+
|
495
|
+
@items = @test;
|
496
|
+
}
|
497
|
+
else { $self->warn("(Trailing tabs?)\n\n"); }
|
498
|
+
}
|
499
|
+
my %out;
|
500
|
+
|
501
|
+
# Mandatory fields
|
502
|
+
$out{CHROM} = $items[0];
|
503
|
+
$out{POS} = $items[1];
|
504
|
+
$out{ID} = $items[2];
|
505
|
+
$out{REF} = $items[3];
|
506
|
+
$out{ALT} = [ split(/,/,$items[4]) ];
|
507
|
+
$out{QUAL} = $items[5];
|
508
|
+
$out{FILTER} = [ split(/;/,$items[6]) ];
|
509
|
+
|
510
|
+
# INFO, e.g. NS=58;DP=258;AF=0.786;DB;H2
|
511
|
+
if ( defined $items[7] )
|
512
|
+
{
|
513
|
+
my %hash;
|
514
|
+
for my $info (split(/;/,$items[7]))
|
515
|
+
{
|
516
|
+
my ($key,$val) = split(/=/,$info);
|
517
|
+
if ( !defined $key )
|
518
|
+
{
|
519
|
+
$self->warn("Broken VCF file, empty INFO field at $items[0]:$items[1]\n");
|
520
|
+
next;
|
521
|
+
}
|
522
|
+
if ( defined $val )
|
523
|
+
{
|
524
|
+
$hash{$key} = $val;
|
525
|
+
}
|
526
|
+
elsif ( exists($$self{header}{INFO}{$key}) )
|
527
|
+
{
|
528
|
+
$hash{$key} = $$self{header}{INFO}{$key}{default};
|
529
|
+
}
|
530
|
+
else
|
531
|
+
{
|
532
|
+
$hash{$key} = undef;
|
533
|
+
}
|
534
|
+
}
|
535
|
+
$out{INFO} = \%hash;
|
536
|
+
}
|
537
|
+
|
538
|
+
# The FORMAT field may not be present. GT:GQ:DP:HQ
|
539
|
+
my $format;
|
540
|
+
if ( $$cols[8] || $items[8] )
|
541
|
+
{
|
542
|
+
$format = $out{FORMAT} = [ split(/:/,$items[8]) ];
|
543
|
+
if ( (!$$format[0] || $$format[0] ne 'GT') && !$$self{ignore_missing_GT} ) { $self->warn("Expected GT as the first genotype field at $items[0]:$items[1]\n"); }
|
544
|
+
}
|
545
|
+
|
546
|
+
# Genotype fields
|
547
|
+
my %gtypes;
|
548
|
+
my $check_nformat = $$self{drop_trailings} ? 0 : 1;
|
549
|
+
for (my $icol=9; $icol<@items; $icol++)
|
550
|
+
{
|
551
|
+
if ( $items[$icol] eq '' ) { $self->warn("Empty column $$cols[$icol] at $items[0]:$items[1]\n"); next; }
|
552
|
+
if ( exists($$self{samples_to_parse}) && !$$self{samples_to_parse}[$icol] ) { next; }
|
553
|
+
|
554
|
+
my @fields = split(/:/, $items[$icol]);
|
555
|
+
if ( $check_nformat && @fields != @$format )
|
556
|
+
{
|
557
|
+
$self->warn("Different number of fields in the format and the column $$cols[$icol] at $items[0]:$items[1] ("
|
558
|
+
.scalar @fields." vs ".scalar @$format.": [",join(',',@fields),"] vs [",join(',',@$format),"])\n");
|
559
|
+
}
|
560
|
+
my %hash;
|
561
|
+
for (my $ifield=0; $ifield<@fields; $ifield++)
|
562
|
+
{
|
563
|
+
$hash{$$format[$ifield]} = $fields[$ifield];
|
564
|
+
}
|
565
|
+
$gtypes{$$cols[$icol]} = \%hash;
|
566
|
+
}
|
567
|
+
$out{gtypes} = \%gtypes;
|
568
|
+
|
569
|
+
return \%out;
|
570
|
+
}
|
571
|
+
|
572
|
+
|
573
|
+
=head2 parse_header
|
574
|
+
|
575
|
+
About : Reads (and stores) the VCF header.
|
576
|
+
Usage : my $vcf = Vcf->new(); $vcf->parse_header();
|
577
|
+
Args : silent .. do not warn about duplicate header lines
|
578
|
+
|
579
|
+
=cut
|
580
|
+
|
581
|
+
sub parse_header
|
582
|
+
{
|
583
|
+
my ($self,%args) = @_;
|
584
|
+
|
585
|
+
# First come the header lines prefixed by ##
|
586
|
+
while ($self->_next_header_line(%args)) { ; }
|
587
|
+
|
588
|
+
# Now comes the column names line prefixed by #
|
589
|
+
$self->_read_column_names();
|
590
|
+
}
|
591
|
+
|
592
|
+
|
593
|
+
=head2 _next_header_line
|
594
|
+
|
595
|
+
About : Stores the header lines and meta information, such as fields types, etc.
|
596
|
+
Args : silent .. do not warn about duplicate column names
|
597
|
+
|
598
|
+
=cut
|
599
|
+
|
600
|
+
sub _next_header_line
|
601
|
+
{
|
602
|
+
my ($self,%args) = @_;
|
603
|
+
my $line = $self->next_line();
|
604
|
+
if ( !defined $line ) { return undef; }
|
605
|
+
if ( substr($line,0,2) ne '##' )
|
606
|
+
{
|
607
|
+
$self->_unread_line($line);
|
608
|
+
return undef;
|
609
|
+
}
|
610
|
+
|
611
|
+
my $rec = $self->parse_header_line($line);
|
612
|
+
if ( $rec ) { $self->add_header_line($rec,%args); }
|
613
|
+
|
614
|
+
return $rec;
|
615
|
+
}
|
616
|
+
|
617
|
+
=head2 get_header_line
|
618
|
+
|
619
|
+
Usage : $vcf->get_header_line(key=>'INFO', ID=>'AC')
|
620
|
+
$vcf->get_header_line(key=>'FILTER', ID=>'q10')
|
621
|
+
$vcf->get_header_line(key=>'reference')
|
622
|
+
$vcf->get_header_line(key=>'contig',ID=>'20')
|
623
|
+
Args : Header line filter as in the example above
|
624
|
+
Returns : List ref of header line hashes matching the filter
|
625
|
+
|
626
|
+
=cut
|
627
|
+
|
628
|
+
sub get_header_line
|
629
|
+
{
|
630
|
+
my ($self,%filter) = @_;
|
631
|
+
|
632
|
+
my $key = $filter{key};
|
633
|
+
delete($filter{key});
|
634
|
+
|
635
|
+
my $id = $filter{ID};
|
636
|
+
|
637
|
+
my @out;
|
638
|
+
while (my ($hline_key,$hline_hash) = each %{$$self{header}})
|
639
|
+
{
|
640
|
+
if ( $key ne $hline_key ) { next; }
|
641
|
+
|
642
|
+
if ( defined $id )
|
643
|
+
{
|
644
|
+
if ( !exists($$hline_hash{$id}) ) { next; }
|
645
|
+
$hline_hash = $$hline_hash{$id};
|
646
|
+
}
|
647
|
+
|
648
|
+
my $match = 1;
|
649
|
+
while (my ($fkey,$fval) = each %filter)
|
650
|
+
{
|
651
|
+
if ( !exists($$hline_hash{$fkey}) or $$hline_hash{$fkey} ne $fval )
|
652
|
+
{
|
653
|
+
$match=0;
|
654
|
+
last;
|
655
|
+
}
|
656
|
+
}
|
657
|
+
if ( $match ) { push @out,$hline_hash }
|
658
|
+
}
|
659
|
+
return \@out;
|
660
|
+
}
|
661
|
+
|
662
|
+
|
663
|
+
=head2 add_header_line
|
664
|
+
|
665
|
+
Usage : $vcf->add_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
|
666
|
+
$vcf->add_header_line({key=>'reference',value=>'1000GenomesPilot-NCBI36'})
|
667
|
+
Args : Header line hash as in the example above
|
668
|
+
Hash with additional parameters [optional]
|
669
|
+
silent .. do not warn about existing header keys
|
670
|
+
append .. append timestamp to the name of the new one
|
671
|
+
Returns :
|
672
|
+
|
673
|
+
=cut
|
674
|
+
|
675
|
+
sub add_header_line
|
676
|
+
{
|
677
|
+
my ($self,$rec,%args) = @_;
|
678
|
+
|
679
|
+
if ( !%args ) { $args{silent}=0; }
|
680
|
+
|
681
|
+
my $key = $$rec{key};
|
682
|
+
if ( !$key ) { $self->throw("Missing key: ",Dumper($rec)); }
|
683
|
+
|
684
|
+
if ( exists($$rec{Type}) )
|
685
|
+
{
|
686
|
+
if ( !exists($$rec{default}) )
|
687
|
+
{
|
688
|
+
my $type = $$rec{Type};
|
689
|
+
if ( exists($$self{defaults}{$type}) ) { $$rec{default}=$$self{defaults}{$type}; }
|
690
|
+
else { $$rec{default}=$$self{defaults}{default}; }
|
691
|
+
}
|
692
|
+
if ( !exists($$rec{handler}) )
|
693
|
+
{
|
694
|
+
my $type = $$rec{Type};
|
695
|
+
if ( !exists($$self{handlers}{$type}) )
|
696
|
+
{
|
697
|
+
$self->warn("Unknown type [$type]\n");
|
698
|
+
$type = 'String';
|
699
|
+
$$rec{Type} = $type;
|
700
|
+
}
|
701
|
+
if ( exists($$self{handlers}{$type}) ) { $$rec{handler}=$$self{handlers}{$type}; }
|
702
|
+
else { $self->throw("Unknown type [$type].\n"); }
|
703
|
+
}
|
704
|
+
}
|
705
|
+
|
706
|
+
if ( exists($$rec{ID}) )
|
707
|
+
{
|
708
|
+
my $id = $$rec{ID};
|
709
|
+
if ( exists($$self{header}{$key}{$id}) ) { $self->remove_header_line(%$rec); }
|
710
|
+
$$self{header}{$key}{$id} = $rec;
|
711
|
+
push @{$$self{header_lines}}, $rec;
|
712
|
+
return;
|
713
|
+
}
|
714
|
+
|
715
|
+
if ( $args{append} )
|
716
|
+
{
|
717
|
+
my @tm = gmtime(time);
|
718
|
+
$key = sprintf "%s_%d%.2d%.2d", $key,$tm[5]+1900,$tm[4],$tm[3];
|
719
|
+
my $i = 1;
|
720
|
+
while ( exists($$self{header}{$key.'.'.$i}) ) { $i++; }
|
721
|
+
$key = $key.'.'.$i;
|
722
|
+
$$rec{key} = $key;
|
723
|
+
}
|
724
|
+
|
725
|
+
if ( $self->_header_line_exists($key,$rec) ) { $self->remove_header_line(%$rec); }
|
726
|
+
|
727
|
+
push @{$$self{header}{$key}}, $rec;
|
728
|
+
if ( $$rec{key} eq 'fileformat' )
|
729
|
+
{
|
730
|
+
unshift @{$$self{header_lines}}, $rec;
|
731
|
+
}
|
732
|
+
else
|
733
|
+
{
|
734
|
+
push @{$$self{header_lines}}, $rec;
|
735
|
+
}
|
736
|
+
}
|
737
|
+
|
738
|
+
sub _header_line_exists
|
739
|
+
{
|
740
|
+
my ($self,$key,$rec) = @_;
|
741
|
+
if ( !exists($$self{header}{$key}) ) { return 0; }
|
742
|
+
if ( $key eq 'fileformat' ) { return 1; }
|
743
|
+
for my $hrec (@{$$self{header}{$key}})
|
744
|
+
{
|
745
|
+
my $differ = 0;
|
746
|
+
for my $item (keys %$rec)
|
747
|
+
{
|
748
|
+
if ( !exists($$hrec{$item}) ) { $differ=1; last; }
|
749
|
+
if ( $$hrec{$item} ne $$rec{$item} ) { $differ=1; last; }
|
750
|
+
}
|
751
|
+
if ( !$differ ) { return $hrec; }
|
752
|
+
}
|
753
|
+
return 0;
|
754
|
+
}
|
755
|
+
|
756
|
+
=head2 remove_header_line
|
757
|
+
|
758
|
+
Usage : $vcf->remove_header_line(key=>'INFO', ID=>'AC')
|
759
|
+
Args :
|
760
|
+
Returns :
|
761
|
+
|
762
|
+
=cut
|
763
|
+
|
764
|
+
sub remove_header_line
|
765
|
+
{
|
766
|
+
my ($self,%args) = @_;
|
767
|
+
my $key = $args{key};
|
768
|
+
for (my $i=0; $i<@{$$self{header_lines}}; $i++)
|
769
|
+
{
|
770
|
+
my $line = $$self{header_lines}[$i];
|
771
|
+
if ( $$line{key} ne $key ) { next; }
|
772
|
+
if ( exists($args{ID}) )
|
773
|
+
{
|
774
|
+
if ( $args{ID} ne $$line{ID} ) { next; }
|
775
|
+
delete($$self{header}{$key}{$args{ID}});
|
776
|
+
splice(@{$$self{header_lines}},$i,1);
|
777
|
+
}
|
778
|
+
else
|
779
|
+
{
|
780
|
+
my $to_be_removed = $self->_header_line_exists($key,\%args);
|
781
|
+
if ( !$to_be_removed ) { next; }
|
782
|
+
for (my $j=0; $j<@{$$self{header}{$key}}; $j++)
|
783
|
+
{
|
784
|
+
if ( $$self{header}{$key}[$j] eq $to_be_removed ) { splice(@{$$self{header}{$key}},$j,1); last; }
|
785
|
+
}
|
786
|
+
splice(@{$$self{header_lines}},$i,1);
|
787
|
+
}
|
788
|
+
}
|
789
|
+
}
|
790
|
+
|
791
|
+
|
792
|
+
=head2 parse_header_line
|
793
|
+
|
794
|
+
Usage : $vcf->parse_header_line(q[##reference=1000GenomesPilot-NCBI36])
|
795
|
+
$vcf->parse_header_line(q[##INFO=NS,1,Integer,"Number of Samples With Data"])
|
796
|
+
Args :
|
797
|
+
Returns :
|
798
|
+
|
799
|
+
=cut
|
800
|
+
|
801
|
+
sub parse_header_line
|
802
|
+
{
|
803
|
+
my ($self,$line) = @_;
|
804
|
+
|
805
|
+
chomp($line);
|
806
|
+
$line =~ s/^##//;
|
807
|
+
|
808
|
+
if ( !($line=~/^([^=]+)=/) ) { return { key=>$line, value=>'' }; }
|
809
|
+
my $key = $1;
|
810
|
+
my $value = $';
|
811
|
+
|
812
|
+
my $desc;
|
813
|
+
if ( $value=~/,\s*\"([^\"]+)\"\s*$/ ) { $desc=$1; $value=$`; }
|
814
|
+
|
815
|
+
if ( !$desc ) { return { key=>$key, value=>$value }; }
|
816
|
+
|
817
|
+
if ( $key eq 'INFO' or $key eq 'FORMAT' )
|
818
|
+
{
|
819
|
+
my ($id,$number,$type,@rest) = split(/,\s*/,$value);
|
820
|
+
if ( !$type or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); }
|
821
|
+
return { key=>$key, ID=>$id, Number=>$number, Type=>$type, Description=>$desc };
|
822
|
+
}
|
823
|
+
if ( $key eq 'FILTER' )
|
824
|
+
{
|
825
|
+
my ($id,@rest) = split(/,\s*/,$value);
|
826
|
+
if ( !$id or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); }
|
827
|
+
return { key=>$key, ID=>$id, Description=>$desc };
|
828
|
+
}
|
829
|
+
$self->throw("Could not parse the header line: $line\n");
|
830
|
+
}
|
831
|
+
|
832
|
+
=head2 _read_column_names
|
833
|
+
|
834
|
+
About : Stores the column names as array $$self{columns} and hash $$self{has_column}{COL_NAME}=index.
|
835
|
+
The indexes go from 1.
|
836
|
+
Usage : $vcf->_read_column_names();
|
837
|
+
Args : none
|
838
|
+
|
839
|
+
=cut
|
840
|
+
|
841
|
+
sub _read_column_names
|
842
|
+
{
|
843
|
+
my ($self) = @_;
|
844
|
+
my $line = $self->next_line();
|
845
|
+
if ( !defined $line or substr($line,0,1) ne '#' ) { $self->throw("Broken VCF header, no column names?"); }
|
846
|
+
$$self{column_line} = $line;
|
847
|
+
|
848
|
+
chomp($line);
|
849
|
+
my @cols = split(/\t/, substr($line,1));
|
850
|
+
my $ncols = scalar @cols;
|
851
|
+
if ( $ncols == 1 )
|
852
|
+
{
|
853
|
+
# If there is only one name, it can be space-seprated instead of tab separated
|
854
|
+
@cols = split(/\s+/, $cols[0]);
|
855
|
+
$ncols = scalar @cols;
|
856
|
+
chomp($line);
|
857
|
+
if ( $ncols <= 1 ) { $self->warn("Could not parse the column names. [$line]\n"); return; }
|
858
|
+
$self->warn("The column names not tab-separated? [$line]\n");
|
859
|
+
}
|
860
|
+
|
861
|
+
my $fields = $$self{mandatory};
|
862
|
+
my $nfields = scalar @$fields;
|
863
|
+
|
864
|
+
# Check the names of the mandatory columns
|
865
|
+
if ( $ncols < $nfields )
|
866
|
+
{
|
867
|
+
chomp($line);
|
868
|
+
$self->warn("Missing some of the mandatory column names.\n\tGot: $line\n\tExpected: #", join("\t",@{$$self{mandatory}}),"\n");
|
869
|
+
return;
|
870
|
+
}
|
871
|
+
|
872
|
+
for (my $i=0; $i<$ncols; $i++)
|
873
|
+
{
|
874
|
+
if ( $cols[$i]=~/^\s+/ or $cols[$i]=~/\s+$/ )
|
875
|
+
{
|
876
|
+
$self->warn("The column name contains leading/trailing spaces, removing: '$cols[$i]'\n");
|
877
|
+
$cols[$i] =~ s/^\s+//;
|
878
|
+
$cols[$i] =~ s/\s+$//;
|
879
|
+
}
|
880
|
+
if ( $i<$nfields && $cols[$i] ne $$fields[$i] )
|
881
|
+
{
|
882
|
+
$self->warn("Expected mandatory column [$$fields[$i]], got [$cols[$i]]\n");
|
883
|
+
$cols[$i] = $$fields[$i];
|
884
|
+
}
|
885
|
+
$$self{has_column}{$cols[$i]} = $i+1;
|
886
|
+
}
|
887
|
+
$$self{columns} = \@cols;
|
888
|
+
return;
|
889
|
+
}
|
890
|
+
|
891
|
+
|
892
|
+
=head2 _fake_column_names
|
893
|
+
|
894
|
+
About : When no header is present, fake column names as the default mandatory ones + numbers
|
895
|
+
Args : The number of genotype columns; 0 if no genotypes but FORMAT present; <0 if FORMAT and genotypes not present
|
896
|
+
|
897
|
+
=cut
|
898
|
+
|
899
|
+
sub _fake_column_names
|
900
|
+
{
|
901
|
+
my ($self,$ncols) = @_;
|
902
|
+
|
903
|
+
$$self{columns} = [ @{$$self{mandatory}} ];
|
904
|
+
if ( $ncols>=0 ) { push @{$$self{columns}}, 'FORMAT'; }
|
905
|
+
for (my $i=1; $i<=$ncols; $i++) { push @{$$self{columns}}, $i; }
|
906
|
+
}
|
907
|
+
|
908
|
+
|
909
|
+
=head2 format_header
|
910
|
+
|
911
|
+
About : Returns the header.
|
912
|
+
Usage : print $vcf->format_header();
|
913
|
+
Args : The columns to include on output [optional]
|
914
|
+
|
915
|
+
=cut
|
916
|
+
|
917
|
+
sub format_header
|
918
|
+
{
|
919
|
+
my ($self,$columns) = @_;
|
920
|
+
|
921
|
+
my $out = '';
|
922
|
+
for my $line (@{$$self{header_lines}}) { $out .= $self->format_header_line($line); }
|
923
|
+
|
924
|
+
# This is required when using the API for writing new VCF files and the caller does not add the line explicitly
|
925
|
+
if ( !exists($$self{header_lines}[0]{key}) or $$self{header_lines}[0]{key} ne 'fileformat' )
|
926
|
+
{
|
927
|
+
$out = "##fileformat=VCFv$$self{version}\n" .$out;
|
928
|
+
}
|
929
|
+
if ( !$$self{columns} ) { return $out; }
|
930
|
+
|
931
|
+
my @out_cols;
|
932
|
+
if ( $columns )
|
933
|
+
{
|
934
|
+
@out_cols = @{$$self{columns}}[0..8];
|
935
|
+
for my $col (@$columns)
|
936
|
+
{
|
937
|
+
if ( exists($$self{has_column}{$col}) ) { push @out_cols, $col; }
|
938
|
+
}
|
939
|
+
}
|
940
|
+
else
|
941
|
+
{
|
942
|
+
@out_cols = @{$$self{columns}};
|
943
|
+
}
|
944
|
+
$out .= "#". join("\t", @out_cols). "\n";
|
945
|
+
|
946
|
+
return $out;
|
947
|
+
}
|
948
|
+
|
949
|
+
|
950
|
+
=head2 format_line
|
951
|
+
|
952
|
+
About : Returns the header.
|
953
|
+
Usage : $x = $vcf->next_data_hash(); print $vcf->format_line($x);
|
954
|
+
$x = $vcf->next_data_array(); print $vcf->format_line($x);
|
955
|
+
Args 1 : The columns or hash in the format returned by next_data_hash or next_data_array.
|
956
|
+
2 : The columns to include [optional]
|
957
|
+
|
958
|
+
=cut
|
959
|
+
|
960
|
+
sub format_line
|
961
|
+
{
|
962
|
+
my ($self,$record,$columns) = @_;
|
963
|
+
|
964
|
+
if ( ref($record) eq 'HASH' ) { return $self->_format_line_hash($record,$columns); }
|
965
|
+
$self->throw("FIXME: todo\n");
|
966
|
+
}
|
967
|
+
|
968
|
+
|
969
|
+
=head2 recalc_ac_an
|
970
|
+
|
971
|
+
About : Control if the AC and AN values should be updated.
|
972
|
+
Usage : $vcf->recalc_ac_an(1); $x = $vcf->next_data_hash(); print $vcf->format_line($x);
|
973
|
+
Args 1 : 0 .. never recalculate
|
974
|
+
1 .. recalculate if present
|
975
|
+
2 .. recalculate if present and add if missing
|
976
|
+
|
977
|
+
=cut
|
978
|
+
|
979
|
+
sub recalc_ac_an
|
980
|
+
{
|
981
|
+
my ($self,$value) = @_;
|
982
|
+
if ( $value eq '0' || $value eq '1' || $value eq '2' ) { $$self{recalc_ac_an} = $value; }
|
983
|
+
return;
|
984
|
+
}
|
985
|
+
|
986
|
+
|
987
|
+
|
988
|
+
sub _format_line_hash
|
989
|
+
{
|
990
|
+
my ($self,$record,$columns) = @_;
|
991
|
+
|
992
|
+
if ( !$$self{columns} )
|
993
|
+
{
|
994
|
+
my $ngtypes = scalar keys %{$$record{gtypes}};
|
995
|
+
if ( !$ngtypes && !exists($$record{FORMAT}) ) { $ngtypes--; }
|
996
|
+
$self->_fake_column_names($ngtypes);
|
997
|
+
}
|
998
|
+
my $cols = $$self{columns};
|
999
|
+
|
1000
|
+
# CHROM POS ID REF
|
1001
|
+
my $out;
|
1002
|
+
$out .= $$record{CHROM} . "\t";
|
1003
|
+
$out .= $$record{POS} . "\t";
|
1004
|
+
$out .= (defined $$record{ID} ? $$record{ID} : '.') . "\t";
|
1005
|
+
$out .= $$record{REF} . "\t";
|
1006
|
+
|
1007
|
+
# ALT
|
1008
|
+
$out .= join(',',@{$$record{ALT}} ? @{$$record{ALT}} : '.');
|
1009
|
+
|
1010
|
+
# QUAL
|
1011
|
+
$out .= "\t". $$record{QUAL};
|
1012
|
+
|
1013
|
+
# FILTER
|
1014
|
+
$out .= "\t". join(';',$$record{FILTER} ? @{$$record{FILTER}} : '.');
|
1015
|
+
|
1016
|
+
# Collect the gtypes of interest
|
1017
|
+
my $gtypes;
|
1018
|
+
if ( $columns )
|
1019
|
+
{
|
1020
|
+
# Select only those gtypes keys with a corresponding key in columns.
|
1021
|
+
for my $col (@$columns) { $$gtypes{$col} = $$record{gtypes}{$col}; }
|
1022
|
+
}
|
1023
|
+
else
|
1024
|
+
{
|
1025
|
+
$gtypes = $$record{gtypes};
|
1026
|
+
}
|
1027
|
+
|
1028
|
+
# INFO
|
1029
|
+
# .. calculate NS, AN and AC, but only if recalc_ac_an is set
|
1030
|
+
my $needs_an_ac = $$self{recalc_ac_an}==2 ? 1 : 0;
|
1031
|
+
my @info;
|
1032
|
+
while (my ($key,$value) = each %{$$record{INFO}})
|
1033
|
+
{
|
1034
|
+
if ( $$self{recalc_ac_an}>0 )
|
1035
|
+
{
|
1036
|
+
if ( $key eq 'AN' ) { $needs_an_ac=1; next; }
|
1037
|
+
if ( $key eq 'AC' ) { $needs_an_ac=1; next; }
|
1038
|
+
}
|
1039
|
+
if ( defined $value )
|
1040
|
+
{
|
1041
|
+
push @info, "$key=$value";
|
1042
|
+
}
|
1043
|
+
elsif ( $key ne '.' )
|
1044
|
+
{
|
1045
|
+
push @info, $key;
|
1046
|
+
}
|
1047
|
+
}
|
1048
|
+
if ( $needs_an_ac )
|
1049
|
+
{
|
1050
|
+
my $nalt = scalar @{$$record{ALT}};
|
1051
|
+
if ( $nalt==1 && $$record{ALT}[0] eq '.' ) { $nalt=0; }
|
1052
|
+
my ($an,$ac) = $self->calc_an_ac($gtypes,$nalt);
|
1053
|
+
push @info, "AN=$an","AC=$ac";
|
1054
|
+
}
|
1055
|
+
if ( !@info ) { push @info, '.'; }
|
1056
|
+
$out .= "\t". join(';', sort @info);
|
1057
|
+
|
1058
|
+
# FORMAT, the column is not required, it may not be present when there are no genotypes
|
1059
|
+
if ( exists($$cols[8]) && defined $$record{FORMAT} )
|
1060
|
+
{
|
1061
|
+
$out .= "\t". join(':',@{$$record{FORMAT}});
|
1062
|
+
}
|
1063
|
+
|
1064
|
+
# Genotypes: output all columns or only a selection?
|
1065
|
+
my @col_names = $columns ? @$columns : @$cols[9..@$cols-1];
|
1066
|
+
my $nformat = defined $$record{FORMAT} ? @{$$record{FORMAT}} : 0;
|
1067
|
+
for my $col (@col_names)
|
1068
|
+
{
|
1069
|
+
my $gt = $$gtypes{$col};
|
1070
|
+
my $can_drop = $$self{drop_trailings};
|
1071
|
+
my @gtype;
|
1072
|
+
for (my $i=$nformat-1; $i>=0; $i--)
|
1073
|
+
{
|
1074
|
+
my $field = $$record{FORMAT}[$i];
|
1075
|
+
if ( $i==0 ) { $can_drop=0; }
|
1076
|
+
|
1077
|
+
if ( exists($$gt{$field}) )
|
1078
|
+
{
|
1079
|
+
$can_drop = 0;
|
1080
|
+
if ( ref($$gt{$field}) eq 'HASH' )
|
1081
|
+
{
|
1082
|
+
# Special treatment for Number=[AG] tags
|
1083
|
+
unshift @gtype, $self->format_AGtag($record,$$gt{$field},$field);
|
1084
|
+
}
|
1085
|
+
else
|
1086
|
+
{
|
1087
|
+
unshift @gtype,$$gt{$field};
|
1088
|
+
}
|
1089
|
+
}
|
1090
|
+
elsif ( $can_drop ) { next; }
|
1091
|
+
elsif ( exists($$self{header}{FORMAT}{$field}{default}) ) { unshift @gtype,$$self{header}{FORMAT}{$field}{default}; $can_drop=0; }
|
1092
|
+
else { $self->throw(qq[No value for the field "$field" and no default available, column "$col" at $$record{CHROM}:$$record{POS}.\n]); }
|
1093
|
+
}
|
1094
|
+
$out .= "\t" . join(':',@gtype);
|
1095
|
+
}
|
1096
|
+
|
1097
|
+
$out .= "\n";
|
1098
|
+
return $out;
|
1099
|
+
}
|
1100
|
+
|
1101
|
+
sub calc_an_ac
|
1102
|
+
{
|
1103
|
+
my ($self,$gtypes,$nalleles) = @_;
|
1104
|
+
my $sep_re = $$self{regex_gtsep};
|
1105
|
+
my ($an,%ac_counts);
|
1106
|
+
if ( defined $nalleles )
|
1107
|
+
{
|
1108
|
+
for (my $i=1; $i<=$nalleles; $i++) { $ac_counts{$i}=0; }
|
1109
|
+
}
|
1110
|
+
$an = 0;
|
1111
|
+
for my $gt (keys %$gtypes)
|
1112
|
+
{
|
1113
|
+
my $value = $$gtypes{$gt}{GT};
|
1114
|
+
if ( !defined $value ) { next; } # GT may not be present
|
1115
|
+
my ($al1,$al2) = split($sep_re,$value);
|
1116
|
+
if ( defined($al1) && $al1 ne '.' )
|
1117
|
+
{
|
1118
|
+
$an++;
|
1119
|
+
if ( $al1 ne '0' ) { $ac_counts{$al1}++; }
|
1120
|
+
}
|
1121
|
+
if ( defined($al2) && $al2 ne '.' )
|
1122
|
+
{
|
1123
|
+
$an++;
|
1124
|
+
if ( $al2 ne '0' ) { $ac_counts{$al2}++; }
|
1125
|
+
}
|
1126
|
+
}
|
1127
|
+
my @ac;
|
1128
|
+
for my $ac ( sort { $a <=> $b } keys %ac_counts) { push @ac, $ac_counts{$ac}; }
|
1129
|
+
if ( !@ac ) { @ac = ('0'); }
|
1130
|
+
return ($an,join(',',@ac),\@ac);
|
1131
|
+
}
|
1132
|
+
|
1133
|
+
sub _validate_alt_field
|
1134
|
+
{
|
1135
|
+
my ($self,$values,$ref) = @_;
|
1136
|
+
|
1137
|
+
for (my $i=0; $i<@$values; $i++)
|
1138
|
+
{
|
1139
|
+
for (my $j=0; $j<$i; $j++)
|
1140
|
+
{
|
1141
|
+
if ( $$values[$i] eq $$values[$j] ) { return "The alleles not unique: $$values[$i]"; }
|
1142
|
+
}
|
1143
|
+
if ( $$values[$i] eq $ref ) { return "REF allele listed in the ALT field??"; }
|
1144
|
+
}
|
1145
|
+
return undef;
|
1146
|
+
}
|
1147
|
+
|
1148
|
+
=head2 validate_alt_field
|
1149
|
+
|
1150
|
+
Usage : my $x = $vcf->next_data_hash(); $vcf->validate_alt_field($$x{ALT});
|
1151
|
+
Args : The ALT arrayref
|
1152
|
+
Returns : Error message in case of an error.
|
1153
|
+
|
1154
|
+
=cut
|
1155
|
+
|
1156
|
+
sub validate_alt_field
|
1157
|
+
{
|
1158
|
+
my ($self,$values,$ref) = @_;
|
1159
|
+
|
1160
|
+
if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
|
1161
|
+
|
1162
|
+
my $ret = $self->_validate_alt_field($values,$ref);
|
1163
|
+
if ( $ret ) { return $ret; }
|
1164
|
+
|
1165
|
+
my @err;
|
1166
|
+
for my $item (@$values)
|
1167
|
+
{
|
1168
|
+
if ( $item=~/^[ACTGN]$/ ) { next; }
|
1169
|
+
elsif ( $item=~/^I[ACTGN]+$/ ) { next; }
|
1170
|
+
elsif ( $item=~/^D\d+$/ ) { next; }
|
1171
|
+
|
1172
|
+
push @err, $item;
|
1173
|
+
}
|
1174
|
+
if ( !@err ) { return undef; }
|
1175
|
+
return 'Could not parse the allele(s) [' .join(',',@err). ']';
|
1176
|
+
}
|
1177
|
+
|
1178
|
+
=head2 event_type
|
1179
|
+
|
1180
|
+
Usage : my $x = $vcf->next_data_hash();
|
1181
|
+
my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
|
1182
|
+
for my $allele (@$alleles)
|
1183
|
+
{
|
1184
|
+
my ($type,$len,$ht) = $vcf->event_type($x,$allele);
|
1185
|
+
}
|
1186
|
+
or
|
1187
|
+
my ($type,$len,$ht) = $vcf->event_type($ref,$al);
|
1188
|
+
Args : VCF data line parsed by next_data_hash or the reference allele
|
1189
|
+
: Allele
|
1190
|
+
Returns : 's' for SNP and number of SNPs in the record
|
1191
|
+
'i' for indel and a positive (resp. negative) number for the length of insertion (resp. deletion)
|
1192
|
+
'r' identical to the reference, length 0
|
1193
|
+
'o' for other (complex events) and the number of affected bases
|
1194
|
+
'b' breakend
|
1195
|
+
'u' unknown
|
1196
|
+
|
1197
|
+
=cut
|
1198
|
+
|
1199
|
+
sub event_type
|
1200
|
+
{
|
1201
|
+
my ($self,$rec,$allele) = @_;
|
1202
|
+
|
1203
|
+
my $ref = $rec;
|
1204
|
+
if ( ref($rec) eq 'HASH' )
|
1205
|
+
{
|
1206
|
+
if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
|
1207
|
+
$ref = $$rec{REF};
|
1208
|
+
}
|
1209
|
+
|
1210
|
+
my ($type,$len,$ht);
|
1211
|
+
if ( $allele eq $ref or $allele eq '.' ) { $len=0; $type='r'; $ht=$ref; }
|
1212
|
+
elsif ( $allele=~/^[ACGT]$/ ) { $len=1; $type='s'; $ht=$allele; }
|
1213
|
+
elsif ( $allele=~/^I/ ) { $len=length($allele)-1; $type='i'; $ht=$'; }
|
1214
|
+
elsif ( $allele=~/^D(\d+)/ ) { $len=-$1; $type='i'; $ht=''; }
|
1215
|
+
else
|
1216
|
+
{
|
1217
|
+
my $chr = ref($rec) eq 'HASH' ? $$rec{CHROM} : 'undef';
|
1218
|
+
my $pos = ref($rec) eq 'HASH' ? $$rec{POS} : 'undef';
|
1219
|
+
$self->throw("Eh?: $chr:$pos .. $ref $allele\n");
|
1220
|
+
}
|
1221
|
+
|
1222
|
+
if ( ref($rec) eq 'HASH' )
|
1223
|
+
{
|
1224
|
+
$$rec{_cached_events}{$allele} = [$type,$len,$ht];
|
1225
|
+
}
|
1226
|
+
return ($type,$len,$ht);
|
1227
|
+
}
|
1228
|
+
|
1229
|
+
|
1230
|
+
=head2 parse_AGtags
|
1231
|
+
|
1232
|
+
About : Breaks tags with variable number of fields (that is where Number is set to 'A' or 'G', such as GL) into hashes
|
1233
|
+
Usage : my $x = $vcf->next_data_hash(); my $values = $vcf->parse_AGtags($x);
|
1234
|
+
Args : VCF data line parsed by next_data_hash
|
1235
|
+
: Mapping between ALT representations based on different REFs [optional]
|
1236
|
+
: New REF [optional]
|
1237
|
+
Returns : Hash {Allele=>Value}
|
1238
|
+
|
1239
|
+
=cut
|
1240
|
+
|
1241
|
+
sub parse_AGtags
|
1242
|
+
{
|
1243
|
+
my ($self,$rec,$ref_alt_map,$new_ref) = @_;
|
1244
|
+
|
1245
|
+
if ( !exists($$rec{gtypes}) ) { return; }
|
1246
|
+
|
1247
|
+
my (@atags,@gtags);
|
1248
|
+
for my $fmt (@{$$rec{FORMAT}})
|
1249
|
+
{
|
1250
|
+
# These have been listed explicitly for proper merging of v4.0 VCFs
|
1251
|
+
if ( $$self{fix_v40_AGtags} )
|
1252
|
+
{
|
1253
|
+
if ( $fmt eq 'GL' or $fmt eq 'PL' ) { push @gtags,$fmt; next; }
|
1254
|
+
if ( $fmt eq 'AC' or $fmt eq 'AF' ) { push @atags,$fmt; next; }
|
1255
|
+
}
|
1256
|
+
if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; }
|
1257
|
+
if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A' ) { push @atags,$fmt; next; }
|
1258
|
+
if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { push @gtags,$fmt; next; }
|
1259
|
+
}
|
1260
|
+
my $missing = $$self{defaults}{default};
|
1261
|
+
if ( @atags )
|
1262
|
+
{
|
1263
|
+
# Parse Number=A tags
|
1264
|
+
my $alts;
|
1265
|
+
if ( defined $ref_alt_map )
|
1266
|
+
{
|
1267
|
+
$alts = [];
|
1268
|
+
for my $alt (@{$$rec{ALT}})
|
1269
|
+
{
|
1270
|
+
if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: $new_ref $alt...?\n"); }
|
1271
|
+
push @$alts, $$ref_alt_map{$new_ref}{$alt};
|
1272
|
+
}
|
1273
|
+
}
|
1274
|
+
else
|
1275
|
+
{
|
1276
|
+
$alts = $$rec{ALT};
|
1277
|
+
}
|
1278
|
+
for my $tag (@atags)
|
1279
|
+
{
|
1280
|
+
for my $sample (values %{$$rec{gtypes}})
|
1281
|
+
{
|
1282
|
+
if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; }
|
1283
|
+
my @values = split(/,/,$$sample{$tag});
|
1284
|
+
$$sample{$tag} = {};
|
1285
|
+
for (my $i=0; $i<@values; $i++)
|
1286
|
+
{
|
1287
|
+
$$sample{$tag}{$$alts[$i]} = $values[$i];
|
1288
|
+
}
|
1289
|
+
}
|
1290
|
+
}
|
1291
|
+
}
|
1292
|
+
if ( @gtags )
|
1293
|
+
{
|
1294
|
+
# Parse Number=G tags
|
1295
|
+
my @alleles;
|
1296
|
+
if ( defined $ref_alt_map )
|
1297
|
+
{
|
1298
|
+
push @alleles, $new_ref;
|
1299
|
+
for my $alt (@{$$rec{ALT}})
|
1300
|
+
{
|
1301
|
+
if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: [$new_ref] [$alt]...?\n", Dumper($ref_alt_map,$rec)); }
|
1302
|
+
push @alleles, $$ref_alt_map{$new_ref}{$alt};
|
1303
|
+
}
|
1304
|
+
}
|
1305
|
+
else
|
1306
|
+
{
|
1307
|
+
@alleles = ($$rec{REF},@{$$rec{ALT}});
|
1308
|
+
}
|
1309
|
+
my @gtypes;
|
1310
|
+
for (my $i=0; $i<@alleles; $i++)
|
1311
|
+
{
|
1312
|
+
for (my $j=0; $j<=$i; $j++)
|
1313
|
+
{
|
1314
|
+
push @gtypes, $alleles[$i].'/'.$alleles[$j];
|
1315
|
+
}
|
1316
|
+
}
|
1317
|
+
for my $tag (@gtags)
|
1318
|
+
{
|
1319
|
+
for my $sample (values %{$$rec{gtypes}})
|
1320
|
+
{
|
1321
|
+
if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; }
|
1322
|
+
my @values = split(/,/,$$sample{$tag});
|
1323
|
+
$$sample{$tag} = {};
|
1324
|
+
for (my $i=0; $i<@values; $i++)
|
1325
|
+
{
|
1326
|
+
$$sample{$tag}{$gtypes[$i]} = $values[$i];
|
1327
|
+
}
|
1328
|
+
}
|
1329
|
+
}
|
1330
|
+
}
|
1331
|
+
}
|
1332
|
+
|
1333
|
+
=head2 format_AGtag
|
1334
|
+
|
1335
|
+
About : Format tag with variable number of fields (that is where Number is set to 'A' or 'G', such as GL)
|
1336
|
+
Usage :
|
1337
|
+
Args :
|
1338
|
+
:
|
1339
|
+
:
|
1340
|
+
Returns :
|
1341
|
+
|
1342
|
+
=cut
|
1343
|
+
|
1344
|
+
sub format_AGtag
|
1345
|
+
{
|
1346
|
+
my ($self,$record,$tag_data,$tag) = @_;
|
1347
|
+
|
1348
|
+
# The FORMAT field is checked only once and the results are cached.
|
1349
|
+
if ( !exists($$record{_atags}) )
|
1350
|
+
{
|
1351
|
+
$$record{_atags} = {};
|
1352
|
+
|
1353
|
+
# Check if there are any A,G tags
|
1354
|
+
for my $fmt (@{$$record{FORMAT}})
|
1355
|
+
{
|
1356
|
+
# These have been listed explicitly for proper merging of v4.0 VCFs
|
1357
|
+
if ( $$self{fix_v40_AGtags} )
|
1358
|
+
{
|
1359
|
+
if ( $fmt eq 'GL' or $fmt eq 'PL' ) { $$record{_gtags}{$fmt}=1; next; }
|
1360
|
+
if ( $fmt eq 'AC' or $fmt eq 'AF' ) { $$record{_atags}{$fmt}=1; next; }
|
1361
|
+
}
|
1362
|
+
if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; }
|
1363
|
+
if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A' ) { $$record{_atags}{$fmt}=1; next; }
|
1364
|
+
if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { $$record{_gtags}{$fmt}=1; next; }
|
1365
|
+
}
|
1366
|
+
}
|
1367
|
+
|
1368
|
+
my @out;
|
1369
|
+
if ( exists($$record{_atags}{$tag}) )
|
1370
|
+
{
|
1371
|
+
for my $alt (@{$$record{ALT}})
|
1372
|
+
{
|
1373
|
+
push @out, exists($$tag_data{$alt}) ? $$tag_data{$alt} : $$self{defaults}{default};
|
1374
|
+
}
|
1375
|
+
}
|
1376
|
+
|
1377
|
+
if ( exists($$record{_gtags}{$tag}) )
|
1378
|
+
{
|
1379
|
+
my $gtypes = $$record{_gtypes};
|
1380
|
+
my $gtypes2 = $$record{_gtypes2};
|
1381
|
+
if ( !defined $gtypes )
|
1382
|
+
{
|
1383
|
+
$gtypes = [];
|
1384
|
+
$gtypes2 = [];
|
1385
|
+
|
1386
|
+
my @alleles = ( $$record{REF}, @{$$record{ALT}} );
|
1387
|
+
for (my $i=0; $i<@alleles; $i++)
|
1388
|
+
{
|
1389
|
+
for (my $j=0; $j<=$i; $j++)
|
1390
|
+
{
|
1391
|
+
push @$gtypes, $alleles[$i].'/'.$alleles[$j];
|
1392
|
+
push @$gtypes2, $alleles[$j].'/'.$alleles[$i];
|
1393
|
+
}
|
1394
|
+
}
|
1395
|
+
|
1396
|
+
$$record{_gtypes} = $gtypes;
|
1397
|
+
$$record{_gtypes2} = $gtypes2;
|
1398
|
+
}
|
1399
|
+
|
1400
|
+
for (my $i=0; $i<@$gtypes; $i++)
|
1401
|
+
{
|
1402
|
+
my $gt = $$gtypes[$i];
|
1403
|
+
if ( !exists($$tag_data{$gt}) ) { $gt = $$gtypes2[$i]; }
|
1404
|
+
push @out, exists($$tag_data{$gt}) ? $$tag_data{$gt} : $$self{defaults}{default};
|
1405
|
+
}
|
1406
|
+
}
|
1407
|
+
|
1408
|
+
return join(',',@out);
|
1409
|
+
}
|
1410
|
+
|
1411
|
+
=head2 parse_alleles
|
1412
|
+
|
1413
|
+
About : Deprecated, use parse_haplotype instead.
|
1414
|
+
Usage : my $x = $vcf->next_data_hash(); my ($al1,$sep,$al2) = $vcf->parse_alleles($x,'NA00001');
|
1415
|
+
Args : VCF data line parsed by next_data_hash
|
1416
|
+
: The genotype column name
|
1417
|
+
Returns : Alleles and the separator. If only one allele is present, $sep and $al2 will be an empty string.
|
1418
|
+
|
1419
|
+
=cut
|
1420
|
+
|
1421
|
+
sub parse_alleles
|
1422
|
+
{
|
1423
|
+
my ($self,$rec,$column) = @_;
|
1424
|
+
if ( !exists($$rec{gtypes}) || !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); }
|
1425
|
+
|
1426
|
+
my $gtype = $$rec{gtypes}{$column}{GT};
|
1427
|
+
if ( !($gtype=~$$self{regex_gt}) ) { $self->throw("Could not parse gtype string [$gtype] [$$rec{CHROM}:$$rec{POS}]\n"); }
|
1428
|
+
my $al1 = $1;
|
1429
|
+
my $sep = $2;
|
1430
|
+
my $al2 = $3;
|
1431
|
+
|
1432
|
+
if ( !$al1 ) { $al1 = $$rec{REF}; }
|
1433
|
+
elsif ( $al1 ne '.' )
|
1434
|
+
{
|
1435
|
+
if ( !($al1=~/^\d+$/) ) { $self->throw("Uh, what is this? [$al1] $$rec{CHROM}:$$rec{POS}\n"); }
|
1436
|
+
$al1 = $$rec{ALT}[$al1-1];
|
1437
|
+
}
|
1438
|
+
|
1439
|
+
if ( !defined $al2 or $al2 eq '' )
|
1440
|
+
{
|
1441
|
+
$sep = '';
|
1442
|
+
$al2 = '';
|
1443
|
+
}
|
1444
|
+
else
|
1445
|
+
{
|
1446
|
+
if ( !$al2 ) { $al2 = $$rec{REF}; }
|
1447
|
+
elsif ( $al2 ne '.' ) { $al2 = $$rec{ALT}[$al2-1]; }
|
1448
|
+
}
|
1449
|
+
return ($al1,$sep,$al2);
|
1450
|
+
}
|
1451
|
+
|
1452
|
+
=head2 parse_haplotype
|
1453
|
+
|
1454
|
+
About : Similar to parse_alleles, supports also multiploid VCFs.
|
1455
|
+
Usage : my $x = $vcf->next_data_hash(); my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
|
1456
|
+
Args : VCF data line parsed by next_data_hash
|
1457
|
+
: The genotype column name
|
1458
|
+
Returns : Two array refs and two boolean flags: List of alleles, list of separators, and is_phased/empty flags. The values
|
1459
|
+
can be cashed and must be therefore considered read only!
|
1460
|
+
|
1461
|
+
=cut
|
1462
|
+
|
1463
|
+
sub parse_haplotype
|
1464
|
+
{
|
1465
|
+
my ($self,$rec,$column) = @_;
|
1466
|
+
if ( !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); }
|
1467
|
+
if ( !exists($$rec{gtypes}{$column}{GT}) ) { return (['.'],[],0,1); }
|
1468
|
+
|
1469
|
+
my $gtype = $$rec{gtypes}{$column}{GT};
|
1470
|
+
if ( exists($$rec{_cached_haplotypes}{$gtype}) ) { return (@{$$rec{_cached_haplotypes}{$gtype}}); }
|
1471
|
+
|
1472
|
+
my @alleles = ();
|
1473
|
+
my @seps = ();
|
1474
|
+
my $is_phased = 0;
|
1475
|
+
my $is_empty = 1;
|
1476
|
+
|
1477
|
+
my $buf = $gtype;
|
1478
|
+
while ($buf ne '')
|
1479
|
+
{
|
1480
|
+
if ( !($buf=~m{^(\.|\d+)([|/]?)}) ) { $self->throw("Could not parse gtype string [$gtype] .. $$rec{CHROM}:$$rec{POS} $column\n"); }
|
1481
|
+
$buf = $';
|
1482
|
+
|
1483
|
+
if ( $1 eq '.' ) { push @alleles,'.'; }
|
1484
|
+
else
|
1485
|
+
{
|
1486
|
+
$is_empty = 0;
|
1487
|
+
if ( $1 eq '0' ) { push @alleles,$$rec{REF}; }
|
1488
|
+
elsif ( exists($$rec{ALT}[$1-1]) ) { push @alleles,$$rec{ALT}[$1-1]; }
|
1489
|
+
else { $self->throw(qq[The haplotype indexes in "$gtype" do not match the ALT column .. $$rec{CHROM}:$$rec{POS} $column\n]); }
|
1490
|
+
}
|
1491
|
+
if ( $2 )
|
1492
|
+
{
|
1493
|
+
if ( $2 eq '|' ) { $is_phased=1; }
|
1494
|
+
push @seps,$2;
|
1495
|
+
}
|
1496
|
+
}
|
1497
|
+
$$rec{_cached_haplotypes}{$gtype} = [\@alleles,\@seps,$is_phased,$is_empty];
|
1498
|
+
return (@{$$rec{_cached_haplotypes}{$gtype}});
|
1499
|
+
}
|
1500
|
+
|
1501
|
+
=head2 format_haplotype
|
1502
|
+
|
1503
|
+
Usage : my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001'); print $vcf->format_haplotype($alleles,$seps);
|
1504
|
+
|
1505
|
+
=cut
|
1506
|
+
|
1507
|
+
sub format_haplotype
|
1508
|
+
{
|
1509
|
+
my ($self,$alleles,$seps) = @_;
|
1510
|
+
if ( @$alleles != @$seps+1 ) { $self->throw(sprintf("Uh: %d vs %d\n",scalar @$alleles,scalar @$seps),Dumper($alleles,$seps)); }
|
1511
|
+
my $out = $$alleles[0];
|
1512
|
+
for (my $i=1; $i<@$alleles; $i++)
|
1513
|
+
{
|
1514
|
+
$out .= $$seps[$i-1];
|
1515
|
+
$out .= $$alleles[$i];
|
1516
|
+
}
|
1517
|
+
return $out;
|
1518
|
+
}
|
1519
|
+
|
1520
|
+
|
1521
|
+
=head2 format_genotype_strings
|
1522
|
+
|
1523
|
+
Usage : my $x = { REF=>'A', gtypes=>{'NA00001'=>{'GT'=>'A/C'}}, FORMAT=>['GT'], CHROM=>1, POS=>1, FILTER=>['.'], QUAL=>-1 };
|
1524
|
+
$vcf->format_genotype_strings($x);
|
1525
|
+
print $vcf->format_line($x);
|
1526
|
+
Args 1 : VCF data line in the format as if parsed by next_data_hash with alleles written as letters.
|
1527
|
+
2 : Optionally, a subset of columns can be supplied. See also format_line.
|
1528
|
+
Returns : Modifies the ALT array and the genotypes so that ref alleles become 0 and non-ref alleles
|
1529
|
+
numbers starting from 1.
|
1530
|
+
|
1531
|
+
=cut
|
1532
|
+
|
1533
|
+
sub format_genotype_strings
|
1534
|
+
{
|
1535
|
+
my ($self,$rec,$columns) = @_;
|
1536
|
+
|
1537
|
+
if ( !exists($$rec{gtypes}) ) { return; }
|
1538
|
+
|
1539
|
+
my $ref = $$rec{REF};
|
1540
|
+
my $nalts = 0;
|
1541
|
+
my %alts = ();
|
1542
|
+
my $gt_re = $$self{regex_gt2};
|
1543
|
+
|
1544
|
+
if ( !$columns ) { $columns = [keys %{$$rec{gtypes}}]; }
|
1545
|
+
|
1546
|
+
for my $key (@$columns)
|
1547
|
+
{
|
1548
|
+
my $gtype = $$rec{gtypes}{$key}{GT};
|
1549
|
+
my $buf = $gtype;
|
1550
|
+
my $out = '';
|
1551
|
+
while ($buf ne '')
|
1552
|
+
{
|
1553
|
+
if ( !($buf=~$gt_re) ) { $self->throw("Could not parse gtype string [$gtype]\n"); }
|
1554
|
+
$buf = $';
|
1555
|
+
|
1556
|
+
my $al = $1;
|
1557
|
+
my $sep = $2;
|
1558
|
+
if ( $al eq $ref or $al eq '0' or $al eq '*' ) { $al=0; }
|
1559
|
+
else
|
1560
|
+
{
|
1561
|
+
if ( $al=~/^\d+$/ )
|
1562
|
+
{
|
1563
|
+
if ( !exists($$rec{ALT}[$al-1]) ) { $self->throw("Broken ALT, index $al out of bounds\n"); }
|
1564
|
+
$al = $$rec{ALT}[$al-1];
|
1565
|
+
}
|
1566
|
+
|
1567
|
+
if ( exists($alts{$al}) ) { $al = $alts{$al} }
|
1568
|
+
elsif ( $al=~$$self{regex_snp} or $al=~$$self{regex_ins} or $al=~$$self{regex_del} )
|
1569
|
+
{
|
1570
|
+
$alts{$al} = ++$nalts;
|
1571
|
+
$al = $nalts;
|
1572
|
+
}
|
1573
|
+
elsif ( $al ne '.' )
|
1574
|
+
{
|
1575
|
+
$self->throw("Could not parse the genotype string [$gtype]\n");
|
1576
|
+
}
|
1577
|
+
|
1578
|
+
}
|
1579
|
+
$out .= $al;
|
1580
|
+
if ( $sep ) { $out .= $sep; }
|
1581
|
+
}
|
1582
|
+
$$rec{gtypes}{$key}{GT} = $out;
|
1583
|
+
}
|
1584
|
+
$$rec{ALT} = [ sort { $alts{$a}<=>$alts{$b} } keys %alts ];
|
1585
|
+
}
|
1586
|
+
|
1587
|
+
sub fill_ref_alt_mapping
|
1588
|
+
{
|
1589
|
+
my ($self,$map) = @_;
|
1590
|
+
|
1591
|
+
my $new_ref;
|
1592
|
+
for my $ref (keys %$map)
|
1593
|
+
{
|
1594
|
+
$new_ref = $ref;
|
1595
|
+
if ( $ref ne $new_ref ) { $self->throw("The reference prefixes do not agree: $ref vs $new_ref\n"); }
|
1596
|
+
for my $alt (keys %{$$map{$ref}})
|
1597
|
+
{
|
1598
|
+
$$map{$ref}{$alt} = $alt;
|
1599
|
+
}
|
1600
|
+
}
|
1601
|
+
$$map{$new_ref}{$new_ref} = $new_ref;
|
1602
|
+
return $new_ref;
|
1603
|
+
}
|
1604
|
+
|
1605
|
+
|
1606
|
+
=head2 format_header_line
|
1607
|
+
|
1608
|
+
Usage : $vcf->format_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
|
1609
|
+
Args :
|
1610
|
+
Returns :
|
1611
|
+
|
1612
|
+
=cut
|
1613
|
+
|
1614
|
+
sub format_header_line
|
1615
|
+
{
|
1616
|
+
my ($self,$rec) = @_;
|
1617
|
+
my $line = "##$$rec{key}";
|
1618
|
+
$line .= "=$$rec{value}" unless !exists($$rec{value});
|
1619
|
+
$line .= "=$$rec{ID}" unless !exists($$rec{ID});
|
1620
|
+
$line .= ",$$rec{Number}" unless !exists($$rec{Number});
|
1621
|
+
$line .= ",$$rec{Type}" unless !exists($$rec{Type});
|
1622
|
+
$line .= qq[,"$$rec{Description}"] unless !exists($$rec{Description});
|
1623
|
+
$line .= "\n";
|
1624
|
+
return $line;
|
1625
|
+
}
|
1626
|
+
|
1627
|
+
=head2 add_columns
|
1628
|
+
|
1629
|
+
Usage : $vcf->add_columns('NA001','NA0002');
|
1630
|
+
Args :
|
1631
|
+
Returns :
|
1632
|
+
|
1633
|
+
=cut
|
1634
|
+
|
1635
|
+
sub add_columns
|
1636
|
+
{
|
1637
|
+
my ($self,@columns) = @_;
|
1638
|
+
if ( !$$self{columns} )
|
1639
|
+
{
|
1640
|
+
# The columns should be initialized de novo. Figure out if the @columns contain also the mandatory
|
1641
|
+
# columns and if FORMAT should be present (it can be absent when there is no genotype column present).
|
1642
|
+
my $has_other = 0;
|
1643
|
+
for my $col (@columns)
|
1644
|
+
{
|
1645
|
+
if ( !exists($$self{reserved}{cols}{$col}) ) { $has_other=1; last; }
|
1646
|
+
}
|
1647
|
+
|
1648
|
+
$$self{columns} = [ @{$$self{mandatory}} ];
|
1649
|
+
if ( $has_other ) { push @{$$self{columns}},'FORMAT'; }
|
1650
|
+
|
1651
|
+
for my $col (@{$$self{columns}}) { $$self{has_column}{$col}=1; }
|
1652
|
+
}
|
1653
|
+
my $ncols = @{$$self{columns}};
|
1654
|
+
for my $col (@columns)
|
1655
|
+
{
|
1656
|
+
if ( $$self{has_column}{$col} ) { next; }
|
1657
|
+
$ncols++;
|
1658
|
+
push @{$$self{columns}}, $col;
|
1659
|
+
}
|
1660
|
+
}
|
1661
|
+
|
1662
|
+
=head2 add_format_field
|
1663
|
+
|
1664
|
+
Usage : $x=$vcf->next_data_hash(); $vcf->add_format_field($x,'FOO'); $$x{gtypes}{NA0001}{FOO}='Bar'; print $vcf->format_line($x);
|
1665
|
+
Args : The record obtained by next_data_hash
|
1666
|
+
: The field name
|
1667
|
+
Returns :
|
1668
|
+
|
1669
|
+
=cut
|
1670
|
+
|
1671
|
+
sub add_format_field
|
1672
|
+
{
|
1673
|
+
my ($self,$rec,$field) = @_;
|
1674
|
+
|
1675
|
+
if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; }
|
1676
|
+
|
1677
|
+
for my $key (@{$$rec{FORMAT}})
|
1678
|
+
{
|
1679
|
+
if ( $key eq $field ) { return; } # already there
|
1680
|
+
}
|
1681
|
+
push @{$$rec{FORMAT}}, $field;
|
1682
|
+
}
|
1683
|
+
|
1684
|
+
|
1685
|
+
=head2 remove_format_field
|
1686
|
+
|
1687
|
+
Usage : $x=$vcf->next_data_hash(); $vcf->remove_format_field($x,'FOO'); print $vcf->format_line($x);
|
1688
|
+
Args : The record obtained by next_data_hash
|
1689
|
+
: The field name
|
1690
|
+
Returns :
|
1691
|
+
|
1692
|
+
=cut
|
1693
|
+
|
1694
|
+
sub remove_format_field
|
1695
|
+
{
|
1696
|
+
my ($self,$rec,$field) = @_;
|
1697
|
+
|
1698
|
+
if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; }
|
1699
|
+
|
1700
|
+
my $i = 0;
|
1701
|
+
for my $key (@{$$rec{FORMAT}})
|
1702
|
+
{
|
1703
|
+
if ( $key eq $field ) { splice @{$$rec{FORMAT}},$i,1; }
|
1704
|
+
$i++;
|
1705
|
+
}
|
1706
|
+
}
|
1707
|
+
|
1708
|
+
|
1709
|
+
=head2 add_info_field
|
1710
|
+
|
1711
|
+
Usage : $x=$vcf->next_data_array(); $$x[7]=$vcf->add_info_field($$x[7],'FOO'=>'value','BAR'=>undef,'BAZ'=>''); print join("\t",@$x)."\n";
|
1712
|
+
Args : The record obtained by next_data_array
|
1713
|
+
: The INFO field name and value pairs. If value is undef and the key is present in $$x[7],
|
1714
|
+
it will be removed. To add fields without a value, use empty string ''.
|
1715
|
+
Returns : The formatted INFO.
|
1716
|
+
|
1717
|
+
=cut
|
1718
|
+
|
1719
|
+
sub add_info_field
|
1720
|
+
{
|
1721
|
+
my ($self,$info,%fields) = @_;
|
1722
|
+
|
1723
|
+
my @out = ();
|
1724
|
+
|
1725
|
+
# First handle the existing values, keep everything unless in %fields
|
1726
|
+
for my $field (split(/;/,$info))
|
1727
|
+
{
|
1728
|
+
my ($key,$value) = split(/=/,$field);
|
1729
|
+
if ( $key eq '.' ) { next; }
|
1730
|
+
if ( !exists($fields{$key}) ) { push @out,$field; next; }
|
1731
|
+
}
|
1732
|
+
|
1733
|
+
# Now add the new values and remove the unwanted ones
|
1734
|
+
while (my ($key,$value)=each %fields)
|
1735
|
+
{
|
1736
|
+
if ( !defined($value) ) { next; } # this one should be removed
|
1737
|
+
if ( $value eq '' ) { push @out,$key; } # this one is of the form HM2 in contrast to DP=3
|
1738
|
+
else { push @out,"$key=$value"; } # this is the standard key=value pair
|
1739
|
+
}
|
1740
|
+
if ( !@out ) { push @out,'.'; }
|
1741
|
+
return join(';',@out);
|
1742
|
+
}
|
1743
|
+
|
1744
|
+
|
1745
|
+
=head2 add_filter
|
1746
|
+
|
1747
|
+
Usage : $x=$vcf->next_data_array(); $$x[6]=$vcf->add_filter($$x[6],'SnpCluster'=>1,'q10'=>0); print join("\t",@$x)."\n";
|
1748
|
+
Args : The record obtained by next_data_array or next_data_hash
|
1749
|
+
: The key-value pairs for filter to be added. If value is 1, the filter will be added. If 0, the filter will be removed.
|
1750
|
+
Returns : The formatted filter field.
|
1751
|
+
|
1752
|
+
=cut
|
1753
|
+
|
1754
|
+
sub add_filter
|
1755
|
+
{
|
1756
|
+
my ($self,$filter,%filters) = @_;
|
1757
|
+
|
1758
|
+
my @out = ();
|
1759
|
+
my @filters = ref($filter) eq 'ARRAY' ? @$filter : split(/;/,$filter);
|
1760
|
+
|
1761
|
+
# First handle the existing filters, keep everything unless in %filters
|
1762
|
+
for my $key (@filters)
|
1763
|
+
{
|
1764
|
+
if ( $key eq '.' or $key eq 'PASS' ) { next; }
|
1765
|
+
if ( !exists($filters{$key}) ) { push @out,$key; next; }
|
1766
|
+
}
|
1767
|
+
|
1768
|
+
# Now add the new filters and remove the unwanted ones
|
1769
|
+
while (my ($key,$value)=each %filters)
|
1770
|
+
{
|
1771
|
+
if ( !$value ) { next; } # this one should be removed
|
1772
|
+
push @out,$key; # this one should be added
|
1773
|
+
}
|
1774
|
+
if ( !@out ) { push @out,'PASS'; }
|
1775
|
+
return ref($filter) eq 'ARRAY' ? return \@out : join(';',@out);
|
1776
|
+
}
|
1777
|
+
|
1778
|
+
|
1779
|
+
=head2 validate_filter_field
|
1780
|
+
|
1781
|
+
Usage : my $x = $vcf->next_data_hash(); $vcf->validate_filter_field($$x{FILTER});
|
1782
|
+
Args : The FILTER arrayref
|
1783
|
+
Returns : Error message in case of an error.
|
1784
|
+
|
1785
|
+
=cut
|
1786
|
+
|
1787
|
+
sub validate_filter_field
|
1788
|
+
{
|
1789
|
+
my ($self,$values) = @_;
|
1790
|
+
|
1791
|
+
if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
|
1792
|
+
|
1793
|
+
my @errs;
|
1794
|
+
my @missing;
|
1795
|
+
for my $item (@$values)
|
1796
|
+
{
|
1797
|
+
if ( $item eq $$self{filter_passed} ) { next; }
|
1798
|
+
if ( $item=~/,/ ) { push @errs,"Expected semicolon as a separator."; }
|
1799
|
+
if ( exists($$self{reserved}{FILTER}{$item}) ) { return qq[The filter name "$item" cannot be used, it is a reserved word.]; }
|
1800
|
+
if ( exists($$self{header}{FILTER}{$item}) ) { next; }
|
1801
|
+
push @missing, $item;
|
1802
|
+
$self->add_header_line({key=>'FILTER',ID=>$item,Description=>'No description'});
|
1803
|
+
}
|
1804
|
+
if ( !@errs && !@missing ) { return undef; }
|
1805
|
+
if ( $$self{version}<3.3 ) { return undef; }
|
1806
|
+
return join(',',@errs) .' '. 'The filter(s) [' . join(',',@missing) . '] not listed in the header.';
|
1807
|
+
}
|
1808
|
+
|
1809
|
+
|
1810
|
+
sub _add_unknown_field
|
1811
|
+
{
|
1812
|
+
my ($self,$field,$key,$nargs) = @_;
|
1813
|
+
$self->add_header_line({key=>$field,ID=>$key,Number=>$nargs,Type=>'String',Description=>'No description'});
|
1814
|
+
}
|
1815
|
+
|
1816
|
+
=head2 validate_header
|
1817
|
+
|
1818
|
+
About : Version specific header validation code.
|
1819
|
+
Usage : my $vcf = Vcf->new(); $vcf->parse_header(); $vcf->validate_header();
|
1820
|
+
Args :
|
1821
|
+
|
1822
|
+
=cut
|
1823
|
+
|
1824
|
+
sub validate_header
|
1825
|
+
{
|
1826
|
+
my ($self) = @_;
|
1827
|
+
}
|
1828
|
+
|
1829
|
+
=head2 validate_line
|
1830
|
+
|
1831
|
+
About : Version specific line validation code.
|
1832
|
+
Usage : my $vcf = Vcf->new(); $vcf->parse_header(); $x = $vcf->next_data_hash; $vcf->validate_line($x);
|
1833
|
+
Args :
|
1834
|
+
|
1835
|
+
=cut
|
1836
|
+
|
1837
|
+
sub validate_line
|
1838
|
+
{
|
1839
|
+
my ($self,$x) = @_;
|
1840
|
+
|
1841
|
+
# Is the ID composed of alphanumeric chars
|
1842
|
+
if ( !($$x{ID}=~/^[\w;\.]+$/) ) { $self->warn("Expected alphanumeric ID at $$x{CHROM}:$$x{POS}, but got [$$x{ID}]\n"); }
|
1843
|
+
}
|
1844
|
+
|
1845
|
+
=head2 validate_info_field
|
1846
|
+
|
1847
|
+
Usage : my $x = $vcf->next_data_hash(); $vcf->validate_info_field($$x{INFO},$$x{ALT});
|
1848
|
+
Args : The INFO hashref
|
1849
|
+
Returns : Error message in case of an error.
|
1850
|
+
|
1851
|
+
=cut
|
1852
|
+
|
1853
|
+
sub validate_info_field
|
1854
|
+
{
|
1855
|
+
my ($self,$values,$alts) = @_;
|
1856
|
+
|
1857
|
+
if ( !defined $values ) { return 'Empty INFO field.'; }
|
1858
|
+
|
1859
|
+
# First handle the empty INFO field (.)
|
1860
|
+
if ( scalar keys %$values == 1 && exists($$values{'.'}) ) { return undef; }
|
1861
|
+
|
1862
|
+
# Expected numbers
|
1863
|
+
my $ng = -1;
|
1864
|
+
my $na = -1;
|
1865
|
+
if ( $$self{version}>4.0 )
|
1866
|
+
{
|
1867
|
+
if ( $$alts[0] eq '.' ) { $ng=1; $na=1; }
|
1868
|
+
else
|
1869
|
+
{
|
1870
|
+
$na = @$alts;
|
1871
|
+
$ng = (1+$na+1)*($na+1)/2;
|
1872
|
+
}
|
1873
|
+
}
|
1874
|
+
|
1875
|
+
my @errs;
|
1876
|
+
while (my ($key,$value) = each %$values)
|
1877
|
+
{
|
1878
|
+
if ( !exists($$self{header}{INFO}{$key}) )
|
1879
|
+
{
|
1880
|
+
push @errs, "INFO tag [$key] not listed in the header" unless $$self{version}<3.3;
|
1881
|
+
my $nargs = defined $value ? -1 : 0;
|
1882
|
+
$self->_add_unknown_field('INFO',$key,$nargs);
|
1883
|
+
next;
|
1884
|
+
}
|
1885
|
+
my $type = $$self{header}{INFO}{$key};
|
1886
|
+
|
1887
|
+
my @vals = defined $value ? split(/,/, $value) : ();
|
1888
|
+
if ( $$type{Number} eq 'G' )
|
1889
|
+
{
|
1890
|
+
if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $ng, found ".scalar @vals.")"; }
|
1891
|
+
}
|
1892
|
+
elsif ( $$type{Number} eq 'A' )
|
1893
|
+
{
|
1894
|
+
if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $na, found ".scalar @vals.")"; }
|
1895
|
+
}
|
1896
|
+
elsif ( $$type{Number}==0 )
|
1897
|
+
{
|
1898
|
+
if ( defined($value) ) { push @errs, "INFO tag [$key] did not expect any parameters, got [$value]"; }
|
1899
|
+
next;
|
1900
|
+
}
|
1901
|
+
elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} )
|
1902
|
+
{
|
1903
|
+
push @errs, "INFO tag [$key=$value] expected different number of values ($$type{Number})";
|
1904
|
+
}
|
1905
|
+
if ( !$$type{handler} ) { next; }
|
1906
|
+
for my $val (@vals)
|
1907
|
+
{
|
1908
|
+
my $err = &{$$type{handler}}($self,$val,$$type{default});
|
1909
|
+
if ( $err ) { push @errs, $err; }
|
1910
|
+
}
|
1911
|
+
}
|
1912
|
+
if ( !@errs ) { return undef; }
|
1913
|
+
return join(',',@errs);
|
1914
|
+
}
|
1915
|
+
|
1916
|
+
=head2 validate_gtype_field
|
1917
|
+
|
1918
|
+
Usage : my $x = $vcf->next_data_hash(); $vcf->validate_gtype_field($$x{gtypes}{NA00001},$$x{ALT},$$x{FORMAT});
|
1919
|
+
Args : The genotype data hashref
|
1920
|
+
The ALT arrayref
|
1921
|
+
Returns : Error message in case of an error.
|
1922
|
+
|
1923
|
+
=cut
|
1924
|
+
|
1925
|
+
sub validate_gtype_field
|
1926
|
+
{
|
1927
|
+
my ($self,$data,$alts,$format) = @_;
|
1928
|
+
|
1929
|
+
# Expected numbers
|
1930
|
+
my $ng = -1;
|
1931
|
+
my $na = -1;
|
1932
|
+
if ( $$self{version}>4.0 )
|
1933
|
+
{
|
1934
|
+
if ( $$alts[0] eq '.' ) { $ng=1; $na=1; }
|
1935
|
+
else
|
1936
|
+
{
|
1937
|
+
$na = @$alts;
|
1938
|
+
$ng = (1+$na+1)*($na+1)/2;
|
1939
|
+
}
|
1940
|
+
}
|
1941
|
+
|
1942
|
+
my @errs;
|
1943
|
+
while (my ($key,$value) = each %$data)
|
1944
|
+
{
|
1945
|
+
if ( !exists($$self{header}{FORMAT}{$key}) )
|
1946
|
+
{
|
1947
|
+
push @errs, "FORMAT tag [$key] not listed in the header" unless $$self{version}<3.3;
|
1948
|
+
$self->_add_unknown_field('FORMAT',$key,-1);
|
1949
|
+
next;
|
1950
|
+
}
|
1951
|
+
my $type = $$self{header}{FORMAT}{$key};
|
1952
|
+
|
1953
|
+
my @vals = split(/,/, $value);
|
1954
|
+
if ( $$type{Number} eq 'G' )
|
1955
|
+
{
|
1956
|
+
if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $ng, found ".scalar @vals.")"; }
|
1957
|
+
}
|
1958
|
+
elsif ( $$type{Number} eq 'A' )
|
1959
|
+
{
|
1960
|
+
if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $na, found ".scalar @vals.")"; }
|
1961
|
+
}
|
1962
|
+
elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} )
|
1963
|
+
{
|
1964
|
+
push @errs, "FORMAT tag [$key] expected different number of values ($$type{Number})";
|
1965
|
+
}
|
1966
|
+
if ( !$$type{handler} ) { next; }
|
1967
|
+
for my $val (@vals)
|
1968
|
+
{
|
1969
|
+
my $err = &{$$type{handler}}($self,$val,$$type{default});
|
1970
|
+
if ( $err ) { push @errs, $err; }
|
1971
|
+
}
|
1972
|
+
}
|
1973
|
+
if ( !exists($$data{GT}) ) { push @errs, "The mandatory tag GT not present." unless $$self{ignore_missing_GT}; }
|
1974
|
+
else
|
1975
|
+
{
|
1976
|
+
my $buf = $$data{GT};
|
1977
|
+
while ($buf ne '')
|
1978
|
+
{
|
1979
|
+
my $al = $buf;
|
1980
|
+
if ( $buf=~$$self{regex_gtsep} )
|
1981
|
+
{
|
1982
|
+
$al = $`;
|
1983
|
+
$buf = $';
|
1984
|
+
if ( $buf eq '' ) { push @errs, "Unable to parse the GT field [$$data{GT}]."; last; }
|
1985
|
+
}
|
1986
|
+
else
|
1987
|
+
{
|
1988
|
+
$buf = '';
|
1989
|
+
}
|
1990
|
+
|
1991
|
+
if ( !defined $al ) { push @errs, "Unable to parse the GT field [$$data{GT}]."; last; }
|
1992
|
+
if ( $al eq '.' ) { next; }
|
1993
|
+
if ( $al eq '0' ) { next; }
|
1994
|
+
if ( !($al=~/^[0-9]+$/) ) { push @errs, "Unable to parse the GT field [$$data{GT}], expected integer."; last; }
|
1995
|
+
if ( !exists($$alts[$al-1]) ) { push @errs, "Bad ALT value in the GT field, the index [$al] out of bounds [$$data{GT}]."; last; }
|
1996
|
+
}
|
1997
|
+
}
|
1998
|
+
if ( !@errs ) { return undef; }
|
1999
|
+
return join(',',@errs);
|
2000
|
+
}
|
2001
|
+
|
2002
|
+
|
2003
|
+
sub validate_ref_field
|
2004
|
+
{
|
2005
|
+
my ($self,$ref) = @_;
|
2006
|
+
if ( !($ref=~/^[ACGTN]$/) ) { return "Expected one of A,C,G,T,N, got [$ref]\n"; }
|
2007
|
+
return undef;
|
2008
|
+
}
|
2009
|
+
|
2010
|
+
sub validate_int
|
2011
|
+
{
|
2012
|
+
my ($self,$value,$default) = @_;
|
2013
|
+
|
2014
|
+
if ( defined($default) && $value eq $default ) { return undef; }
|
2015
|
+
if ( $value =~ /^-?\d+$/ ) { return undef; }
|
2016
|
+
return "Could not validate the int [$value]";
|
2017
|
+
}
|
2018
|
+
|
2019
|
+
sub validate_float
|
2020
|
+
{
|
2021
|
+
my ($self,$value,$default) = @_;
|
2022
|
+
if ( defined($default) && $value eq $default ) { return undef; }
|
2023
|
+
if ( $value =~ /^-?\d+(?:\.\d*)$/ ) { return undef; }
|
2024
|
+
if ( $value =~ /^-?\d*(?:\.\d+)$/ ) { return undef; }
|
2025
|
+
if ( $value =~ /^-?\d+$/ ) { return undef; }
|
2026
|
+
if ( $value =~ /^-?\d*(?:\.?\d+)(?:[Ee][-+]?\d+)?$/ ) { return undef; }
|
2027
|
+
return "Could not validate the float [$value]";
|
2028
|
+
}
|
2029
|
+
|
2030
|
+
sub validate_char
|
2031
|
+
{
|
2032
|
+
my ($self,$value,$default) = @_;
|
2033
|
+
|
2034
|
+
if ( defined($default) && $value eq $default ) { return undef; }
|
2035
|
+
if ( length($value)==1) { return undef; }
|
2036
|
+
return "Could not validate the char value [$value]";
|
2037
|
+
}
|
2038
|
+
|
2039
|
+
|
2040
|
+
=head2 run_validation
|
2041
|
+
|
2042
|
+
About : Validates the VCF file.
|
2043
|
+
Usage : my $vcf = Vcf->new(file=>'file.vcf'); $vcf->run_validation('example.vcf.gz');
|
2044
|
+
Args : File name or file handle.
|
2045
|
+
|
2046
|
+
=cut
|
2047
|
+
|
2048
|
+
sub run_validation
|
2049
|
+
{
|
2050
|
+
my ($self) = @_;
|
2051
|
+
|
2052
|
+
$self->parse_header();
|
2053
|
+
$self->validate_header();
|
2054
|
+
|
2055
|
+
if ( !exists($$self{header}) ) { $self->warn(qq[The header not present.\n]); }
|
2056
|
+
elsif ( !exists($$self{header}{fileformat}) )
|
2057
|
+
{
|
2058
|
+
$self->warn(qq[The "fileformat" field not present in the header, assuming VCFv$$self{version}\n]);
|
2059
|
+
}
|
2060
|
+
elsif ( $$self{header_lines}[0]{key} ne 'fileformat' )
|
2061
|
+
{
|
2062
|
+
$self->warn(qq[The "fileformat" not the first line in the header\n]);
|
2063
|
+
}
|
2064
|
+
if ( !exists($$self{columns}) ) { $self->warn("No column descriptions found.\n"); }
|
2065
|
+
|
2066
|
+
my $default_qual = $$self{defaults}{QUAL};
|
2067
|
+
my $warn_sorted=1;
|
2068
|
+
my $warn_duplicates = exists($$self{warn_duplicates}) ? $$self{warn_duplicates} : 1;
|
2069
|
+
my ($prev_chrm,$prev_pos);
|
2070
|
+
while (my $x=$self->next_data_hash())
|
2071
|
+
{
|
2072
|
+
$self->validate_line($x);
|
2073
|
+
|
2074
|
+
# Is the position numeric?
|
2075
|
+
if ( !($$x{POS}=~/^\d+$/) ) { $self->warn("Expected integer for the position at $$x{CHROM}:$$x{POS}\n"); }
|
2076
|
+
|
2077
|
+
if ( $warn_duplicates )
|
2078
|
+
{
|
2079
|
+
if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos eq $$x{POS} )
|
2080
|
+
{
|
2081
|
+
$self->warn("Warning: Duplicate entries, for example $$x{CHROM}:$$x{POS}\n");
|
2082
|
+
$warn_duplicates = 0;
|
2083
|
+
}
|
2084
|
+
}
|
2085
|
+
|
2086
|
+
# Is the file sorted?
|
2087
|
+
if ( $warn_sorted )
|
2088
|
+
{
|
2089
|
+
if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos > $$x{POS} )
|
2090
|
+
{
|
2091
|
+
$self->warn("Warning: The file is not sorted, for example $$x{CHROM}:$$x{POS} comes after $prev_chrm:$prev_pos\n");
|
2092
|
+
$warn_sorted = 0;
|
2093
|
+
}
|
2094
|
+
$prev_chrm = $$x{CHROM};
|
2095
|
+
$prev_pos = $$x{POS};
|
2096
|
+
}
|
2097
|
+
|
2098
|
+
# The reference base: one of A,C,G,T,N, non-empty.
|
2099
|
+
my $err = $self->validate_ref_field($$x{REF});
|
2100
|
+
if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); }
|
2101
|
+
|
2102
|
+
# The ALT field (alternate non-reference base)
|
2103
|
+
$err = $self->validate_alt_field($$x{ALT},$$x{REF});
|
2104
|
+
if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); }
|
2105
|
+
|
2106
|
+
# The QUAL field
|
2107
|
+
my $ret = $self->validate_float($$x{QUAL},$default_qual);
|
2108
|
+
if ( $ret ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} .. $ret\n"); }
|
2109
|
+
elsif ( $$x{QUAL}=~/^-?\d+$/ && $$x{QUAL}<-1 ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} is negative .. $$x{QUAL}\n"); }
|
2110
|
+
|
2111
|
+
# The FILTER field
|
2112
|
+
$err = $self->validate_filter_field($$x{FILTER});
|
2113
|
+
if ( $err ) { $self->warn("FILTER field at $$x{CHROM}:$$x{POS} .. $err\n"); }
|
2114
|
+
|
2115
|
+
# The INFO field
|
2116
|
+
$err = $self->validate_info_field($$x{INFO},$$x{ALT});
|
2117
|
+
if ( $err ) { $self->warn("INFO field at $$x{CHROM}:$$x{POS} .. $err\n"); }
|
2118
|
+
|
2119
|
+
while (my ($gt,$data) = each %{$$x{gtypes}})
|
2120
|
+
{
|
2121
|
+
$err = $self->validate_gtype_field($data,$$x{ALT},$$x{FORMAT});
|
2122
|
+
if ( $err ) { $self->warn("column $gt at $$x{CHROM}:$$x{POS} .. $err\n"); }
|
2123
|
+
}
|
2124
|
+
|
2125
|
+
if ( scalar keys %{$$x{gtypes}} && (exists($$x{INFO}{AN}) || exists($$x{INFO}{AC})) )
|
2126
|
+
{
|
2127
|
+
my $nalt = scalar @{$$x{ALT}};
|
2128
|
+
if ( $nalt==1 && $$x{ALT}[0] eq '.' ) { $nalt=0; }
|
2129
|
+
my ($an,$ac) = $self->calc_an_ac($$x{gtypes},$nalt); # Allow alleles in ALT which are absent in samples
|
2130
|
+
if ( exists($$x{INFO}{AN}) && $an ne $$x{INFO}{AN} )
|
2131
|
+
{
|
2132
|
+
$self->warn("$$x{CHROM}:$$x{POS} .. AN is $$x{INFO}{AN}, should be $an\n");
|
2133
|
+
}
|
2134
|
+
if ( exists($$x{INFO}{AC}) && $ac ne $$x{INFO}{AC} )
|
2135
|
+
{
|
2136
|
+
$self->warn("$$x{CHROM}:$$x{POS} .. AC is $$x{INFO}{AC}, should be $ac\n");
|
2137
|
+
}
|
2138
|
+
}
|
2139
|
+
}
|
2140
|
+
}
|
2141
|
+
|
2142
|
+
|
2143
|
+
=head2 get_chromosomes
|
2144
|
+
|
2145
|
+
About : Get list of chromosomes from the VCF file. Must be bgzipped and tabix indexed.
|
2146
|
+
Usage : my $vcf = Vcf->new(); $vcf->get_chromosomes();
|
2147
|
+
Args : none
|
2148
|
+
|
2149
|
+
=cut
|
2150
|
+
|
2151
|
+
sub get_chromosomes
|
2152
|
+
{
|
2153
|
+
my ($self) = @_;
|
2154
|
+
if ( !$$self{file} ) { $self->throw(qq[The parameter "file" not set.\n]); }
|
2155
|
+
my (@out) = `tabix -l $$self{file}`;
|
2156
|
+
if ( $? )
|
2157
|
+
{
|
2158
|
+
$self->throw(qq[The command "tabix -l $$self{file}" exited with an error. Is the file tabix indexed?\n]);
|
2159
|
+
}
|
2160
|
+
for (my $i=0; $i<@out; $i++) { chomp($out[$i]); }
|
2161
|
+
return \@out;
|
2162
|
+
}
|
2163
|
+
|
2164
|
+
|
2165
|
+
=head2 get_samples
|
2166
|
+
|
2167
|
+
About : Get list of samples.
|
2168
|
+
Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my (@samples) = $vcf->get_samples();
|
2169
|
+
Args : none
|
2170
|
+
|
2171
|
+
=cut
|
2172
|
+
|
2173
|
+
sub get_samples
|
2174
|
+
{
|
2175
|
+
my ($self) = @_;
|
2176
|
+
my $n = @{$$self{columns}} - 1;
|
2177
|
+
return (@{$$self{columns}}[9..$n]);
|
2178
|
+
}
|
2179
|
+
|
2180
|
+
|
2181
|
+
#------------------------------------------------
|
2182
|
+
# Version 3.2 specific functions
|
2183
|
+
|
2184
|
+
package Vcf3_2;
|
2185
|
+
use base qw(VcfReader);
|
2186
|
+
|
2187
|
+
sub new
|
2188
|
+
{
|
2189
|
+
my ($class,@args) = @_;
|
2190
|
+
my $self = $class->SUPER::new(@args);
|
2191
|
+
bless $self, ref($class) || $class;
|
2192
|
+
|
2193
|
+
$$self{_defaults} =
|
2194
|
+
{
|
2195
|
+
version => '3.2',
|
2196
|
+
drop_trailings => 1,
|
2197
|
+
filter_passed => 0,
|
2198
|
+
|
2199
|
+
defaults =>
|
2200
|
+
{
|
2201
|
+
QUAL => '-1',
|
2202
|
+
default => '.',
|
2203
|
+
Flag => undef,
|
2204
|
+
GT => '.',
|
2205
|
+
},
|
2206
|
+
|
2207
|
+
handlers =>
|
2208
|
+
{
|
2209
|
+
Integer => \&VcfReader::validate_int,
|
2210
|
+
Float => \&VcfReader::validate_float,
|
2211
|
+
Character => \&VcfReader::validate_char,
|
2212
|
+
String => undef,
|
2213
|
+
Flag => undef,
|
2214
|
+
},
|
2215
|
+
|
2216
|
+
regex_snp => qr/^[ACGTN]$/i,
|
2217
|
+
regex_ins => qr/^I[ACGTN]+$/,
|
2218
|
+
regex_del => qr/^D\d+$/,
|
2219
|
+
regex_gtsep => qr{[\\|/]},
|
2220
|
+
regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$},
|
2221
|
+
regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)},
|
2222
|
+
};
|
2223
|
+
|
2224
|
+
for my $key (keys %{$$self{_defaults}})
|
2225
|
+
{
|
2226
|
+
$$self{$key}=$$self{_defaults}{$key};
|
2227
|
+
}
|
2228
|
+
|
2229
|
+
|
2230
|
+
return $self;
|
2231
|
+
}
|
2232
|
+
|
2233
|
+
|
2234
|
+
#------------------------------------------------
|
2235
|
+
# Version 3.3 specific functions
|
2236
|
+
|
2237
|
+
package Vcf3_3;
|
2238
|
+
use base qw(VcfReader);
|
2239
|
+
|
2240
|
+
sub new
|
2241
|
+
{
|
2242
|
+
my ($class,@args) = @_;
|
2243
|
+
my $self = $class->SUPER::new(@args);
|
2244
|
+
bless $self, ref($class) || $class;
|
2245
|
+
|
2246
|
+
$$self{_defaults} =
|
2247
|
+
{
|
2248
|
+
version => '3.3',
|
2249
|
+
drop_trailings => 0,
|
2250
|
+
filter_passed => 0,
|
2251
|
+
|
2252
|
+
defaults =>
|
2253
|
+
{
|
2254
|
+
QUAL => '-1',
|
2255
|
+
Integer => '-1',
|
2256
|
+
Float => '-1',
|
2257
|
+
Character => '.',
|
2258
|
+
String => '.',
|
2259
|
+
Flag => undef,
|
2260
|
+
GT => './.',
|
2261
|
+
default => '.',
|
2262
|
+
},
|
2263
|
+
|
2264
|
+
handlers =>
|
2265
|
+
{
|
2266
|
+
Integer => \&VcfReader::validate_int,
|
2267
|
+
Float => \&VcfReader::validate_float,
|
2268
|
+
Character => \&VcfReader::validate_char,
|
2269
|
+
String => undef,
|
2270
|
+
Flag => undef,
|
2271
|
+
},
|
2272
|
+
|
2273
|
+
regex_snp => qr/^[ACGTN]$/i,
|
2274
|
+
regex_ins => qr/^I[ACGTN]+$/,
|
2275
|
+
regex_del => qr/^D\d+$/,
|
2276
|
+
regex_gtsep => qr{[\\|/]},
|
2277
|
+
regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$},
|
2278
|
+
regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)}, # . 0/1 0|1 A/A A|A D4/IACGT
|
2279
|
+
};
|
2280
|
+
|
2281
|
+
for my $key (keys %{$$self{_defaults}})
|
2282
|
+
{
|
2283
|
+
$$self{$key}=$$self{_defaults}{$key};
|
2284
|
+
}
|
2285
|
+
|
2286
|
+
return $self;
|
2287
|
+
}
|
2288
|
+
|
2289
|
+
|
2290
|
+
#------------------------------------------------
|
2291
|
+
# Version 4.0 specific functions
|
2292
|
+
|
2293
|
+
=head1 VCFv4.0
|
2294
|
+
|
2295
|
+
VCFv4.0 specific functions
|
2296
|
+
|
2297
|
+
=cut
|
2298
|
+
|
2299
|
+
package Vcf4_0;
|
2300
|
+
use base qw(VcfReader);
|
2301
|
+
|
2302
|
+
sub new
|
2303
|
+
{
|
2304
|
+
my ($class,@args) = @_;
|
2305
|
+
my $self = $class->SUPER::new(@args);
|
2306
|
+
bless $self, ref($class) || $class;
|
2307
|
+
|
2308
|
+
$$self{_defaults} =
|
2309
|
+
{
|
2310
|
+
version => '4.0',
|
2311
|
+
drop_trailings => 1,
|
2312
|
+
filter_passed => 'PASS',
|
2313
|
+
|
2314
|
+
defaults =>
|
2315
|
+
{
|
2316
|
+
QUAL => '.',
|
2317
|
+
Flag => undef,
|
2318
|
+
GT => '.',
|
2319
|
+
default => '.',
|
2320
|
+
},
|
2321
|
+
reserved =>
|
2322
|
+
{
|
2323
|
+
FILTER => { 0=>1 },
|
2324
|
+
},
|
2325
|
+
|
2326
|
+
handlers =>
|
2327
|
+
{
|
2328
|
+
Integer => \&VcfReader::validate_int,
|
2329
|
+
Float => \&VcfReader::validate_float,
|
2330
|
+
Character => \&VcfReader::validate_char,
|
2331
|
+
String => undef,
|
2332
|
+
Flag => undef,
|
2333
|
+
},
|
2334
|
+
|
2335
|
+
regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i,
|
2336
|
+
regex_ins => qr/^[ACGTN]+$/,
|
2337
|
+
regex_del => qr/^[ACGTN]+$/,
|
2338
|
+
regex_gtsep => qr{[|/]}, # | /
|
2339
|
+
regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1
|
2340
|
+
regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 0/1 0|1 A/A A|A 0|<DEL:ME:ALU>
|
2341
|
+
};
|
2342
|
+
|
2343
|
+
for my $key (keys %{$$self{_defaults}})
|
2344
|
+
{
|
2345
|
+
$$self{$key}=$$self{_defaults}{$key};
|
2346
|
+
}
|
2347
|
+
|
2348
|
+
return $self;
|
2349
|
+
}
|
2350
|
+
|
2351
|
+
sub Vcf4_0::format_header_line
|
2352
|
+
{
|
2353
|
+
my ($self,$rec) = @_;
|
2354
|
+
|
2355
|
+
my %tmp_rec = ( %$rec );
|
2356
|
+
if ( exists($tmp_rec{Number}) && $tmp_rec{Number} eq '-1' ) { $tmp_rec{Number} = '.' }
|
2357
|
+
my $value;
|
2358
|
+
if ( exists($tmp_rec{ID}) or $tmp_rec{key} eq 'PEDIGREE' )
|
2359
|
+
{
|
2360
|
+
my %has = ( key=>1, handler=>1, default=>1 ); # Internal keys not to be output
|
2361
|
+
my @items;
|
2362
|
+
for my $key (qw(ID Number Type Description), sort keys %tmp_rec)
|
2363
|
+
{
|
2364
|
+
if ( !exists($tmp_rec{$key}) or $has{$key} ) { next; }
|
2365
|
+
my $quote = ($key eq 'Description' or $tmp_rec{$key}=~/\s/) ? '"' : '';
|
2366
|
+
push @items, "$key=$quote$tmp_rec{$key}$quote";
|
2367
|
+
$has{$key}=1;
|
2368
|
+
}
|
2369
|
+
$value = '<' .join(',',@items). '>';
|
2370
|
+
}
|
2371
|
+
else { $value = $tmp_rec{value}; }
|
2372
|
+
|
2373
|
+
my $line = "##$tmp_rec{key}=".$value."\n";
|
2374
|
+
return $line;
|
2375
|
+
}
|
2376
|
+
|
2377
|
+
=head2 parse_header_line
|
2378
|
+
|
2379
|
+
Usage : $vcf->parse_header_line(q[##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">])
|
2380
|
+
$vcf->parse_header_line(q[reference=1000GenomesPilot-NCBI36])
|
2381
|
+
Args :
|
2382
|
+
Returns :
|
2383
|
+
|
2384
|
+
=cut
|
2385
|
+
|
2386
|
+
sub Vcf4_0::parse_header_line
|
2387
|
+
{
|
2388
|
+
my ($self,$line) = @_;
|
2389
|
+
|
2390
|
+
chomp($line);
|
2391
|
+
$line =~ s/^##//;
|
2392
|
+
|
2393
|
+
if ( !($line=~/^([^=]+)=/) ) { $self->throw("Expected key=value pair in the header: $line\n"); }
|
2394
|
+
my $key = $1;
|
2395
|
+
my $value = $';
|
2396
|
+
|
2397
|
+
if ( !($value=~/^<(.+)>\s*$/) )
|
2398
|
+
{
|
2399
|
+
# Simple sanity check for subtle typos
|
2400
|
+
if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' or $key eq 'ALT' )
|
2401
|
+
{
|
2402
|
+
$self->throw("Hmm, is this a typo? [$key] [$value]");
|
2403
|
+
}
|
2404
|
+
return { key=>$key, value=>$value };
|
2405
|
+
}
|
2406
|
+
|
2407
|
+
my $rec = { key=>$key };
|
2408
|
+
my $tmp = $1;
|
2409
|
+
my ($attr_key,$attr_value,$quoted);
|
2410
|
+
while ($tmp ne '')
|
2411
|
+
{
|
2412
|
+
if ( !defined $attr_key )
|
2413
|
+
{
|
2414
|
+
if ( $tmp=~/^([^=]+)="/ ) { $attr_key=$1; $quoted=1; $tmp=$'; next; }
|
2415
|
+
elsif ( $tmp=~/^([^=]+)=/ ) { $attr_key=$1; $quoted=0; $tmp=$'; next; }
|
2416
|
+
else { $self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]); }
|
2417
|
+
}
|
2418
|
+
|
2419
|
+
if ( $tmp=~/^[^,\\"]+/ ) { $attr_value .= $&; $tmp = $'; }
|
2420
|
+
if ( $tmp=~/^\\\\/ ) { $attr_value .= '\\\\'; $tmp = $'; next; }
|
2421
|
+
if ( $tmp=~/^\\"/ ) { $attr_value .= '\\"'; $tmp = $'; next; }
|
2422
|
+
if ( $tmp eq '' or ($tmp=~/^,/ && !$quoted) or $tmp=~/^"/ )
|
2423
|
+
{
|
2424
|
+
if ( $attr_key=~/^\s+/ or $attr_key=~/\s+$/ or $attr_value=~/^\s+/ or $attr_value=~/\s+$/ )
|
2425
|
+
{
|
2426
|
+
$self->warn("Leading or trailing space in attr_key-attr_value pairs is discouraged:\n\t[$attr_key] [$attr_value]\n\t$line\n");
|
2427
|
+
$attr_key =~ s/^\s+//;
|
2428
|
+
$attr_key =~ s/\s+$//;
|
2429
|
+
$attr_value =~ s/^\s+//;
|
2430
|
+
$attr_value =~ s/\s+$//;
|
2431
|
+
}
|
2432
|
+
$$rec{$attr_key} = $attr_value;
|
2433
|
+
$tmp = $';
|
2434
|
+
if ( $quoted && $tmp=~/^,/ ) { $tmp = $'; }
|
2435
|
+
$attr_key = $attr_value = $quoted = undef;
|
2436
|
+
next;
|
2437
|
+
}
|
2438
|
+
if ( $tmp=~/^,/ ) { $attr_value .= $&; $tmp = $'; next; }
|
2439
|
+
$self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]);
|
2440
|
+
}
|
2441
|
+
|
2442
|
+
if ( $key ne 'PEDIGREE' && !exists($$rec{ID}) ) { $self->throw("Missing the ID tag in $line\n"); }
|
2443
|
+
if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' )
|
2444
|
+
{
|
2445
|
+
if ( !exists($$rec{Description}) ) { $self->throw("Missing the Description tag in $line\n"); }
|
2446
|
+
}
|
2447
|
+
if ( exists($$rec{Number}) && $$rec{Number} eq '-1' ) { $self->warn("The use of -1 for unknown number of values is deprecated, please use '.' instead.\n\t$line\n"); }
|
2448
|
+
if ( exists($$rec{Number}) && $$rec{Number} eq '.' ) { $$rec{Number}=-1; }
|
2449
|
+
|
2450
|
+
return $rec;
|
2451
|
+
}
|
2452
|
+
|
2453
|
+
sub Vcf4_0::validate_ref_field
|
2454
|
+
{
|
2455
|
+
my ($self,$ref) = @_;
|
2456
|
+
if ( !($ref=~/^[ACGTN]+$/) )
|
2457
|
+
{
|
2458
|
+
my $offending = $ref;
|
2459
|
+
$offending =~ s/[ACGTN]+//g;
|
2460
|
+
return "Expected combination of A,C,G,T,N for REF, got [$ref], the offending chars were [$offending]\n";
|
2461
|
+
}
|
2462
|
+
return undef;
|
2463
|
+
}
|
2464
|
+
|
2465
|
+
sub Vcf4_0::validate_alt_field
|
2466
|
+
{
|
2467
|
+
my ($self,$values,$ref) = @_;
|
2468
|
+
|
2469
|
+
if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
|
2470
|
+
|
2471
|
+
my $ret = $self->_validate_alt_field($values,$ref);
|
2472
|
+
if ( $ret ) { return $ret; }
|
2473
|
+
|
2474
|
+
my $ref_len = length($ref);
|
2475
|
+
my $ref1 = substr($ref,0,1);
|
2476
|
+
|
2477
|
+
my @err;
|
2478
|
+
my $msg = '';
|
2479
|
+
for my $item (@$values)
|
2480
|
+
{
|
2481
|
+
if ( !($item=~/^[ACTGN]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; }
|
2482
|
+
if ( $item=~/^<[^<>\s]+>$/ ) { next; }
|
2483
|
+
if ( $ref_len==length($item) ) { next; }
|
2484
|
+
if ( substr($item,0,1) ne $ref1 ) { $msg=', first base does not match the reference.'; push @err,$item; next; }
|
2485
|
+
}
|
2486
|
+
if ( !@err ) { return undef; }
|
2487
|
+
return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg;
|
2488
|
+
}
|
2489
|
+
|
2490
|
+
|
2491
|
+
=head2 fill_ref_alt_mapping
|
2492
|
+
|
2493
|
+
About : A tool for merging VCFv4.0 records. The subroutine unifies the REFs and creates a mapping
|
2494
|
+
from the original haplotypes to the haplotypes based on the new REF. Consider the following
|
2495
|
+
example:
|
2496
|
+
REF ALT
|
2497
|
+
G GA
|
2498
|
+
GT G
|
2499
|
+
GT GA
|
2500
|
+
GT GAA
|
2501
|
+
GTC G
|
2502
|
+
G <DEL>
|
2503
|
+
my $map={G=>{GA=>1},GT=>{G=>1,GA=>1,GAA=>1},GTC=>{G=>1},G=>{'<DEL>'=>1}};
|
2504
|
+
my $new_ref=$vcf->fill_ref_alt_mapping($map);
|
2505
|
+
|
2506
|
+
The call returns GTC and $map is now
|
2507
|
+
G GA -> GTC GATC
|
2508
|
+
GT G -> GTC GC
|
2509
|
+
GT GA -> GTC GAC
|
2510
|
+
GT GAA -> GTC GAAC
|
2511
|
+
GTC G -> GTC G
|
2512
|
+
G <DEL> -> GTC <DEL>
|
2513
|
+
Args :
|
2514
|
+
Returns : New REF string and fills the hash with appropriate ALT.
|
2515
|
+
|
2516
|
+
=cut
|
2517
|
+
|
2518
|
+
sub Vcf4_0::fill_ref_alt_mapping
|
2519
|
+
{
|
2520
|
+
my ($self,$map) = @_;
|
2521
|
+
|
2522
|
+
my $max_len = 0;
|
2523
|
+
my $new_ref;
|
2524
|
+
for my $ref (keys %$map)
|
2525
|
+
{
|
2526
|
+
my $len = length($ref);
|
2527
|
+
if ( $max_len<$len )
|
2528
|
+
{
|
2529
|
+
$max_len = $len;
|
2530
|
+
$new_ref = $ref;
|
2531
|
+
}
|
2532
|
+
$$map{$ref}{$ref} = 1;
|
2533
|
+
}
|
2534
|
+
for my $ref (keys %$map)
|
2535
|
+
{
|
2536
|
+
my $rlen = length($ref);
|
2537
|
+
if ( substr($new_ref,0,$rlen) ne $ref ) { $self->throw("The reference prefixes do not agree: $ref vs $new_ref\n"); }
|
2538
|
+
for my $alt (keys %{$$map{$ref}})
|
2539
|
+
{
|
2540
|
+
# The second part of the regex is for VCF>4.0, but does no harm for v<=4.0
|
2541
|
+
if ( $alt=~/^<.+>$/ or $alt=~/\[|\]/ ) { $$map{$ref}{$alt} = $alt; next; }
|
2542
|
+
my $new = $alt;
|
2543
|
+
if ( $rlen<$max_len ) { $new .= substr($new_ref,$rlen); }
|
2544
|
+
$$map{$ref}{$alt} = $new;
|
2545
|
+
}
|
2546
|
+
}
|
2547
|
+
return $new_ref;
|
2548
|
+
}
|
2549
|
+
|
2550
|
+
|
2551
|
+
|
2552
|
+
sub Vcf4_0::event_type
|
2553
|
+
{
|
2554
|
+
my ($self,$rec,$allele) = @_;
|
2555
|
+
|
2556
|
+
my $ref = $rec;
|
2557
|
+
if ( ref($rec) eq 'HASH' )
|
2558
|
+
{
|
2559
|
+
if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
|
2560
|
+
$ref = $$rec{REF};
|
2561
|
+
}
|
2562
|
+
|
2563
|
+
if ( $allele=~/^<[^>]+>$/ )
|
2564
|
+
{
|
2565
|
+
if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['u',0,$allele]; }
|
2566
|
+
return ('u',0,$allele);
|
2567
|
+
}
|
2568
|
+
if ( $allele eq '.' )
|
2569
|
+
{
|
2570
|
+
if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['r',0,$ref]; }
|
2571
|
+
return ('r',0,$ref);
|
2572
|
+
}
|
2573
|
+
|
2574
|
+
my $reflen = length($ref);
|
2575
|
+
my $len = length($allele);
|
2576
|
+
|
2577
|
+
my $ht;
|
2578
|
+
my $type;
|
2579
|
+
if ( $len==$reflen )
|
2580
|
+
{
|
2581
|
+
# This can be a reference, a SNP, or multiple SNPs
|
2582
|
+
my $mism = 0;
|
2583
|
+
for (my $i=0; $i<$len; $i++)
|
2584
|
+
{
|
2585
|
+
if ( substr($ref,$i,1) ne substr($allele,$i,1) ) { $mism++; }
|
2586
|
+
}
|
2587
|
+
if ( $mism==0 ) { $type='r'; $len=0; }
|
2588
|
+
else { $type='s'; $len=$mism; }
|
2589
|
+
}
|
2590
|
+
else
|
2591
|
+
{
|
2592
|
+
($len,$ht)=is_indel($ref,$allele);
|
2593
|
+
if ( $len )
|
2594
|
+
{
|
2595
|
+
# Indel
|
2596
|
+
$type = 'i';
|
2597
|
+
$allele = $ht;
|
2598
|
+
}
|
2599
|
+
else
|
2600
|
+
{
|
2601
|
+
$type = 'o'; $len = $len>$reflen ? $len-1 : $reflen-1;
|
2602
|
+
}
|
2603
|
+
}
|
2604
|
+
|
2605
|
+
if ( ref($rec) eq 'HASH' )
|
2606
|
+
{
|
2607
|
+
$$rec{_cached_events}{$allele} = [$type,$len,$allele];
|
2608
|
+
}
|
2609
|
+
return ($type,$len,$allele);
|
2610
|
+
}
|
2611
|
+
|
2612
|
+
# The sequences start at the same position, which simplifies things greatly.
|
2613
|
+
sub is_indel
|
2614
|
+
{
|
2615
|
+
my ($seq1,$seq2) = @_;
|
2616
|
+
|
2617
|
+
my $len1 = length($seq1);
|
2618
|
+
my $len2 = length($seq2);
|
2619
|
+
if ( $len1 eq $len2 ) { return (0,''); }
|
2620
|
+
|
2621
|
+
my ($del,$len,$LEN);
|
2622
|
+
if ( $len1<$len2 )
|
2623
|
+
{
|
2624
|
+
$len = $len1;
|
2625
|
+
$LEN = $len2;
|
2626
|
+
$del = 1;
|
2627
|
+
}
|
2628
|
+
else
|
2629
|
+
{
|
2630
|
+
$len = $len2;
|
2631
|
+
$LEN = $len1;
|
2632
|
+
$del = -1;
|
2633
|
+
my $tmp=$seq1; $seq1=$seq2; $seq2=$tmp;
|
2634
|
+
}
|
2635
|
+
|
2636
|
+
my $ileft;
|
2637
|
+
for ($ileft=0; $ileft<$len; $ileft++)
|
2638
|
+
{
|
2639
|
+
if ( substr($seq1,$ileft,1) ne substr($seq2,$ileft,1) ) { last; }
|
2640
|
+
}
|
2641
|
+
if ( $ileft==$len )
|
2642
|
+
{
|
2643
|
+
return ($del*($LEN-$len), substr($seq2,$ileft));
|
2644
|
+
}
|
2645
|
+
|
2646
|
+
my $iright;
|
2647
|
+
for ($iright=0; $iright<$len; $iright++)
|
2648
|
+
{
|
2649
|
+
if ( substr($seq1,$len-$iright,1) ne substr($seq2,$LEN-$iright,1) ) { last; }
|
2650
|
+
}
|
2651
|
+
if ( $iright+$ileft<=$len ) { return (0,''); }
|
2652
|
+
|
2653
|
+
return ($del*($LEN-$len),substr($seq2,$ileft,$LEN-$len));
|
2654
|
+
}
|
2655
|
+
|
2656
|
+
|
2657
|
+
#------------------------------------------------
|
2658
|
+
# Version 4.1 specific functions
|
2659
|
+
|
2660
|
+
=head1 VCFv4.1
|
2661
|
+
|
2662
|
+
VCFv4.1 specific functions
|
2663
|
+
|
2664
|
+
=cut
|
2665
|
+
|
2666
|
+
package Vcf4_1;
|
2667
|
+
use base qw(Vcf4_0);
|
2668
|
+
|
2669
|
+
sub new
|
2670
|
+
{
|
2671
|
+
my ($class,@args) = @_;
|
2672
|
+
my $self = $class->SUPER::new(@args);
|
2673
|
+
bless $self, ref($class) || $class;
|
2674
|
+
|
2675
|
+
$$self{_defaults} =
|
2676
|
+
{
|
2677
|
+
version => '4.1',
|
2678
|
+
drop_trailings => 1,
|
2679
|
+
filter_passed => 'PASS',
|
2680
|
+
|
2681
|
+
defaults =>
|
2682
|
+
{
|
2683
|
+
QUAL => '.',
|
2684
|
+
Flag => undef,
|
2685
|
+
GT => '.',
|
2686
|
+
default => '.',
|
2687
|
+
},
|
2688
|
+
reserved =>
|
2689
|
+
{
|
2690
|
+
FILTER => { 0=>1 },
|
2691
|
+
},
|
2692
|
+
|
2693
|
+
handlers =>
|
2694
|
+
{
|
2695
|
+
Integer => \&VcfReader::validate_int,
|
2696
|
+
Float => \&VcfReader::validate_float,
|
2697
|
+
Character => \&VcfReader::validate_char,
|
2698
|
+
String => undef,
|
2699
|
+
Flag => undef,
|
2700
|
+
},
|
2701
|
+
|
2702
|
+
regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i,
|
2703
|
+
regex_ins => qr/^[ACGTN]+$/i,
|
2704
|
+
regex_del => qr/^[ACGTN]+$/i,
|
2705
|
+
regex_gtsep => qr{[|/]}, # | /
|
2706
|
+
regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1
|
2707
|
+
regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 0/1 0|1 A/A A|A 0|<DEL:ME:ALU>
|
2708
|
+
};
|
2709
|
+
|
2710
|
+
$$self{ignore_missing_GT} = 1;
|
2711
|
+
|
2712
|
+
for my $key (keys %{$$self{_defaults}})
|
2713
|
+
{
|
2714
|
+
$$self{$key}=$$self{_defaults}{$key};
|
2715
|
+
}
|
2716
|
+
|
2717
|
+
return $self;
|
2718
|
+
}
|
2719
|
+
|
2720
|
+
sub Vcf4_1::validate_header
|
2721
|
+
{
|
2722
|
+
my ($self) = @_;
|
2723
|
+
my $lines = $self->get_header_line(key=>'reference');
|
2724
|
+
if ( !@$lines ) { $self->warn("The header tag 'reference' not present. (Not required but highly recommended.)\n"); }
|
2725
|
+
}
|
2726
|
+
|
2727
|
+
sub Vcf4_1::validate_line
|
2728
|
+
{
|
2729
|
+
my ($self,$line) = @_;
|
2730
|
+
|
2731
|
+
if ( !$$self{_contig_validated}{$$line{CHROM}} )
|
2732
|
+
{
|
2733
|
+
my $lines = $self->get_header_line(key=>'contig',ID=>$$line{CHROM});
|
2734
|
+
if ( !@$lines ) { $self->warn("The header tag 'contig' not present for CHROM=$$line{CHROM}. (Not required but highly recommended.)\n"); }
|
2735
|
+
$$self{_contig_validated}{$$line{CHROM}} = 1;
|
2736
|
+
}
|
2737
|
+
|
2738
|
+
# Is the ID composed of alphanumeric chars
|
2739
|
+
if ( !($$line{ID}=~/^\S+$/) ) { $self->warn("Expected non-whitespace ID at $$line{CHROM}:$$line{POS}, but got [$$line{ID}]\n"); }
|
2740
|
+
}
|
2741
|
+
|
2742
|
+
sub Vcf4_1::validate_alt_field
|
2743
|
+
{
|
2744
|
+
my ($self,$values,$ref) = @_;
|
2745
|
+
|
2746
|
+
if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
|
2747
|
+
|
2748
|
+
my $ret = $self->_validate_alt_field($values,$ref);
|
2749
|
+
if ( $ret ) { return $ret; }
|
2750
|
+
|
2751
|
+
my $ref_len = length($ref);
|
2752
|
+
my $ref1 = substr($ref,0,1);
|
2753
|
+
|
2754
|
+
my @err;
|
2755
|
+
my $msg = '';
|
2756
|
+
for my $item (@$values)
|
2757
|
+
{
|
2758
|
+
if ( $item=~/^(.*)\[(.+)\[(.*)$/ or $item=~/^(.*)\](.+)\](.*)$/ )
|
2759
|
+
{
|
2760
|
+
if ( $1 ne '' && $3 ne '' ) { $msg=', two replacement strings given (expected one)'; push @err,$item; next; }
|
2761
|
+
my $rpl;
|
2762
|
+
if ( $1 ne '' )
|
2763
|
+
{
|
2764
|
+
$rpl = $1;
|
2765
|
+
if ( $rpl ne '.' )
|
2766
|
+
{
|
2767
|
+
my $rref = substr($rpl,0,1);
|
2768
|
+
if ( $rref ne $ref1 ) { $msg=', the first base of the replacement string does not match the reference'; push @err,$item; next; }
|
2769
|
+
}
|
2770
|
+
}
|
2771
|
+
else
|
2772
|
+
{
|
2773
|
+
$rpl = $3;
|
2774
|
+
if ( $rpl ne '.' )
|
2775
|
+
{
|
2776
|
+
my $rref = substr($rpl,-1,1);
|
2777
|
+
if ( $rref ne $ref1 ) { $msg=', the last base of the replacement string does not match the reference'; push @err,$item; next; }
|
2778
|
+
}
|
2779
|
+
}
|
2780
|
+
my $pos = $2;
|
2781
|
+
if ( !($rpl=~/^[ACTGNacgtn]+$/) && $rpl ne '.' ) { $msg=', replacement string not valid (expected [ACTGNacgtn]+)'; push @err,$item; next; }
|
2782
|
+
if ( !($pos=~/^\S+:\d+$/) ) { $msg=', cannot parse sequence:position'; push @err,$item; next; }
|
2783
|
+
next;
|
2784
|
+
}
|
2785
|
+
if ( $item=~/^\.([ACTGNactgn])[ACTGNactgn]*$/ )
|
2786
|
+
{
|
2787
|
+
if ( $ref1 ne $1 ) { $msg=', first base does not match the reference'; push @err,$item; }
|
2788
|
+
next;
|
2789
|
+
}
|
2790
|
+
elsif ( $item=~/^[ACTGNactgn]*([ACTGNactgn])\.$/ )
|
2791
|
+
{
|
2792
|
+
if ( substr($ref,-1,1) ne $1 ) { $msg=', last base does not match the reference'; push @err,$item; }
|
2793
|
+
next;
|
2794
|
+
}
|
2795
|
+
if ( !($item=~/^[ACTGNactgn]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; }
|
2796
|
+
if ( $item=~/^<[^<>\s]+>$/ ) { next; }
|
2797
|
+
if ( $ref_len==length($item) ) { next; }
|
2798
|
+
if ( substr($item,0,1) ne $ref1 ) { $msg=', first base does not match the reference'; push @err,$item; next; }
|
2799
|
+
}
|
2800
|
+
if ( !@err ) { return undef; }
|
2801
|
+
return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg;
|
2802
|
+
}
|
2803
|
+
|
2804
|
+
sub Vcf4_1::next_data_hash
|
2805
|
+
{
|
2806
|
+
my ($self,@args) = @_;
|
2807
|
+
|
2808
|
+
my $out = $self->SUPER::next_data_hash(@args);
|
2809
|
+
if ( !defined $out ) { return $out; }
|
2810
|
+
|
2811
|
+
# Case-insensitive ALT and REF bases
|
2812
|
+
$$out{REF} = uc($$out{REF});
|
2813
|
+
my $nalt = @{$$out{ALT}};
|
2814
|
+
for (my $i=0; $i<$nalt; $i++)
|
2815
|
+
{
|
2816
|
+
if ( $$out{ALT}[$i]=~/^</ ) { next; }
|
2817
|
+
$$out{ALT}[$i] = uc($$out{ALT}[$i]);
|
2818
|
+
}
|
2819
|
+
|
2820
|
+
return $out;
|
2821
|
+
}
|
2822
|
+
|
2823
|
+
sub Vcf4_1::next_data_array
|
2824
|
+
{
|
2825
|
+
my ($self,@args) = @_;
|
2826
|
+
|
2827
|
+
my $out = $self->SUPER::next_data_array(@args);
|
2828
|
+
if ( !defined $out or $$self{assume_uppercase} ) { return $out; }
|
2829
|
+
|
2830
|
+
# Case-insensitive ALT and REF bases
|
2831
|
+
$$out[3] = uc($$out[3]);
|
2832
|
+
my $alt = $$out[4];
|
2833
|
+
$$out[4] = '';
|
2834
|
+
while ($alt=~/[^<>]+/)
|
2835
|
+
{
|
2836
|
+
$$out[4] .= $`;
|
2837
|
+
$$out[4] .= (length($`) && substr($`,-1,1) eq '<' ) ? $& : uc($&);
|
2838
|
+
$alt = $';
|
2839
|
+
if ( $alt=~/^<[^<>]+>/ ) { $$out[4] .= $&; $alt=$'; }
|
2840
|
+
}
|
2841
|
+
|
2842
|
+
return $out;
|
2843
|
+
}
|
2844
|
+
|
2845
|
+
sub Vcf4_1::event_type
|
2846
|
+
{
|
2847
|
+
my ($self,$rec,$allele) = @_;
|
2848
|
+
if ( $allele=~/\[|\]|^\..+|.+\.$/ ) { return 'b'; }
|
2849
|
+
return $self->SUPER::event_type($rec,$allele);
|
2850
|
+
}
|
2851
|
+
|
2852
|
+
1;
|
2853
|
+
|