ngs_server 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/ngs_server +72 -50
- data/ext/bamtools/extconf.rb +3 -3
- data/ext/vcftools/Makefile +28 -0
- data/ext/vcftools/README.txt +36 -0
- data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
- data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
- data/ext/vcftools/cpp/.svn/entries +708 -0
- data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
- data/ext/vcftools/cpp/Makefile +46 -0
- data/ext/vcftools/cpp/dgeev.cpp +146 -0
- data/ext/vcftools/cpp/dgeev.h +43 -0
- data/ext/vcftools/cpp/output_log.cpp +79 -0
- data/ext/vcftools/cpp/output_log.h +34 -0
- data/ext/vcftools/cpp/parameters.cpp +535 -0
- data/ext/vcftools/cpp/parameters.h +154 -0
- data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
- data/ext/vcftools/cpp/vcf_entry.h +190 -0
- data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
- data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
- data/ext/vcftools/cpp/vcf_file.cpp +495 -0
- data/ext/vcftools/cpp/vcf_file.h +184 -0
- data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
- data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
- data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
- data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
- data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
- data/ext/vcftools/cpp/vcftools.cpp +107 -0
- data/ext/vcftools/cpp/vcftools.h +25 -0
- data/ext/vcftools/examples/.svn/all-wcprops +185 -0
- data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
- data/ext/vcftools/examples/.svn/entries +1048 -0
- data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
- data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
- data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
- data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
- data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
- data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
- data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
- data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
- data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
- data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
- data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
- data/ext/vcftools/examples/annotate-test.vcf +37 -0
- data/ext/vcftools/examples/annotate.out +23 -0
- data/ext/vcftools/examples/annotate.txt +7 -0
- data/ext/vcftools/examples/annotate2.out +52 -0
- data/ext/vcftools/examples/annotate3.out +23 -0
- data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
- data/ext/vcftools/examples/cmp-test.out +53 -0
- data/ext/vcftools/examples/concat-a.vcf +21 -0
- data/ext/vcftools/examples/concat-b.vcf +13 -0
- data/ext/vcftools/examples/concat-c.vcf +19 -0
- data/ext/vcftools/examples/concat.out +39 -0
- data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
- data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
- data/ext/vcftools/examples/merge-test-a.vcf +17 -0
- data/ext/vcftools/examples/merge-test-b.vcf +17 -0
- data/ext/vcftools/examples/merge-test-c.vcf +15 -0
- data/ext/vcftools/examples/merge-test.vcf.out +31 -0
- data/ext/vcftools/examples/perl-api-1.pl +46 -0
- data/ext/vcftools/examples/query-test.out +6 -0
- data/ext/vcftools/examples/shuffle-test.vcf +12 -0
- data/ext/vcftools/examples/subset.SNPs.out +10 -0
- data/ext/vcftools/examples/subset.indels.out +18 -0
- data/ext/vcftools/examples/subset.vcf +21 -0
- data/ext/vcftools/examples/valid-3.3.vcf +30 -0
- data/ext/vcftools/examples/valid-4.0.vcf +34 -0
- data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
- data/ext/vcftools/examples/valid-4.1.vcf +37 -0
- data/ext/vcftools/extconf.rb +2 -0
- data/ext/vcftools/perl/.svn/all-wcprops +149 -0
- data/ext/vcftools/perl/.svn/entries +844 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
- data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
- data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
- data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
- data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
- data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
- data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
- data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
- data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
- data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
- data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
- data/ext/vcftools/perl/ChangeLog +84 -0
- data/ext/vcftools/perl/FaSlice.pm +214 -0
- data/ext/vcftools/perl/Makefile +12 -0
- data/ext/vcftools/perl/Vcf.pm +2853 -0
- data/ext/vcftools/perl/VcfStats.pm +681 -0
- data/ext/vcftools/perl/fill-aa +103 -0
- data/ext/vcftools/perl/fill-an-ac +56 -0
- data/ext/vcftools/perl/fill-ref-md5 +204 -0
- data/ext/vcftools/perl/tab-to-vcf +92 -0
- data/ext/vcftools/perl/test.t +376 -0
- data/ext/vcftools/perl/vcf-annotate +1099 -0
- data/ext/vcftools/perl/vcf-compare +1193 -0
- data/ext/vcftools/perl/vcf-concat +310 -0
- data/ext/vcftools/perl/vcf-convert +180 -0
- data/ext/vcftools/perl/vcf-fix-newlines +97 -0
- data/ext/vcftools/perl/vcf-isec +660 -0
- data/ext/vcftools/perl/vcf-merge +577 -0
- data/ext/vcftools/perl/vcf-query +286 -0
- data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
- data/ext/vcftools/perl/vcf-sort +79 -0
- data/ext/vcftools/perl/vcf-stats +160 -0
- data/ext/vcftools/perl/vcf-subset +206 -0
- data/ext/vcftools/perl/vcf-to-tab +112 -0
- data/ext/vcftools/perl/vcf-validator +145 -0
- data/ext/vcftools/website/.svn/all-wcprops +41 -0
- data/ext/vcftools/website/.svn/entries +238 -0
- data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
- data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
- data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
- data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
- data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
- data/ext/vcftools/website/Makefile +6 -0
- data/ext/vcftools/website/README +2 -0
- data/ext/vcftools/website/VCF-poster.pdf +0 -0
- data/ext/vcftools/website/default.css +250 -0
- data/ext/vcftools/website/favicon.ico +0 -0
- data/ext/vcftools/website/favicon.png +0 -0
- data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/img/.svn/entries +300 -0
- data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
- data/ext/vcftools/website/img/bg.gif +0 -0
- data/ext/vcftools/website/img/bgcode.gif +0 -0
- data/ext/vcftools/website/img/bgcontainer.gif +0 -0
- data/ext/vcftools/website/img/bgul.gif +0 -0
- data/ext/vcftools/website/img/header.gif +0 -0
- data/ext/vcftools/website/img/li.gif +0 -0
- data/ext/vcftools/website/img/quote.gif +0 -0
- data/ext/vcftools/website/img/search.gif +0 -0
- data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/src/.svn/entries +300 -0
- data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
- data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
- data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
- data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
- data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
- data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
- data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
- data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
- data/ext/vcftools/website/src/docs.inc +202 -0
- data/ext/vcftools/website/src/index.inc +52 -0
- data/ext/vcftools/website/src/index.php +80 -0
- data/ext/vcftools/website/src/license.inc +27 -0
- data/ext/vcftools/website/src/links.inc +13 -0
- data/ext/vcftools/website/src/options.inc +654 -0
- data/ext/vcftools/website/src/perl_module.inc +249 -0
- data/ext/vcftools/website/src/specs.inc +18 -0
- data/lib/config.ru +9 -0
- data/lib/ngs_server/add.rb +9 -0
- data/lib/ngs_server/version.rb +1 -1
- data/lib/ngs_server.rb +55 -3
- data/ngs_server.gemspec +5 -2
- metadata +296 -6
@@ -0,0 +1,2853 @@
|
|
1
|
+
package Vcf;
|
2
|
+
|
3
|
+
# http://vcftools.sourceforge.net/specs.html
|
4
|
+
# http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
|
5
|
+
# http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:variant_call_format
|
6
|
+
# http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf4.0
|
7
|
+
# http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf_4.0_sv
|
8
|
+
# http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3
|
9
|
+
# http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcfv3.2
|
10
|
+
#
|
11
|
+
# Authors: petr.danecek@sanger
|
12
|
+
# for VCF v3.2, v3.3, v4.0, v4.1
|
13
|
+
#
|
14
|
+
|
15
|
+
=head1 NAME
|
16
|
+
|
17
|
+
Vcf.pm. Module for validation, parsing and creating VCF files.
|
18
|
+
Supported versions: 3.2, 3.3, 4.0, 4.1
|
19
|
+
|
20
|
+
=head1 SYNOPSIS
|
21
|
+
|
22
|
+
From the command line:
|
23
|
+
perl -MVcf -e validate example.vcf
|
24
|
+
perl -I/path/to/the/module/ -MVcf -e validate_v32 example.vcf
|
25
|
+
|
26
|
+
From a script:
|
27
|
+
use Vcf;
|
28
|
+
|
29
|
+
my $vcf = Vcf->new(file=>'example.vcf.gz',region=>'1:1000-2000');
|
30
|
+
$vcf->parse_header();
|
31
|
+
|
32
|
+
# Do some simple parsing. Most thorough but slowest way how to get the data.
|
33
|
+
while (my $x=$vcf->next_data_hash())
|
34
|
+
{
|
35
|
+
for my $gt (keys %{$$x{gtypes}})
|
36
|
+
{
|
37
|
+
my ($al1,$sep,$al2) = $vcf->parse_alleles($x,$gt);
|
38
|
+
print "\t$gt: $al1$sep$al2\n";
|
39
|
+
}
|
40
|
+
print "\n";
|
41
|
+
}
|
42
|
+
|
43
|
+
# This will split the fields and print a list of CHR:POS
|
44
|
+
while (my $x=$vcf->next_data_array())
|
45
|
+
{
|
46
|
+
print "$$x[0]:$$x[1]\n";
|
47
|
+
}
|
48
|
+
|
49
|
+
# This will return the lines as they were read, including the newline at the end
|
50
|
+
while (my $x=$vcf->next_line())
|
51
|
+
{
|
52
|
+
print $x;
|
53
|
+
}
|
54
|
+
|
55
|
+
# Only the columns NA00001, NA00002 and NA00003 will be printed.
|
56
|
+
my @columns = qw(NA00001 NA00002 NA00003);
|
57
|
+
print $vcf->format_header(\@columns);
|
58
|
+
while (my $x=$vcf->next_data_array())
|
59
|
+
{
|
60
|
+
# this will recalculate AC and AN counts, unless $vcf->recalc_ac_an was set to 0
|
61
|
+
print $vcf->format_line($x,\@columns);
|
62
|
+
}
|
63
|
+
|
64
|
+
$vcf->close();
|
65
|
+
|
66
|
+
=cut
|
67
|
+
|
68
|
+
|
69
|
+
use strict;
|
70
|
+
use warnings;
|
71
|
+
use Carp;
|
72
|
+
use Exporter;
|
73
|
+
use Data::Dumper;
|
74
|
+
use POSIX ":sys_wait_h";
|
75
|
+
|
76
|
+
use vars qw/@ISA @EXPORT/;
|
77
|
+
@ISA = qw/Exporter/;
|
78
|
+
@EXPORT = qw/validate validate_v32/;
|
79
|
+
|
80
|
+
=head2 validate
|
81
|
+
|
82
|
+
About : Validates the VCF file.
|
83
|
+
Usage : perl -MVcf -e validate example.vcf.gz # (from the command line)
|
84
|
+
validate('example.vcf.gz'); # (from a script)
|
85
|
+
validate(\*STDIN);
|
86
|
+
Args : File name or file handle. When no argument given, the first command line
|
87
|
+
argument is interpreted as the file name.
|
88
|
+
|
89
|
+
=cut
|
90
|
+
|
91
|
+
sub validate
|
92
|
+
{
|
93
|
+
my ($fh) = @_;
|
94
|
+
|
95
|
+
if ( !$fh && @ARGV ) { $fh = $ARGV[0]; }
|
96
|
+
|
97
|
+
my $vcf;
|
98
|
+
if ( $fh ) { $vcf = fileno($fh) ? Vcf->new(fh=>$fh) : Vcf->new(file=>$fh); }
|
99
|
+
else { $vcf = Vcf->new(fh=>\*STDIN); }
|
100
|
+
|
101
|
+
$vcf->run_validation();
|
102
|
+
}
|
103
|
+
|
104
|
+
|
105
|
+
=head2 validate_v32
|
106
|
+
|
107
|
+
About : Same as validate, but assumes v3.2 VCF version.
|
108
|
+
Usage : perl -MVcf -e validate_v32 example.vcf.gz # (from the command line)
|
109
|
+
Args : File name or file handle. When no argument given, the first command line
|
110
|
+
argument is interpreted as the file name.
|
111
|
+
|
112
|
+
=cut
|
113
|
+
|
114
|
+
sub validate_v32
|
115
|
+
{
|
116
|
+
my ($fh) = @_;
|
117
|
+
|
118
|
+
if ( !$fh && @ARGV && -e $ARGV[0] ) { $fh = $ARGV[0]; }
|
119
|
+
|
120
|
+
my %params = ( version=>'3.2' );
|
121
|
+
|
122
|
+
my $vcf;
|
123
|
+
if ( $fh ) { $vcf = fileno($fh) ? Vcf->new(%params, fh=>$fh) : Vcf->new(%params, file=>$fh); }
|
124
|
+
else { $vcf = Vcf->new(%params, fh=>\*STDIN); }
|
125
|
+
|
126
|
+
$vcf->run_validation();
|
127
|
+
}
|
128
|
+
|
129
|
+
|
130
|
+
=head2 new
|
131
|
+
|
132
|
+
About : Creates new VCF reader/writer.
|
133
|
+
Usage : my $vcf = Vcf->new(file=>'my.vcf', version=>'3.2');
|
134
|
+
Args :
|
135
|
+
fh .. Open file handle. If neither file nor fh is given, open in write mode.
|
136
|
+
file .. The file name. If neither file nor fh is given, open in write mode.
|
137
|
+
region .. Optional region to parse (requires tabix indexed VCF file)
|
138
|
+
silent .. Unless set to 0, warning messages may be printed.
|
139
|
+
strict .. Unless set to 0, the reader will die when the file violates the specification.
|
140
|
+
version .. If not given, '4.0' is assumed. The header information overrides this setting.
|
141
|
+
|
142
|
+
=cut
|
143
|
+
|
144
|
+
sub new
|
145
|
+
{
|
146
|
+
my ($class,@args) = @_;
|
147
|
+
my $self = {@args};
|
148
|
+
bless $self, ref($class) || $class;
|
149
|
+
|
150
|
+
$$self{silent} = 0 unless exists($$self{silent});
|
151
|
+
$$self{strict} = 0 unless exists($$self{strict});
|
152
|
+
$$self{buffer} = []; # buffer stores the lines in the reverse order
|
153
|
+
$$self{columns} = undef; # column names
|
154
|
+
$$self{mandatory} = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO'] unless exists($$self{mandatory});
|
155
|
+
$$self{reserved}{cols} = {CHROM=>1,POS=>1,ID=>1,REF=>1,ALT=>1,QUAL=>1,FILTER=>1,INFO=>1,FORMAT=>1} unless exists($$self{reserved_cols});
|
156
|
+
$$self{recalc_ac_an} = 1;
|
157
|
+
$$self{has_header} = 0;
|
158
|
+
$$self{default_version} = '4.1';
|
159
|
+
$$self{versions} = [ qw(Vcf3_2 Vcf3_3 Vcf4_0 Vcf4_1) ];
|
160
|
+
if ( !exists($$self{max_line_len}) && exists($ENV{MAX_VCF_LINE_LEN}) ) { $$self{max_line_len} = $ENV{MAX_VCF_LINE_LEN} }
|
161
|
+
$$self{fix_v40_AGtags} = $ENV{DONT_FIX_VCF40_AG_TAGS} ? 0 : 1;
|
162
|
+
my %open_args = ();
|
163
|
+
if ( exists($$self{region}) ) { $open_args{region}=$$self{region}; }
|
164
|
+
if ( exists($$self{print_header}) ) { $open_args{print_header}=$$self{print_header}; }
|
165
|
+
return $self->_open(%open_args);
|
166
|
+
}
|
167
|
+
|
168
|
+
sub throw
|
169
|
+
{
|
170
|
+
my ($self,@msg) = @_;
|
171
|
+
confess @msg,"\n";
|
172
|
+
}
|
173
|
+
|
174
|
+
sub warn
|
175
|
+
{
|
176
|
+
my ($self,@msg) = @_;
|
177
|
+
if ( $$self{silent} ) { return; }
|
178
|
+
if ( $$self{strict} ) { $self->throw(@msg); }
|
179
|
+
warn @msg;
|
180
|
+
}
|
181
|
+
|
182
|
+
sub _open
|
183
|
+
{
|
184
|
+
my ($self,%args) = @_;
|
185
|
+
|
186
|
+
if ( !exists($$self{fh}) && !exists($$self{file}) )
|
187
|
+
{
|
188
|
+
# Write mode, the version must be supplied by the user
|
189
|
+
return $self->_set_version(exists($$self{version}) ? $$self{version} : $$self{default_version});
|
190
|
+
}
|
191
|
+
|
192
|
+
# Open the file unless filehandle is provided
|
193
|
+
if ( !exists($$self{fh}) )
|
194
|
+
{
|
195
|
+
my $cmd = "<$$self{file}";
|
196
|
+
|
197
|
+
my $tabix_args = '';
|
198
|
+
if ( exists($args{print_header}) && $args{print_header} ) { $tabix_args .= ' -h '; }
|
199
|
+
$tabix_args .= $$self{file};
|
200
|
+
if ( exists($args{region}) && defined($args{region}) ) { $tabix_args .= ' '.$args{region}; }
|
201
|
+
|
202
|
+
if ( -e $$self{file} && $$self{file}=~/\.gz/i )
|
203
|
+
{
|
204
|
+
if ( exists($args{region}) && defined($args{region}) )
|
205
|
+
{
|
206
|
+
$cmd = "tabix $tabix_args |";
|
207
|
+
}
|
208
|
+
else { $cmd = "gunzip -c $$self{file} |"; }
|
209
|
+
$$self{check_exit_status} = 1;
|
210
|
+
}
|
211
|
+
elsif ( $$self{file}=~m{^(?:http|ftp)://} )
|
212
|
+
{
|
213
|
+
if ( !exists($args{region}) ) { $tabix_args .= ' .'; }
|
214
|
+
$cmd = "tabix $tabix_args |";
|
215
|
+
$$self{check_exit_status} = 1;
|
216
|
+
}
|
217
|
+
open($$self{fh},$cmd) or $self->throw("$cmd: $!");
|
218
|
+
}
|
219
|
+
|
220
|
+
# Set the correct VCF version, but only when called for the first time
|
221
|
+
my $vcf = $self;
|
222
|
+
if ( !$$self{_version_set} )
|
223
|
+
{
|
224
|
+
my $first_line = $self->next_line();
|
225
|
+
$vcf = $self->_set_version($first_line);
|
226
|
+
$self->_unread_line($first_line);
|
227
|
+
}
|
228
|
+
return $vcf;
|
229
|
+
}
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
=head2 open
|
234
|
+
|
235
|
+
About : (Re)Open file. No need to call this explicitly unless reading from a different
|
236
|
+
region is requested.
|
237
|
+
Usage : $vcf->open(); # Read from the start
|
238
|
+
$vcf->open(region=>'1:12345-92345');
|
239
|
+
Args : region .. Supported only for tabix indexed files
|
240
|
+
|
241
|
+
=cut
|
242
|
+
|
243
|
+
sub open
|
244
|
+
{
|
245
|
+
my ($self,%args) = @_;
|
246
|
+
$self->close();
|
247
|
+
$self->_open(%args);
|
248
|
+
}
|
249
|
+
|
250
|
+
|
251
|
+
=head2 close
|
252
|
+
|
253
|
+
About : Close the filehandle
|
254
|
+
Usage : $vcf->close();
|
255
|
+
Args : none
|
256
|
+
|
257
|
+
=cut
|
258
|
+
|
259
|
+
sub close
|
260
|
+
{
|
261
|
+
my ($self) = @_;
|
262
|
+
if ( !$$self{fh} ) { return; }
|
263
|
+
close($$self{fh});
|
264
|
+
delete($$self{fh});
|
265
|
+
}
|
266
|
+
|
267
|
+
|
268
|
+
=head2 next_line
|
269
|
+
|
270
|
+
About : Reads next VCF line.
|
271
|
+
Usage : my $vcf = Vcf->new();
|
272
|
+
my $x = $vcf->next_line();
|
273
|
+
Args : none
|
274
|
+
|
275
|
+
=cut
|
276
|
+
|
277
|
+
sub next_line
|
278
|
+
{
|
279
|
+
my ($self) = @_;
|
280
|
+
if ( @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); }
|
281
|
+
|
282
|
+
my $line;
|
283
|
+
if ( !exists($$self{max_line_len}) )
|
284
|
+
{
|
285
|
+
$line = readline($$self{fh});
|
286
|
+
}
|
287
|
+
else
|
288
|
+
{
|
289
|
+
while (1)
|
290
|
+
{
|
291
|
+
$line = readline($$self{fh});
|
292
|
+
if ( !defined $line ) { last; }
|
293
|
+
|
294
|
+
my $len = length($line);
|
295
|
+
if ( $len>$$self{max_line_len} && !($line=~/^#/) )
|
296
|
+
{
|
297
|
+
if ( !($line=~/^([^\t]+)\t([^\t]+)/) ) { $self->throw("Could not parse the line: $line"); }
|
298
|
+
$self->warn("The VCF line too long, ignoring: $1 $2 .. len=$len\n");
|
299
|
+
next;
|
300
|
+
}
|
301
|
+
last;
|
302
|
+
}
|
303
|
+
}
|
304
|
+
if ( !defined $line && $$self{check_exit_status} )
|
305
|
+
{
|
306
|
+
my $pid = waitpid(-1, WNOHANG);
|
307
|
+
if ( $pid!=0 && $pid!=-1 && $? !=0 )
|
308
|
+
{
|
309
|
+
$self->throw("Error reading VCF file.\n");
|
310
|
+
}
|
311
|
+
}
|
312
|
+
return $line;
|
313
|
+
}
|
314
|
+
|
315
|
+
sub _unread_line
|
316
|
+
{
|
317
|
+
my ($self,$line) = @_;
|
318
|
+
unshift @{$$self{buffer}}, $line;
|
319
|
+
return;
|
320
|
+
}
|
321
|
+
|
322
|
+
|
323
|
+
=head2 next_data_array
|
324
|
+
|
325
|
+
About : Reads next VCF line and splits it into an array. The last element is chomped.
|
326
|
+
Usage : my $vcf = Vcf->new();
|
327
|
+
$vcf->parse_header();
|
328
|
+
my $x = $vcf->next_data_array();
|
329
|
+
Args : Optional line to parse
|
330
|
+
|
331
|
+
=cut
|
332
|
+
|
333
|
+
sub next_data_array
|
334
|
+
{
|
335
|
+
my ($self,$line) = @_;
|
336
|
+
if ( !$line ) { $line = $self->next_line(); }
|
337
|
+
if ( !$line ) { return undef; }
|
338
|
+
my @items = split(/\t/,$line);
|
339
|
+
chomp($items[-1]);
|
340
|
+
return \@items;
|
341
|
+
}
|
342
|
+
|
343
|
+
|
344
|
+
=head2 set_samples
|
345
|
+
|
346
|
+
About : Parsing big VCF files with many sample columns is slow, not parsing unwanted samples may speed things a bit.
|
347
|
+
Usage : my $vcf = Vcf->new();
|
348
|
+
$vcf->set_samples(include=>['NA0001']); # Exclude all but this sample. When the array is empty, all samples will be excluded.
|
349
|
+
$vcf->set_samples(exclude=>['NA0003']); # Include only this sample. When the array is empty, all samples will be included.
|
350
|
+
my $x = $vcf->next_data_hash();
|
351
|
+
Args : Optional line to parse
|
352
|
+
|
353
|
+
=cut
|
354
|
+
|
355
|
+
sub set_samples
|
356
|
+
{
|
357
|
+
my ($self,%args) = @_;
|
358
|
+
|
359
|
+
if ( exists($args{include}) )
|
360
|
+
{
|
361
|
+
for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 0; }
|
362
|
+
for my $sample (@{$args{include}})
|
363
|
+
{
|
364
|
+
if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); }
|
365
|
+
my $idx = $$self{has_column}{$sample} - 1;
|
366
|
+
$$self{samples_to_parse}[$idx] = 1;
|
367
|
+
}
|
368
|
+
}
|
369
|
+
|
370
|
+
if ( exists($args{exclude}) )
|
371
|
+
{
|
372
|
+
for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 1; }
|
373
|
+
for my $sample (@{$args{exclude}})
|
374
|
+
{
|
375
|
+
if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); }
|
376
|
+
my $idx = $$self{has_column}{$sample} - 1;
|
377
|
+
$$self{samples_to_parse}[$idx] = 0;
|
378
|
+
}
|
379
|
+
}
|
380
|
+
}
|
381
|
+
|
382
|
+
|
383
|
+
sub _set_version
|
384
|
+
{
|
385
|
+
my ($self,$version_line) = @_;
|
386
|
+
|
387
|
+
if ( $$self{_version_set} ) { return $self; }
|
388
|
+
$$self{_version_set} = 1;
|
389
|
+
|
390
|
+
$$self{version} = $$self{default_version};
|
391
|
+
if ( $version_line )
|
392
|
+
{
|
393
|
+
if ( $version_line=~/^(\d+(?:\.\d+)?)$/ )
|
394
|
+
{
|
395
|
+
$$self{version} = $1;
|
396
|
+
undef $version_line;
|
397
|
+
}
|
398
|
+
elsif ( !($version_line=~/^##fileformat=/i) or !($version_line=~/(\d+(?:\.\d+)?)\s*$/i) )
|
399
|
+
{
|
400
|
+
$self->warn("Could not parse the fileformat version string [$version_line], assuming VCFv$$self{default_version}\n");
|
401
|
+
undef $version_line;
|
402
|
+
}
|
403
|
+
else
|
404
|
+
{
|
405
|
+
$$self{version} = $1;
|
406
|
+
}
|
407
|
+
}
|
408
|
+
|
409
|
+
my $reader;
|
410
|
+
if ( $$self{version} eq '3.2' ) { $reader=Vcf3_2->new(%$self); }
|
411
|
+
elsif ( $$self{version} eq '3.3' ) { $reader=Vcf3_3->new(%$self); }
|
412
|
+
elsif ( $$self{version} eq '4.0' ) { $reader=Vcf4_0->new(%$self); }
|
413
|
+
elsif ( $$self{version} eq '4.1' ) { $reader=Vcf4_1->new(%$self); }
|
414
|
+
else
|
415
|
+
{
|
416
|
+
$self->warn(qq[The version "$$self{version}" not supported, assuming VCFv$$self{default_version}\n]);
|
417
|
+
$$self{version} = '4.1';
|
418
|
+
$reader = Vcf4_1->new(%$self);
|
419
|
+
}
|
420
|
+
|
421
|
+
$self = $reader;
|
422
|
+
# When changing version, change also the fileformat header line
|
423
|
+
if ( exists($$self{header_lines}) && exists($$self{header_lines}[0]{key}) && $$self{header_lines}[0]{key} eq 'fileformat' )
|
424
|
+
{
|
425
|
+
shift(@{$$self{header_lines}});
|
426
|
+
}
|
427
|
+
|
428
|
+
return $self;
|
429
|
+
}
|
430
|
+
|
431
|
+
|
432
|
+
#---------------------------------------
|
433
|
+
|
434
|
+
package VcfReader;
|
435
|
+
use base qw(Vcf);
|
436
|
+
use strict;
|
437
|
+
use warnings;
|
438
|
+
use Carp;
|
439
|
+
use Data::Dumper;
|
440
|
+
|
441
|
+
sub new
|
442
|
+
{
|
443
|
+
my ($class,@args) = @_;
|
444
|
+
my $self = {@args};
|
445
|
+
bless $self, ref($class) || $class;
|
446
|
+
return $self;
|
447
|
+
}
|
448
|
+
|
449
|
+
|
450
|
+
=head2 next_data_hash
|
451
|
+
|
452
|
+
About : Reads next VCF line and splits it into a hash. This is the slowest way to obtain the data.
|
453
|
+
Usage : my $vcf = Vcf->new();
|
454
|
+
$vcf->parse_header();
|
455
|
+
my $x = $vcf->next_data_hash();
|
456
|
+
|
457
|
+
# Or having a VCF data line $line
|
458
|
+
my $x = $vcf->next_data_hash($line);
|
459
|
+
|
460
|
+
Args : Optional line to parse.
|
461
|
+
|
462
|
+
=cut
|
463
|
+
|
464
|
+
sub next_data_hash
|
465
|
+
{
|
466
|
+
my ($self,$line) = @_;
|
467
|
+
if ( !$line ) { $line = $self->next_line(); }
|
468
|
+
if ( !$line ) { return undef; }
|
469
|
+
my @items;
|
470
|
+
if ( ref($line) eq 'ARRAY' ) { @items = @$line; }
|
471
|
+
else { @items = split(/\t/,$line); }
|
472
|
+
chomp($items[-1]);
|
473
|
+
|
474
|
+
my $cols = $$self{columns};
|
475
|
+
if ( !$cols )
|
476
|
+
{
|
477
|
+
$self->_fake_column_names(scalar @items - 9);
|
478
|
+
$cols = $$self{columns};
|
479
|
+
}
|
480
|
+
|
481
|
+
# Check the number of columns
|
482
|
+
if ( scalar @items != scalar @$cols )
|
483
|
+
{
|
484
|
+
if ( $line=~/^\s*$/ ) { $self->throw("Sorry, empty lines not allowed.\n"); }
|
485
|
+
if ( $line=~/^#/ ) { $self->throw("FIXME: parse_header must be called before next_data_hash.\n"); }
|
486
|
+
|
487
|
+
$self->warn("Different number of columns at $items[0]:$items[1] (expected ".scalar @$cols.", got ".scalar @items.")\n");
|
488
|
+
while ( $items[-1] eq '' ) { pop(@items); }
|
489
|
+
if ( scalar @items != scalar @$cols )
|
490
|
+
{
|
491
|
+
my @test = split(/\s+/,$line);
|
492
|
+
if ( scalar @test == scalar @$cols ) { $self->warn("(Have spaces been used instead of tabs?)\n\n"); }
|
493
|
+
else { $self->throw("Error not recoverable, exiting.\n"); }
|
494
|
+
|
495
|
+
@items = @test;
|
496
|
+
}
|
497
|
+
else { $self->warn("(Trailing tabs?)\n\n"); }
|
498
|
+
}
|
499
|
+
my %out;
|
500
|
+
|
501
|
+
# Mandatory fields
|
502
|
+
$out{CHROM} = $items[0];
|
503
|
+
$out{POS} = $items[1];
|
504
|
+
$out{ID} = $items[2];
|
505
|
+
$out{REF} = $items[3];
|
506
|
+
$out{ALT} = [ split(/,/,$items[4]) ];
|
507
|
+
$out{QUAL} = $items[5];
|
508
|
+
$out{FILTER} = [ split(/;/,$items[6]) ];
|
509
|
+
|
510
|
+
# INFO, e.g. NS=58;DP=258;AF=0.786;DB;H2
|
511
|
+
if ( defined $items[7] )
|
512
|
+
{
|
513
|
+
my %hash;
|
514
|
+
for my $info (split(/;/,$items[7]))
|
515
|
+
{
|
516
|
+
my ($key,$val) = split(/=/,$info);
|
517
|
+
if ( !defined $key )
|
518
|
+
{
|
519
|
+
$self->warn("Broken VCF file, empty INFO field at $items[0]:$items[1]\n");
|
520
|
+
next;
|
521
|
+
}
|
522
|
+
if ( defined $val )
|
523
|
+
{
|
524
|
+
$hash{$key} = $val;
|
525
|
+
}
|
526
|
+
elsif ( exists($$self{header}{INFO}{$key}) )
|
527
|
+
{
|
528
|
+
$hash{$key} = $$self{header}{INFO}{$key}{default};
|
529
|
+
}
|
530
|
+
else
|
531
|
+
{
|
532
|
+
$hash{$key} = undef;
|
533
|
+
}
|
534
|
+
}
|
535
|
+
$out{INFO} = \%hash;
|
536
|
+
}
|
537
|
+
|
538
|
+
# The FORMAT field may not be present. GT:GQ:DP:HQ
|
539
|
+
my $format;
|
540
|
+
if ( $$cols[8] || $items[8] )
|
541
|
+
{
|
542
|
+
$format = $out{FORMAT} = [ split(/:/,$items[8]) ];
|
543
|
+
if ( (!$$format[0] || $$format[0] ne 'GT') && !$$self{ignore_missing_GT} ) { $self->warn("Expected GT as the first genotype field at $items[0]:$items[1]\n"); }
|
544
|
+
}
|
545
|
+
|
546
|
+
# Genotype fields
|
547
|
+
my %gtypes;
|
548
|
+
my $check_nformat = $$self{drop_trailings} ? 0 : 1;
|
549
|
+
for (my $icol=9; $icol<@items; $icol++)
|
550
|
+
{
|
551
|
+
if ( $items[$icol] eq '' ) { $self->warn("Empty column $$cols[$icol] at $items[0]:$items[1]\n"); next; }
|
552
|
+
if ( exists($$self{samples_to_parse}) && !$$self{samples_to_parse}[$icol] ) { next; }
|
553
|
+
|
554
|
+
my @fields = split(/:/, $items[$icol]);
|
555
|
+
if ( $check_nformat && @fields != @$format )
|
556
|
+
{
|
557
|
+
$self->warn("Different number of fields in the format and the column $$cols[$icol] at $items[0]:$items[1] ("
|
558
|
+
.scalar @fields." vs ".scalar @$format.": [",join(',',@fields),"] vs [",join(',',@$format),"])\n");
|
559
|
+
}
|
560
|
+
my %hash;
|
561
|
+
for (my $ifield=0; $ifield<@fields; $ifield++)
|
562
|
+
{
|
563
|
+
$hash{$$format[$ifield]} = $fields[$ifield];
|
564
|
+
}
|
565
|
+
$gtypes{$$cols[$icol]} = \%hash;
|
566
|
+
}
|
567
|
+
$out{gtypes} = \%gtypes;
|
568
|
+
|
569
|
+
return \%out;
|
570
|
+
}
|
571
|
+
|
572
|
+
|
573
|
+
=head2 parse_header
|
574
|
+
|
575
|
+
About : Reads (and stores) the VCF header.
|
576
|
+
Usage : my $vcf = Vcf->new(); $vcf->parse_header();
|
577
|
+
Args : silent .. do not warn about duplicate header lines
|
578
|
+
|
579
|
+
=cut
|
580
|
+
|
581
|
+
sub parse_header
|
582
|
+
{
|
583
|
+
my ($self,%args) = @_;
|
584
|
+
|
585
|
+
# First come the header lines prefixed by ##
|
586
|
+
while ($self->_next_header_line(%args)) { ; }
|
587
|
+
|
588
|
+
# Now comes the column names line prefixed by #
|
589
|
+
$self->_read_column_names();
|
590
|
+
}
|
591
|
+
|
592
|
+
|
593
|
+
=head2 _next_header_line
|
594
|
+
|
595
|
+
About : Stores the header lines and meta information, such as fields types, etc.
|
596
|
+
Args : silent .. do not warn about duplicate column names
|
597
|
+
|
598
|
+
=cut
|
599
|
+
|
600
|
+
sub _next_header_line
|
601
|
+
{
|
602
|
+
my ($self,%args) = @_;
|
603
|
+
my $line = $self->next_line();
|
604
|
+
if ( !defined $line ) { return undef; }
|
605
|
+
if ( substr($line,0,2) ne '##' )
|
606
|
+
{
|
607
|
+
$self->_unread_line($line);
|
608
|
+
return undef;
|
609
|
+
}
|
610
|
+
|
611
|
+
my $rec = $self->parse_header_line($line);
|
612
|
+
if ( $rec ) { $self->add_header_line($rec,%args); }
|
613
|
+
|
614
|
+
return $rec;
|
615
|
+
}
|
616
|
+
|
617
|
+
=head2 get_header_line
|
618
|
+
|
619
|
+
Usage : $vcf->get_header_line(key=>'INFO', ID=>'AC')
|
620
|
+
$vcf->get_header_line(key=>'FILTER', ID=>'q10')
|
621
|
+
$vcf->get_header_line(key=>'reference')
|
622
|
+
$vcf->get_header_line(key=>'contig',ID=>'20')
|
623
|
+
Args : Header line filter as in the example above
|
624
|
+
Returns : List ref of header line hashes matching the filter
|
625
|
+
|
626
|
+
=cut
|
627
|
+
|
628
|
+
sub get_header_line
|
629
|
+
{
|
630
|
+
my ($self,%filter) = @_;
|
631
|
+
|
632
|
+
my $key = $filter{key};
|
633
|
+
delete($filter{key});
|
634
|
+
|
635
|
+
my $id = $filter{ID};
|
636
|
+
|
637
|
+
my @out;
|
638
|
+
while (my ($hline_key,$hline_hash) = each %{$$self{header}})
|
639
|
+
{
|
640
|
+
if ( $key ne $hline_key ) { next; }
|
641
|
+
|
642
|
+
if ( defined $id )
|
643
|
+
{
|
644
|
+
if ( !exists($$hline_hash{$id}) ) { next; }
|
645
|
+
$hline_hash = $$hline_hash{$id};
|
646
|
+
}
|
647
|
+
|
648
|
+
my $match = 1;
|
649
|
+
while (my ($fkey,$fval) = each %filter)
|
650
|
+
{
|
651
|
+
if ( !exists($$hline_hash{$fkey}) or $$hline_hash{$fkey} ne $fval )
|
652
|
+
{
|
653
|
+
$match=0;
|
654
|
+
last;
|
655
|
+
}
|
656
|
+
}
|
657
|
+
if ( $match ) { push @out,$hline_hash }
|
658
|
+
}
|
659
|
+
return \@out;
|
660
|
+
}
|
661
|
+
|
662
|
+
|
663
|
+
=head2 add_header_line
|
664
|
+
|
665
|
+
Usage : $vcf->add_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
|
666
|
+
$vcf->add_header_line({key=>'reference',value=>'1000GenomesPilot-NCBI36'})
|
667
|
+
Args : Header line hash as in the example above
|
668
|
+
Hash with additional parameters [optional]
|
669
|
+
silent .. do not warn about existing header keys
|
670
|
+
append .. append timestamp to the name of the new one
|
671
|
+
Returns :
|
672
|
+
|
673
|
+
=cut
|
674
|
+
|
675
|
+
sub add_header_line
|
676
|
+
{
|
677
|
+
my ($self,$rec,%args) = @_;
|
678
|
+
|
679
|
+
if ( !%args ) { $args{silent}=0; }
|
680
|
+
|
681
|
+
my $key = $$rec{key};
|
682
|
+
if ( !$key ) { $self->throw("Missing key: ",Dumper($rec)); }
|
683
|
+
|
684
|
+
if ( exists($$rec{Type}) )
|
685
|
+
{
|
686
|
+
if ( !exists($$rec{default}) )
|
687
|
+
{
|
688
|
+
my $type = $$rec{Type};
|
689
|
+
if ( exists($$self{defaults}{$type}) ) { $$rec{default}=$$self{defaults}{$type}; }
|
690
|
+
else { $$rec{default}=$$self{defaults}{default}; }
|
691
|
+
}
|
692
|
+
if ( !exists($$rec{handler}) )
|
693
|
+
{
|
694
|
+
my $type = $$rec{Type};
|
695
|
+
if ( !exists($$self{handlers}{$type}) )
|
696
|
+
{
|
697
|
+
$self->warn("Unknown type [$type]\n");
|
698
|
+
$type = 'String';
|
699
|
+
$$rec{Type} = $type;
|
700
|
+
}
|
701
|
+
if ( exists($$self{handlers}{$type}) ) { $$rec{handler}=$$self{handlers}{$type}; }
|
702
|
+
else { $self->throw("Unknown type [$type].\n"); }
|
703
|
+
}
|
704
|
+
}
|
705
|
+
|
706
|
+
if ( exists($$rec{ID}) )
|
707
|
+
{
|
708
|
+
my $id = $$rec{ID};
|
709
|
+
if ( exists($$self{header}{$key}{$id}) ) { $self->remove_header_line(%$rec); }
|
710
|
+
$$self{header}{$key}{$id} = $rec;
|
711
|
+
push @{$$self{header_lines}}, $rec;
|
712
|
+
return;
|
713
|
+
}
|
714
|
+
|
715
|
+
if ( $args{append} )
|
716
|
+
{
|
717
|
+
my @tm = gmtime(time);
|
718
|
+
$key = sprintf "%s_%d%.2d%.2d", $key,$tm[5]+1900,$tm[4],$tm[3];
|
719
|
+
my $i = 1;
|
720
|
+
while ( exists($$self{header}{$key.'.'.$i}) ) { $i++; }
|
721
|
+
$key = $key.'.'.$i;
|
722
|
+
$$rec{key} = $key;
|
723
|
+
}
|
724
|
+
|
725
|
+
if ( $self->_header_line_exists($key,$rec) ) { $self->remove_header_line(%$rec); }
|
726
|
+
|
727
|
+
push @{$$self{header}{$key}}, $rec;
|
728
|
+
if ( $$rec{key} eq 'fileformat' )
|
729
|
+
{
|
730
|
+
unshift @{$$self{header_lines}}, $rec;
|
731
|
+
}
|
732
|
+
else
|
733
|
+
{
|
734
|
+
push @{$$self{header_lines}}, $rec;
|
735
|
+
}
|
736
|
+
}
|
737
|
+
|
738
|
+
sub _header_line_exists
|
739
|
+
{
|
740
|
+
my ($self,$key,$rec) = @_;
|
741
|
+
if ( !exists($$self{header}{$key}) ) { return 0; }
|
742
|
+
if ( $key eq 'fileformat' ) { return 1; }
|
743
|
+
for my $hrec (@{$$self{header}{$key}})
|
744
|
+
{
|
745
|
+
my $differ = 0;
|
746
|
+
for my $item (keys %$rec)
|
747
|
+
{
|
748
|
+
if ( !exists($$hrec{$item}) ) { $differ=1; last; }
|
749
|
+
if ( $$hrec{$item} ne $$rec{$item} ) { $differ=1; last; }
|
750
|
+
}
|
751
|
+
if ( !$differ ) { return $hrec; }
|
752
|
+
}
|
753
|
+
return 0;
|
754
|
+
}
|
755
|
+
|
756
|
+
=head2 remove_header_line
|
757
|
+
|
758
|
+
Usage : $vcf->remove_header_line(key=>'INFO', ID=>'AC')
|
759
|
+
Args :
|
760
|
+
Returns :
|
761
|
+
|
762
|
+
=cut
|
763
|
+
|
764
|
+
sub remove_header_line
|
765
|
+
{
|
766
|
+
my ($self,%args) = @_;
|
767
|
+
my $key = $args{key};
|
768
|
+
for (my $i=0; $i<@{$$self{header_lines}}; $i++)
|
769
|
+
{
|
770
|
+
my $line = $$self{header_lines}[$i];
|
771
|
+
if ( $$line{key} ne $key ) { next; }
|
772
|
+
if ( exists($args{ID}) )
|
773
|
+
{
|
774
|
+
if ( $args{ID} ne $$line{ID} ) { next; }
|
775
|
+
delete($$self{header}{$key}{$args{ID}});
|
776
|
+
splice(@{$$self{header_lines}},$i,1);
|
777
|
+
}
|
778
|
+
else
|
779
|
+
{
|
780
|
+
my $to_be_removed = $self->_header_line_exists($key,\%args);
|
781
|
+
if ( !$to_be_removed ) { next; }
|
782
|
+
for (my $j=0; $j<@{$$self{header}{$key}}; $j++)
|
783
|
+
{
|
784
|
+
if ( $$self{header}{$key}[$j] eq $to_be_removed ) { splice(@{$$self{header}{$key}},$j,1); last; }
|
785
|
+
}
|
786
|
+
splice(@{$$self{header_lines}},$i,1);
|
787
|
+
}
|
788
|
+
}
|
789
|
+
}
|
790
|
+
|
791
|
+
|
792
|
+
=head2 parse_header_line
|
793
|
+
|
794
|
+
Usage : $vcf->parse_header_line(q[##reference=1000GenomesPilot-NCBI36])
|
795
|
+
$vcf->parse_header_line(q[##INFO=NS,1,Integer,"Number of Samples With Data"])
|
796
|
+
Args :
|
797
|
+
Returns :
|
798
|
+
|
799
|
+
=cut
|
800
|
+
|
801
|
+
sub parse_header_line
|
802
|
+
{
|
803
|
+
my ($self,$line) = @_;
|
804
|
+
|
805
|
+
chomp($line);
|
806
|
+
$line =~ s/^##//;
|
807
|
+
|
808
|
+
if ( !($line=~/^([^=]+)=/) ) { return { key=>$line, value=>'' }; }
|
809
|
+
my $key = $1;
|
810
|
+
my $value = $';
|
811
|
+
|
812
|
+
my $desc;
|
813
|
+
if ( $value=~/,\s*\"([^\"]+)\"\s*$/ ) { $desc=$1; $value=$`; }
|
814
|
+
|
815
|
+
if ( !$desc ) { return { key=>$key, value=>$value }; }
|
816
|
+
|
817
|
+
if ( $key eq 'INFO' or $key eq 'FORMAT' )
|
818
|
+
{
|
819
|
+
my ($id,$number,$type,@rest) = split(/,\s*/,$value);
|
820
|
+
if ( !$type or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); }
|
821
|
+
return { key=>$key, ID=>$id, Number=>$number, Type=>$type, Description=>$desc };
|
822
|
+
}
|
823
|
+
if ( $key eq 'FILTER' )
|
824
|
+
{
|
825
|
+
my ($id,@rest) = split(/,\s*/,$value);
|
826
|
+
if ( !$id or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); }
|
827
|
+
return { key=>$key, ID=>$id, Description=>$desc };
|
828
|
+
}
|
829
|
+
$self->throw("Could not parse the header line: $line\n");
|
830
|
+
}
|
831
|
+
|
832
|
+
=head2 _read_column_names
|
833
|
+
|
834
|
+
About : Stores the column names as array $$self{columns} and hash $$self{has_column}{COL_NAME}=index.
|
835
|
+
The indexes go from 1.
|
836
|
+
Usage : $vcf->_read_column_names();
|
837
|
+
Args : none
|
838
|
+
|
839
|
+
=cut
|
840
|
+
|
841
|
+
sub _read_column_names
|
842
|
+
{
|
843
|
+
my ($self) = @_;
|
844
|
+
my $line = $self->next_line();
|
845
|
+
if ( !defined $line or substr($line,0,1) ne '#' ) { $self->throw("Broken VCF header, no column names?"); }
|
846
|
+
$$self{column_line} = $line;
|
847
|
+
|
848
|
+
chomp($line);
|
849
|
+
my @cols = split(/\t/, substr($line,1));
|
850
|
+
my $ncols = scalar @cols;
|
851
|
+
if ( $ncols == 1 )
|
852
|
+
{
|
853
|
+
# If there is only one name, it can be space-seprated instead of tab separated
|
854
|
+
@cols = split(/\s+/, $cols[0]);
|
855
|
+
$ncols = scalar @cols;
|
856
|
+
chomp($line);
|
857
|
+
if ( $ncols <= 1 ) { $self->warn("Could not parse the column names. [$line]\n"); return; }
|
858
|
+
$self->warn("The column names not tab-separated? [$line]\n");
|
859
|
+
}
|
860
|
+
|
861
|
+
my $fields = $$self{mandatory};
|
862
|
+
my $nfields = scalar @$fields;
|
863
|
+
|
864
|
+
# Check the names of the mandatory columns
|
865
|
+
if ( $ncols < $nfields )
|
866
|
+
{
|
867
|
+
chomp($line);
|
868
|
+
$self->warn("Missing some of the mandatory column names.\n\tGot: $line\n\tExpected: #", join("\t",@{$$self{mandatory}}),"\n");
|
869
|
+
return;
|
870
|
+
}
|
871
|
+
|
872
|
+
for (my $i=0; $i<$ncols; $i++)
|
873
|
+
{
|
874
|
+
if ( $cols[$i]=~/^\s+/ or $cols[$i]=~/\s+$/ )
|
875
|
+
{
|
876
|
+
$self->warn("The column name contains leading/trailing spaces, removing: '$cols[$i]'\n");
|
877
|
+
$cols[$i] =~ s/^\s+//;
|
878
|
+
$cols[$i] =~ s/\s+$//;
|
879
|
+
}
|
880
|
+
if ( $i<$nfields && $cols[$i] ne $$fields[$i] )
|
881
|
+
{
|
882
|
+
$self->warn("Expected mandatory column [$$fields[$i]], got [$cols[$i]]\n");
|
883
|
+
$cols[$i] = $$fields[$i];
|
884
|
+
}
|
885
|
+
$$self{has_column}{$cols[$i]} = $i+1;
|
886
|
+
}
|
887
|
+
$$self{columns} = \@cols;
|
888
|
+
return;
|
889
|
+
}
|
890
|
+
|
891
|
+
|
892
|
+
=head2 _fake_column_names
|
893
|
+
|
894
|
+
About : When no header is present, fake column names as the default mandatory ones + numbers
|
895
|
+
Args : The number of genotype columns; 0 if no genotypes but FORMAT present; <0 if FORMAT and genotypes not present
|
896
|
+
|
897
|
+
=cut
|
898
|
+
|
899
|
+
sub _fake_column_names
|
900
|
+
{
|
901
|
+
my ($self,$ncols) = @_;
|
902
|
+
|
903
|
+
$$self{columns} = [ @{$$self{mandatory}} ];
|
904
|
+
if ( $ncols>=0 ) { push @{$$self{columns}}, 'FORMAT'; }
|
905
|
+
for (my $i=1; $i<=$ncols; $i++) { push @{$$self{columns}}, $i; }
|
906
|
+
}
|
907
|
+
|
908
|
+
|
909
|
+
=head2 format_header
|
910
|
+
|
911
|
+
About : Returns the header.
|
912
|
+
Usage : print $vcf->format_header();
|
913
|
+
Args : The columns to include on output [optional]
|
914
|
+
|
915
|
+
=cut
|
916
|
+
|
917
|
+
sub format_header
|
918
|
+
{
|
919
|
+
my ($self,$columns) = @_;
|
920
|
+
|
921
|
+
my $out = '';
|
922
|
+
for my $line (@{$$self{header_lines}}) { $out .= $self->format_header_line($line); }
|
923
|
+
|
924
|
+
# This is required when using the API for writing new VCF files and the caller does not add the line explicitly
|
925
|
+
if ( !exists($$self{header_lines}[0]{key}) or $$self{header_lines}[0]{key} ne 'fileformat' )
|
926
|
+
{
|
927
|
+
$out = "##fileformat=VCFv$$self{version}\n" .$out;
|
928
|
+
}
|
929
|
+
if ( !$$self{columns} ) { return $out; }
|
930
|
+
|
931
|
+
my @out_cols;
|
932
|
+
if ( $columns )
|
933
|
+
{
|
934
|
+
@out_cols = @{$$self{columns}}[0..8];
|
935
|
+
for my $col (@$columns)
|
936
|
+
{
|
937
|
+
if ( exists($$self{has_column}{$col}) ) { push @out_cols, $col; }
|
938
|
+
}
|
939
|
+
}
|
940
|
+
else
|
941
|
+
{
|
942
|
+
@out_cols = @{$$self{columns}};
|
943
|
+
}
|
944
|
+
$out .= "#". join("\t", @out_cols). "\n";
|
945
|
+
|
946
|
+
return $out;
|
947
|
+
}
|
948
|
+
|
949
|
+
|
950
|
+
=head2 format_line
|
951
|
+
|
952
|
+
About : Returns the header.
|
953
|
+
Usage : $x = $vcf->next_data_hash(); print $vcf->format_line($x);
|
954
|
+
$x = $vcf->next_data_array(); print $vcf->format_line($x);
|
955
|
+
Args 1 : The columns or hash in the format returned by next_data_hash or next_data_array.
|
956
|
+
2 : The columns to include [optional]
|
957
|
+
|
958
|
+
=cut
|
959
|
+
|
960
|
+
sub format_line
|
961
|
+
{
|
962
|
+
my ($self,$record,$columns) = @_;
|
963
|
+
|
964
|
+
if ( ref($record) eq 'HASH' ) { return $self->_format_line_hash($record,$columns); }
|
965
|
+
$self->throw("FIXME: todo\n");
|
966
|
+
}
|
967
|
+
|
968
|
+
|
969
|
+
=head2 recalc_ac_an
|
970
|
+
|
971
|
+
About : Control if the AC and AN values should be updated.
|
972
|
+
Usage : $vcf->recalc_ac_an(1); $x = $vcf->next_data_hash(); print $vcf->format_line($x);
|
973
|
+
Args 1 : 0 .. never recalculate
|
974
|
+
1 .. recalculate if present
|
975
|
+
2 .. recalculate if present and add if missing
|
976
|
+
|
977
|
+
=cut
|
978
|
+
|
979
|
+
sub recalc_ac_an
|
980
|
+
{
|
981
|
+
my ($self,$value) = @_;
|
982
|
+
if ( $value eq '0' || $value eq '1' || $value eq '2' ) { $$self{recalc_ac_an} = $value; }
|
983
|
+
return;
|
984
|
+
}
|
985
|
+
|
986
|
+
|
987
|
+
|
988
|
+
sub _format_line_hash
|
989
|
+
{
|
990
|
+
my ($self,$record,$columns) = @_;
|
991
|
+
|
992
|
+
if ( !$$self{columns} )
|
993
|
+
{
|
994
|
+
my $ngtypes = scalar keys %{$$record{gtypes}};
|
995
|
+
if ( !$ngtypes && !exists($$record{FORMAT}) ) { $ngtypes--; }
|
996
|
+
$self->_fake_column_names($ngtypes);
|
997
|
+
}
|
998
|
+
my $cols = $$self{columns};
|
999
|
+
|
1000
|
+
# CHROM POS ID REF
|
1001
|
+
my $out;
|
1002
|
+
$out .= $$record{CHROM} . "\t";
|
1003
|
+
$out .= $$record{POS} . "\t";
|
1004
|
+
$out .= (defined $$record{ID} ? $$record{ID} : '.') . "\t";
|
1005
|
+
$out .= $$record{REF} . "\t";
|
1006
|
+
|
1007
|
+
# ALT
|
1008
|
+
$out .= join(',',@{$$record{ALT}} ? @{$$record{ALT}} : '.');
|
1009
|
+
|
1010
|
+
# QUAL
|
1011
|
+
$out .= "\t". $$record{QUAL};
|
1012
|
+
|
1013
|
+
# FILTER
|
1014
|
+
$out .= "\t". join(';',$$record{FILTER} ? @{$$record{FILTER}} : '.');
|
1015
|
+
|
1016
|
+
# Collect the gtypes of interest
|
1017
|
+
my $gtypes;
|
1018
|
+
if ( $columns )
|
1019
|
+
{
|
1020
|
+
# Select only those gtypes keys with a corresponding key in columns.
|
1021
|
+
for my $col (@$columns) { $$gtypes{$col} = $$record{gtypes}{$col}; }
|
1022
|
+
}
|
1023
|
+
else
|
1024
|
+
{
|
1025
|
+
$gtypes = $$record{gtypes};
|
1026
|
+
}
|
1027
|
+
|
1028
|
+
# INFO
|
1029
|
+
# .. calculate NS, AN and AC, but only if recalc_ac_an is set
|
1030
|
+
my $needs_an_ac = $$self{recalc_ac_an}==2 ? 1 : 0;
|
1031
|
+
my @info;
|
1032
|
+
while (my ($key,$value) = each %{$$record{INFO}})
|
1033
|
+
{
|
1034
|
+
if ( $$self{recalc_ac_an}>0 )
|
1035
|
+
{
|
1036
|
+
if ( $key eq 'AN' ) { $needs_an_ac=1; next; }
|
1037
|
+
if ( $key eq 'AC' ) { $needs_an_ac=1; next; }
|
1038
|
+
}
|
1039
|
+
if ( defined $value )
|
1040
|
+
{
|
1041
|
+
push @info, "$key=$value";
|
1042
|
+
}
|
1043
|
+
elsif ( $key ne '.' )
|
1044
|
+
{
|
1045
|
+
push @info, $key;
|
1046
|
+
}
|
1047
|
+
}
|
1048
|
+
if ( $needs_an_ac )
|
1049
|
+
{
|
1050
|
+
my $nalt = scalar @{$$record{ALT}};
|
1051
|
+
if ( $nalt==1 && $$record{ALT}[0] eq '.' ) { $nalt=0; }
|
1052
|
+
my ($an,$ac) = $self->calc_an_ac($gtypes,$nalt);
|
1053
|
+
push @info, "AN=$an","AC=$ac";
|
1054
|
+
}
|
1055
|
+
if ( !@info ) { push @info, '.'; }
|
1056
|
+
$out .= "\t". join(';', sort @info);
|
1057
|
+
|
1058
|
+
# FORMAT, the column is not required, it may not be present when there are no genotypes
|
1059
|
+
if ( exists($$cols[8]) && defined $$record{FORMAT} )
|
1060
|
+
{
|
1061
|
+
$out .= "\t". join(':',@{$$record{FORMAT}});
|
1062
|
+
}
|
1063
|
+
|
1064
|
+
# Genotypes: output all columns or only a selection?
|
1065
|
+
my @col_names = $columns ? @$columns : @$cols[9..@$cols-1];
|
1066
|
+
my $nformat = defined $$record{FORMAT} ? @{$$record{FORMAT}} : 0;
|
1067
|
+
for my $col (@col_names)
|
1068
|
+
{
|
1069
|
+
my $gt = $$gtypes{$col};
|
1070
|
+
my $can_drop = $$self{drop_trailings};
|
1071
|
+
my @gtype;
|
1072
|
+
for (my $i=$nformat-1; $i>=0; $i--)
|
1073
|
+
{
|
1074
|
+
my $field = $$record{FORMAT}[$i];
|
1075
|
+
if ( $i==0 ) { $can_drop=0; }
|
1076
|
+
|
1077
|
+
if ( exists($$gt{$field}) )
|
1078
|
+
{
|
1079
|
+
$can_drop = 0;
|
1080
|
+
if ( ref($$gt{$field}) eq 'HASH' )
|
1081
|
+
{
|
1082
|
+
# Special treatment for Number=[AG] tags
|
1083
|
+
unshift @gtype, $self->format_AGtag($record,$$gt{$field},$field);
|
1084
|
+
}
|
1085
|
+
else
|
1086
|
+
{
|
1087
|
+
unshift @gtype,$$gt{$field};
|
1088
|
+
}
|
1089
|
+
}
|
1090
|
+
elsif ( $can_drop ) { next; }
|
1091
|
+
elsif ( exists($$self{header}{FORMAT}{$field}{default}) ) { unshift @gtype,$$self{header}{FORMAT}{$field}{default}; $can_drop=0; }
|
1092
|
+
else { $self->throw(qq[No value for the field "$field" and no default available, column "$col" at $$record{CHROM}:$$record{POS}.\n]); }
|
1093
|
+
}
|
1094
|
+
$out .= "\t" . join(':',@gtype);
|
1095
|
+
}
|
1096
|
+
|
1097
|
+
$out .= "\n";
|
1098
|
+
return $out;
|
1099
|
+
}
|
1100
|
+
|
1101
|
+
sub calc_an_ac
|
1102
|
+
{
|
1103
|
+
my ($self,$gtypes,$nalleles) = @_;
|
1104
|
+
my $sep_re = $$self{regex_gtsep};
|
1105
|
+
my ($an,%ac_counts);
|
1106
|
+
if ( defined $nalleles )
|
1107
|
+
{
|
1108
|
+
for (my $i=1; $i<=$nalleles; $i++) { $ac_counts{$i}=0; }
|
1109
|
+
}
|
1110
|
+
$an = 0;
|
1111
|
+
for my $gt (keys %$gtypes)
|
1112
|
+
{
|
1113
|
+
my $value = $$gtypes{$gt}{GT};
|
1114
|
+
if ( !defined $value ) { next; } # GT may not be present
|
1115
|
+
my ($al1,$al2) = split($sep_re,$value);
|
1116
|
+
if ( defined($al1) && $al1 ne '.' )
|
1117
|
+
{
|
1118
|
+
$an++;
|
1119
|
+
if ( $al1 ne '0' ) { $ac_counts{$al1}++; }
|
1120
|
+
}
|
1121
|
+
if ( defined($al2) && $al2 ne '.' )
|
1122
|
+
{
|
1123
|
+
$an++;
|
1124
|
+
if ( $al2 ne '0' ) { $ac_counts{$al2}++; }
|
1125
|
+
}
|
1126
|
+
}
|
1127
|
+
my @ac;
|
1128
|
+
for my $ac ( sort { $a <=> $b } keys %ac_counts) { push @ac, $ac_counts{$ac}; }
|
1129
|
+
if ( !@ac ) { @ac = ('0'); }
|
1130
|
+
return ($an,join(',',@ac),\@ac);
|
1131
|
+
}
|
1132
|
+
|
1133
|
+
sub _validate_alt_field
|
1134
|
+
{
|
1135
|
+
my ($self,$values,$ref) = @_;
|
1136
|
+
|
1137
|
+
for (my $i=0; $i<@$values; $i++)
|
1138
|
+
{
|
1139
|
+
for (my $j=0; $j<$i; $j++)
|
1140
|
+
{
|
1141
|
+
if ( $$values[$i] eq $$values[$j] ) { return "The alleles not unique: $$values[$i]"; }
|
1142
|
+
}
|
1143
|
+
if ( $$values[$i] eq $ref ) { return "REF allele listed in the ALT field??"; }
|
1144
|
+
}
|
1145
|
+
return undef;
|
1146
|
+
}
|
1147
|
+
|
1148
|
+
=head2 validate_alt_field
|
1149
|
+
|
1150
|
+
Usage : my $x = $vcf->next_data_hash(); $vcf->validate_alt_field($$x{ALT});
|
1151
|
+
Args : The ALT arrayref
|
1152
|
+
Returns : Error message in case of an error.
|
1153
|
+
|
1154
|
+
=cut
|
1155
|
+
|
1156
|
+
sub validate_alt_field
|
1157
|
+
{
|
1158
|
+
my ($self,$values,$ref) = @_;
|
1159
|
+
|
1160
|
+
if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
|
1161
|
+
|
1162
|
+
my $ret = $self->_validate_alt_field($values,$ref);
|
1163
|
+
if ( $ret ) { return $ret; }
|
1164
|
+
|
1165
|
+
my @err;
|
1166
|
+
for my $item (@$values)
|
1167
|
+
{
|
1168
|
+
if ( $item=~/^[ACTGN]$/ ) { next; }
|
1169
|
+
elsif ( $item=~/^I[ACTGN]+$/ ) { next; }
|
1170
|
+
elsif ( $item=~/^D\d+$/ ) { next; }
|
1171
|
+
|
1172
|
+
push @err, $item;
|
1173
|
+
}
|
1174
|
+
if ( !@err ) { return undef; }
|
1175
|
+
return 'Could not parse the allele(s) [' .join(',',@err). ']';
|
1176
|
+
}
|
1177
|
+
|
1178
|
+
=head2 event_type
|
1179
|
+
|
1180
|
+
Usage : my $x = $vcf->next_data_hash();
|
1181
|
+
my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
|
1182
|
+
for my $allele (@$alleles)
|
1183
|
+
{
|
1184
|
+
my ($type,$len,$ht) = $vcf->event_type($x,$allele);
|
1185
|
+
}
|
1186
|
+
or
|
1187
|
+
my ($type,$len,$ht) = $vcf->event_type($ref,$al);
|
1188
|
+
Args : VCF data line parsed by next_data_hash or the reference allele
|
1189
|
+
: Allele
|
1190
|
+
Returns : 's' for SNP and number of SNPs in the record
|
1191
|
+
'i' for indel and a positive (resp. negative) number for the length of insertion (resp. deletion)
|
1192
|
+
'r' identical to the reference, length 0
|
1193
|
+
'o' for other (complex events) and the number of affected bases
|
1194
|
+
'b' breakend
|
1195
|
+
'u' unknown
|
1196
|
+
|
1197
|
+
=cut
|
1198
|
+
|
1199
|
+
sub event_type
|
1200
|
+
{
|
1201
|
+
my ($self,$rec,$allele) = @_;
|
1202
|
+
|
1203
|
+
my $ref = $rec;
|
1204
|
+
if ( ref($rec) eq 'HASH' )
|
1205
|
+
{
|
1206
|
+
if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
|
1207
|
+
$ref = $$rec{REF};
|
1208
|
+
}
|
1209
|
+
|
1210
|
+
my ($type,$len,$ht);
|
1211
|
+
if ( $allele eq $ref or $allele eq '.' ) { $len=0; $type='r'; $ht=$ref; }
|
1212
|
+
elsif ( $allele=~/^[ACGT]$/ ) { $len=1; $type='s'; $ht=$allele; }
|
1213
|
+
elsif ( $allele=~/^I/ ) { $len=length($allele)-1; $type='i'; $ht=$'; }
|
1214
|
+
elsif ( $allele=~/^D(\d+)/ ) { $len=-$1; $type='i'; $ht=''; }
|
1215
|
+
else
|
1216
|
+
{
|
1217
|
+
my $chr = ref($rec) eq 'HASH' ? $$rec{CHROM} : 'undef';
|
1218
|
+
my $pos = ref($rec) eq 'HASH' ? $$rec{POS} : 'undef';
|
1219
|
+
$self->throw("Eh?: $chr:$pos .. $ref $allele\n");
|
1220
|
+
}
|
1221
|
+
|
1222
|
+
if ( ref($rec) eq 'HASH' )
|
1223
|
+
{
|
1224
|
+
$$rec{_cached_events}{$allele} = [$type,$len,$ht];
|
1225
|
+
}
|
1226
|
+
return ($type,$len,$ht);
|
1227
|
+
}
|
1228
|
+
|
1229
|
+
|
1230
|
+
=head2 parse_AGtags
|
1231
|
+
|
1232
|
+
About : Breaks tags with variable number of fields (that is where Number is set to 'A' or 'G', such as GL) into hashes
|
1233
|
+
Usage : my $x = $vcf->next_data_hash(); my $values = $vcf->parse_AGtags($x);
|
1234
|
+
Args : VCF data line parsed by next_data_hash
|
1235
|
+
: Mapping between ALT representations based on different REFs [optional]
|
1236
|
+
: New REF [optional]
|
1237
|
+
Returns : Hash {Allele=>Value}
|
1238
|
+
|
1239
|
+
=cut
|
1240
|
+
|
1241
|
+
sub parse_AGtags
|
1242
|
+
{
|
1243
|
+
my ($self,$rec,$ref_alt_map,$new_ref) = @_;
|
1244
|
+
|
1245
|
+
if ( !exists($$rec{gtypes}) ) { return; }
|
1246
|
+
|
1247
|
+
my (@atags,@gtags);
|
1248
|
+
for my $fmt (@{$$rec{FORMAT}})
|
1249
|
+
{
|
1250
|
+
# These have been listed explicitly for proper merging of v4.0 VCFs
|
1251
|
+
if ( $$self{fix_v40_AGtags} )
|
1252
|
+
{
|
1253
|
+
if ( $fmt eq 'GL' or $fmt eq 'PL' ) { push @gtags,$fmt; next; }
|
1254
|
+
if ( $fmt eq 'AC' or $fmt eq 'AF' ) { push @atags,$fmt; next; }
|
1255
|
+
}
|
1256
|
+
if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; }
|
1257
|
+
if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A' ) { push @atags,$fmt; next; }
|
1258
|
+
if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { push @gtags,$fmt; next; }
|
1259
|
+
}
|
1260
|
+
my $missing = $$self{defaults}{default};
|
1261
|
+
if ( @atags )
|
1262
|
+
{
|
1263
|
+
# Parse Number=A tags
|
1264
|
+
my $alts;
|
1265
|
+
if ( defined $ref_alt_map )
|
1266
|
+
{
|
1267
|
+
$alts = [];
|
1268
|
+
for my $alt (@{$$rec{ALT}})
|
1269
|
+
{
|
1270
|
+
if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: $new_ref $alt...?\n"); }
|
1271
|
+
push @$alts, $$ref_alt_map{$new_ref}{$alt};
|
1272
|
+
}
|
1273
|
+
}
|
1274
|
+
else
|
1275
|
+
{
|
1276
|
+
$alts = $$rec{ALT};
|
1277
|
+
}
|
1278
|
+
for my $tag (@atags)
|
1279
|
+
{
|
1280
|
+
for my $sample (values %{$$rec{gtypes}})
|
1281
|
+
{
|
1282
|
+
if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; }
|
1283
|
+
my @values = split(/,/,$$sample{$tag});
|
1284
|
+
$$sample{$tag} = {};
|
1285
|
+
for (my $i=0; $i<@values; $i++)
|
1286
|
+
{
|
1287
|
+
$$sample{$tag}{$$alts[$i]} = $values[$i];
|
1288
|
+
}
|
1289
|
+
}
|
1290
|
+
}
|
1291
|
+
}
|
1292
|
+
if ( @gtags )
|
1293
|
+
{
|
1294
|
+
# Parse Number=G tags
|
1295
|
+
my @alleles;
|
1296
|
+
if ( defined $ref_alt_map )
|
1297
|
+
{
|
1298
|
+
push @alleles, $new_ref;
|
1299
|
+
for my $alt (@{$$rec{ALT}})
|
1300
|
+
{
|
1301
|
+
if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: [$new_ref] [$alt]...?\n", Dumper($ref_alt_map,$rec)); }
|
1302
|
+
push @alleles, $$ref_alt_map{$new_ref}{$alt};
|
1303
|
+
}
|
1304
|
+
}
|
1305
|
+
else
|
1306
|
+
{
|
1307
|
+
@alleles = ($$rec{REF},@{$$rec{ALT}});
|
1308
|
+
}
|
1309
|
+
my @gtypes;
|
1310
|
+
for (my $i=0; $i<@alleles; $i++)
|
1311
|
+
{
|
1312
|
+
for (my $j=0; $j<=$i; $j++)
|
1313
|
+
{
|
1314
|
+
push @gtypes, $alleles[$i].'/'.$alleles[$j];
|
1315
|
+
}
|
1316
|
+
}
|
1317
|
+
for my $tag (@gtags)
|
1318
|
+
{
|
1319
|
+
for my $sample (values %{$$rec{gtypes}})
|
1320
|
+
{
|
1321
|
+
if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; }
|
1322
|
+
my @values = split(/,/,$$sample{$tag});
|
1323
|
+
$$sample{$tag} = {};
|
1324
|
+
for (my $i=0; $i<@values; $i++)
|
1325
|
+
{
|
1326
|
+
$$sample{$tag}{$gtypes[$i]} = $values[$i];
|
1327
|
+
}
|
1328
|
+
}
|
1329
|
+
}
|
1330
|
+
}
|
1331
|
+
}
|
1332
|
+
|
1333
|
+
=head2 format_AGtag
|
1334
|
+
|
1335
|
+
About : Format tag with variable number of fields (that is where Number is set to 'A' or 'G', such as GL)
|
1336
|
+
Usage :
|
1337
|
+
Args :
|
1338
|
+
:
|
1339
|
+
:
|
1340
|
+
Returns :
|
1341
|
+
|
1342
|
+
=cut
|
1343
|
+
|
1344
|
+
sub format_AGtag
|
1345
|
+
{
|
1346
|
+
my ($self,$record,$tag_data,$tag) = @_;
|
1347
|
+
|
1348
|
+
# The FORMAT field is checked only once and the results are cached.
|
1349
|
+
if ( !exists($$record{_atags}) )
|
1350
|
+
{
|
1351
|
+
$$record{_atags} = {};
|
1352
|
+
|
1353
|
+
# Check if there are any A,G tags
|
1354
|
+
for my $fmt (@{$$record{FORMAT}})
|
1355
|
+
{
|
1356
|
+
# These have been listed explicitly for proper merging of v4.0 VCFs
|
1357
|
+
if ( $$self{fix_v40_AGtags} )
|
1358
|
+
{
|
1359
|
+
if ( $fmt eq 'GL' or $fmt eq 'PL' ) { $$record{_gtags}{$fmt}=1; next; }
|
1360
|
+
if ( $fmt eq 'AC' or $fmt eq 'AF' ) { $$record{_atags}{$fmt}=1; next; }
|
1361
|
+
}
|
1362
|
+
if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; }
|
1363
|
+
if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A' ) { $$record{_atags}{$fmt}=1; next; }
|
1364
|
+
if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { $$record{_gtags}{$fmt}=1; next; }
|
1365
|
+
}
|
1366
|
+
}
|
1367
|
+
|
1368
|
+
my @out;
|
1369
|
+
if ( exists($$record{_atags}{$tag}) )
|
1370
|
+
{
|
1371
|
+
for my $alt (@{$$record{ALT}})
|
1372
|
+
{
|
1373
|
+
push @out, exists($$tag_data{$alt}) ? $$tag_data{$alt} : $$self{defaults}{default};
|
1374
|
+
}
|
1375
|
+
}
|
1376
|
+
|
1377
|
+
if ( exists($$record{_gtags}{$tag}) )
|
1378
|
+
{
|
1379
|
+
my $gtypes = $$record{_gtypes};
|
1380
|
+
my $gtypes2 = $$record{_gtypes2};
|
1381
|
+
if ( !defined $gtypes )
|
1382
|
+
{
|
1383
|
+
$gtypes = [];
|
1384
|
+
$gtypes2 = [];
|
1385
|
+
|
1386
|
+
my @alleles = ( $$record{REF}, @{$$record{ALT}} );
|
1387
|
+
for (my $i=0; $i<@alleles; $i++)
|
1388
|
+
{
|
1389
|
+
for (my $j=0; $j<=$i; $j++)
|
1390
|
+
{
|
1391
|
+
push @$gtypes, $alleles[$i].'/'.$alleles[$j];
|
1392
|
+
push @$gtypes2, $alleles[$j].'/'.$alleles[$i];
|
1393
|
+
}
|
1394
|
+
}
|
1395
|
+
|
1396
|
+
$$record{_gtypes} = $gtypes;
|
1397
|
+
$$record{_gtypes2} = $gtypes2;
|
1398
|
+
}
|
1399
|
+
|
1400
|
+
for (my $i=0; $i<@$gtypes; $i++)
|
1401
|
+
{
|
1402
|
+
my $gt = $$gtypes[$i];
|
1403
|
+
if ( !exists($$tag_data{$gt}) ) { $gt = $$gtypes2[$i]; }
|
1404
|
+
push @out, exists($$tag_data{$gt}) ? $$tag_data{$gt} : $$self{defaults}{default};
|
1405
|
+
}
|
1406
|
+
}
|
1407
|
+
|
1408
|
+
return join(',',@out);
|
1409
|
+
}
|
1410
|
+
|
1411
|
+
=head2 parse_alleles
|
1412
|
+
|
1413
|
+
About : Deprecated, use parse_haplotype instead.
|
1414
|
+
Usage : my $x = $vcf->next_data_hash(); my ($al1,$sep,$al2) = $vcf->parse_alleles($x,'NA00001');
|
1415
|
+
Args : VCF data line parsed by next_data_hash
|
1416
|
+
: The genotype column name
|
1417
|
+
Returns : Alleles and the separator. If only one allele is present, $sep and $al2 will be an empty string.
|
1418
|
+
|
1419
|
+
=cut
|
1420
|
+
|
1421
|
+
sub parse_alleles
|
1422
|
+
{
|
1423
|
+
my ($self,$rec,$column) = @_;
|
1424
|
+
if ( !exists($$rec{gtypes}) || !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); }
|
1425
|
+
|
1426
|
+
my $gtype = $$rec{gtypes}{$column}{GT};
|
1427
|
+
if ( !($gtype=~$$self{regex_gt}) ) { $self->throw("Could not parse gtype string [$gtype] [$$rec{CHROM}:$$rec{POS}]\n"); }
|
1428
|
+
my $al1 = $1;
|
1429
|
+
my $sep = $2;
|
1430
|
+
my $al2 = $3;
|
1431
|
+
|
1432
|
+
if ( !$al1 ) { $al1 = $$rec{REF}; }
|
1433
|
+
elsif ( $al1 ne '.' )
|
1434
|
+
{
|
1435
|
+
if ( !($al1=~/^\d+$/) ) { $self->throw("Uh, what is this? [$al1] $$rec{CHROM}:$$rec{POS}\n"); }
|
1436
|
+
$al1 = $$rec{ALT}[$al1-1];
|
1437
|
+
}
|
1438
|
+
|
1439
|
+
if ( !defined $al2 or $al2 eq '' )
|
1440
|
+
{
|
1441
|
+
$sep = '';
|
1442
|
+
$al2 = '';
|
1443
|
+
}
|
1444
|
+
else
|
1445
|
+
{
|
1446
|
+
if ( !$al2 ) { $al2 = $$rec{REF}; }
|
1447
|
+
elsif ( $al2 ne '.' ) { $al2 = $$rec{ALT}[$al2-1]; }
|
1448
|
+
}
|
1449
|
+
return ($al1,$sep,$al2);
|
1450
|
+
}
|
1451
|
+
|
1452
|
+
=head2 parse_haplotype
|
1453
|
+
|
1454
|
+
About : Similar to parse_alleles, supports also multiploid VCFs.
|
1455
|
+
Usage : my $x = $vcf->next_data_hash(); my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
|
1456
|
+
Args : VCF data line parsed by next_data_hash
|
1457
|
+
: The genotype column name
|
1458
|
+
Returns : Two array refs and two boolean flags: List of alleles, list of separators, and is_phased/empty flags. The values
|
1459
|
+
can be cashed and must be therefore considered read only!
|
1460
|
+
|
1461
|
+
=cut
|
1462
|
+
|
1463
|
+
sub parse_haplotype
|
1464
|
+
{
|
1465
|
+
my ($self,$rec,$column) = @_;
|
1466
|
+
if ( !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); }
|
1467
|
+
if ( !exists($$rec{gtypes}{$column}{GT}) ) { return (['.'],[],0,1); }
|
1468
|
+
|
1469
|
+
my $gtype = $$rec{gtypes}{$column}{GT};
|
1470
|
+
if ( exists($$rec{_cached_haplotypes}{$gtype}) ) { return (@{$$rec{_cached_haplotypes}{$gtype}}); }
|
1471
|
+
|
1472
|
+
my @alleles = ();
|
1473
|
+
my @seps = ();
|
1474
|
+
my $is_phased = 0;
|
1475
|
+
my $is_empty = 1;
|
1476
|
+
|
1477
|
+
my $buf = $gtype;
|
1478
|
+
while ($buf ne '')
|
1479
|
+
{
|
1480
|
+
if ( !($buf=~m{^(\.|\d+)([|/]?)}) ) { $self->throw("Could not parse gtype string [$gtype] .. $$rec{CHROM}:$$rec{POS} $column\n"); }
|
1481
|
+
$buf = $';
|
1482
|
+
|
1483
|
+
if ( $1 eq '.' ) { push @alleles,'.'; }
|
1484
|
+
else
|
1485
|
+
{
|
1486
|
+
$is_empty = 0;
|
1487
|
+
if ( $1 eq '0' ) { push @alleles,$$rec{REF}; }
|
1488
|
+
elsif ( exists($$rec{ALT}[$1-1]) ) { push @alleles,$$rec{ALT}[$1-1]; }
|
1489
|
+
else { $self->throw(qq[The haplotype indexes in "$gtype" do not match the ALT column .. $$rec{CHROM}:$$rec{POS} $column\n]); }
|
1490
|
+
}
|
1491
|
+
if ( $2 )
|
1492
|
+
{
|
1493
|
+
if ( $2 eq '|' ) { $is_phased=1; }
|
1494
|
+
push @seps,$2;
|
1495
|
+
}
|
1496
|
+
}
|
1497
|
+
$$rec{_cached_haplotypes}{$gtype} = [\@alleles,\@seps,$is_phased,$is_empty];
|
1498
|
+
return (@{$$rec{_cached_haplotypes}{$gtype}});
|
1499
|
+
}
|
1500
|
+
|
1501
|
+
=head2 format_haplotype
|
1502
|
+
|
1503
|
+
Usage : my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001'); print $vcf->format_haplotype($alleles,$seps);
|
1504
|
+
|
1505
|
+
=cut
|
1506
|
+
|
1507
|
+
sub format_haplotype
|
1508
|
+
{
|
1509
|
+
my ($self,$alleles,$seps) = @_;
|
1510
|
+
if ( @$alleles != @$seps+1 ) { $self->throw(sprintf("Uh: %d vs %d\n",scalar @$alleles,scalar @$seps),Dumper($alleles,$seps)); }
|
1511
|
+
my $out = $$alleles[0];
|
1512
|
+
for (my $i=1; $i<@$alleles; $i++)
|
1513
|
+
{
|
1514
|
+
$out .= $$seps[$i-1];
|
1515
|
+
$out .= $$alleles[$i];
|
1516
|
+
}
|
1517
|
+
return $out;
|
1518
|
+
}
|
1519
|
+
|
1520
|
+
|
1521
|
+
=head2 format_genotype_strings
|
1522
|
+
|
1523
|
+
Usage : my $x = { REF=>'A', gtypes=>{'NA00001'=>{'GT'=>'A/C'}}, FORMAT=>['GT'], CHROM=>1, POS=>1, FILTER=>['.'], QUAL=>-1 };
|
1524
|
+
$vcf->format_genotype_strings($x);
|
1525
|
+
print $vcf->format_line($x);
|
1526
|
+
Args 1 : VCF data line in the format as if parsed by next_data_hash with alleles written as letters.
|
1527
|
+
2 : Optionally, a subset of columns can be supplied. See also format_line.
|
1528
|
+
Returns : Modifies the ALT array and the genotypes so that ref alleles become 0 and non-ref alleles
|
1529
|
+
numbers starting from 1.
|
1530
|
+
|
1531
|
+
=cut
|
1532
|
+
|
1533
|
+
sub format_genotype_strings
|
1534
|
+
{
|
1535
|
+
my ($self,$rec,$columns) = @_;
|
1536
|
+
|
1537
|
+
if ( !exists($$rec{gtypes}) ) { return; }
|
1538
|
+
|
1539
|
+
my $ref = $$rec{REF};
|
1540
|
+
my $nalts = 0;
|
1541
|
+
my %alts = ();
|
1542
|
+
my $gt_re = $$self{regex_gt2};
|
1543
|
+
|
1544
|
+
if ( !$columns ) { $columns = [keys %{$$rec{gtypes}}]; }
|
1545
|
+
|
1546
|
+
for my $key (@$columns)
|
1547
|
+
{
|
1548
|
+
my $gtype = $$rec{gtypes}{$key}{GT};
|
1549
|
+
my $buf = $gtype;
|
1550
|
+
my $out = '';
|
1551
|
+
while ($buf ne '')
|
1552
|
+
{
|
1553
|
+
if ( !($buf=~$gt_re) ) { $self->throw("Could not parse gtype string [$gtype]\n"); }
|
1554
|
+
$buf = $';
|
1555
|
+
|
1556
|
+
my $al = $1;
|
1557
|
+
my $sep = $2;
|
1558
|
+
if ( $al eq $ref or $al eq '0' or $al eq '*' ) { $al=0; }
|
1559
|
+
else
|
1560
|
+
{
|
1561
|
+
if ( $al=~/^\d+$/ )
|
1562
|
+
{
|
1563
|
+
if ( !exists($$rec{ALT}[$al-1]) ) { $self->throw("Broken ALT, index $al out of bounds\n"); }
|
1564
|
+
$al = $$rec{ALT}[$al-1];
|
1565
|
+
}
|
1566
|
+
|
1567
|
+
if ( exists($alts{$al}) ) { $al = $alts{$al} }
|
1568
|
+
elsif ( $al=~$$self{regex_snp} or $al=~$$self{regex_ins} or $al=~$$self{regex_del} )
|
1569
|
+
{
|
1570
|
+
$alts{$al} = ++$nalts;
|
1571
|
+
$al = $nalts;
|
1572
|
+
}
|
1573
|
+
elsif ( $al ne '.' )
|
1574
|
+
{
|
1575
|
+
$self->throw("Could not parse the genotype string [$gtype]\n");
|
1576
|
+
}
|
1577
|
+
|
1578
|
+
}
|
1579
|
+
$out .= $al;
|
1580
|
+
if ( $sep ) { $out .= $sep; }
|
1581
|
+
}
|
1582
|
+
$$rec{gtypes}{$key}{GT} = $out;
|
1583
|
+
}
|
1584
|
+
$$rec{ALT} = [ sort { $alts{$a}<=>$alts{$b} } keys %alts ];
|
1585
|
+
}
|
1586
|
+
|
1587
|
+
sub fill_ref_alt_mapping
|
1588
|
+
{
|
1589
|
+
my ($self,$map) = @_;
|
1590
|
+
|
1591
|
+
my $new_ref;
|
1592
|
+
for my $ref (keys %$map)
|
1593
|
+
{
|
1594
|
+
$new_ref = $ref;
|
1595
|
+
if ( $ref ne $new_ref ) { $self->throw("The reference prefixes do not agree: $ref vs $new_ref\n"); }
|
1596
|
+
for my $alt (keys %{$$map{$ref}})
|
1597
|
+
{
|
1598
|
+
$$map{$ref}{$alt} = $alt;
|
1599
|
+
}
|
1600
|
+
}
|
1601
|
+
$$map{$new_ref}{$new_ref} = $new_ref;
|
1602
|
+
return $new_ref;
|
1603
|
+
}
|
1604
|
+
|
1605
|
+
|
1606
|
+
=head2 format_header_line
|
1607
|
+
|
1608
|
+
Usage : $vcf->format_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
|
1609
|
+
Args :
|
1610
|
+
Returns :
|
1611
|
+
|
1612
|
+
=cut
|
1613
|
+
|
1614
|
+
sub format_header_line
|
1615
|
+
{
|
1616
|
+
my ($self,$rec) = @_;
|
1617
|
+
my $line = "##$$rec{key}";
|
1618
|
+
$line .= "=$$rec{value}" unless !exists($$rec{value});
|
1619
|
+
$line .= "=$$rec{ID}" unless !exists($$rec{ID});
|
1620
|
+
$line .= ",$$rec{Number}" unless !exists($$rec{Number});
|
1621
|
+
$line .= ",$$rec{Type}" unless !exists($$rec{Type});
|
1622
|
+
$line .= qq[,"$$rec{Description}"] unless !exists($$rec{Description});
|
1623
|
+
$line .= "\n";
|
1624
|
+
return $line;
|
1625
|
+
}
|
1626
|
+
|
1627
|
+
=head2 add_columns
|
1628
|
+
|
1629
|
+
Usage : $vcf->add_columns('NA001','NA0002');
|
1630
|
+
Args :
|
1631
|
+
Returns :
|
1632
|
+
|
1633
|
+
=cut
|
1634
|
+
|
1635
|
+
sub add_columns
|
1636
|
+
{
|
1637
|
+
my ($self,@columns) = @_;
|
1638
|
+
if ( !$$self{columns} )
|
1639
|
+
{
|
1640
|
+
# The columns should be initialized de novo. Figure out if the @columns contain also the mandatory
|
1641
|
+
# columns and if FORMAT should be present (it can be absent when there is no genotype column present).
|
1642
|
+
my $has_other = 0;
|
1643
|
+
for my $col (@columns)
|
1644
|
+
{
|
1645
|
+
if ( !exists($$self{reserved}{cols}{$col}) ) { $has_other=1; last; }
|
1646
|
+
}
|
1647
|
+
|
1648
|
+
$$self{columns} = [ @{$$self{mandatory}} ];
|
1649
|
+
if ( $has_other ) { push @{$$self{columns}},'FORMAT'; }
|
1650
|
+
|
1651
|
+
for my $col (@{$$self{columns}}) { $$self{has_column}{$col}=1; }
|
1652
|
+
}
|
1653
|
+
my $ncols = @{$$self{columns}};
|
1654
|
+
for my $col (@columns)
|
1655
|
+
{
|
1656
|
+
if ( $$self{has_column}{$col} ) { next; }
|
1657
|
+
$ncols++;
|
1658
|
+
push @{$$self{columns}}, $col;
|
1659
|
+
}
|
1660
|
+
}
|
1661
|
+
|
1662
|
+
=head2 add_format_field
|
1663
|
+
|
1664
|
+
Usage : $x=$vcf->next_data_hash(); $vcf->add_format_field($x,'FOO'); $$x{gtypes}{NA0001}{FOO}='Bar'; print $vcf->format_line($x);
|
1665
|
+
Args : The record obtained by next_data_hash
|
1666
|
+
: The field name
|
1667
|
+
Returns :
|
1668
|
+
|
1669
|
+
=cut
|
1670
|
+
|
1671
|
+
sub add_format_field
|
1672
|
+
{
|
1673
|
+
my ($self,$rec,$field) = @_;
|
1674
|
+
|
1675
|
+
if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; }
|
1676
|
+
|
1677
|
+
for my $key (@{$$rec{FORMAT}})
|
1678
|
+
{
|
1679
|
+
if ( $key eq $field ) { return; } # already there
|
1680
|
+
}
|
1681
|
+
push @{$$rec{FORMAT}}, $field;
|
1682
|
+
}
|
1683
|
+
|
1684
|
+
|
1685
|
+
=head2 remove_format_field
|
1686
|
+
|
1687
|
+
Usage : $x=$vcf->next_data_hash(); $vcf->remove_format_field($x,'FOO'); print $vcf->format_line($x);
|
1688
|
+
Args : The record obtained by next_data_hash
|
1689
|
+
: The field name
|
1690
|
+
Returns :
|
1691
|
+
|
1692
|
+
=cut
|
1693
|
+
|
1694
|
+
sub remove_format_field
|
1695
|
+
{
|
1696
|
+
my ($self,$rec,$field) = @_;
|
1697
|
+
|
1698
|
+
if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; }
|
1699
|
+
|
1700
|
+
my $i = 0;
|
1701
|
+
for my $key (@{$$rec{FORMAT}})
|
1702
|
+
{
|
1703
|
+
if ( $key eq $field ) { splice @{$$rec{FORMAT}},$i,1; }
|
1704
|
+
$i++;
|
1705
|
+
}
|
1706
|
+
}
|
1707
|
+
|
1708
|
+
|
1709
|
+
=head2 add_info_field
|
1710
|
+
|
1711
|
+
Usage : $x=$vcf->next_data_array(); $$x[7]=$vcf->add_info_field($$x[7],'FOO'=>'value','BAR'=>undef,'BAZ'=>''); print join("\t",@$x)."\n";
|
1712
|
+
Args : The record obtained by next_data_array
|
1713
|
+
: The INFO field name and value pairs. If value is undef and the key is present in $$x[7],
|
1714
|
+
it will be removed. To add fields without a value, use empty string ''.
|
1715
|
+
Returns : The formatted INFO.
|
1716
|
+
|
1717
|
+
=cut
|
1718
|
+
|
1719
|
+
sub add_info_field
|
1720
|
+
{
|
1721
|
+
my ($self,$info,%fields) = @_;
|
1722
|
+
|
1723
|
+
my @out = ();
|
1724
|
+
|
1725
|
+
# First handle the existing values, keep everything unless in %fields
|
1726
|
+
for my $field (split(/;/,$info))
|
1727
|
+
{
|
1728
|
+
my ($key,$value) = split(/=/,$field);
|
1729
|
+
if ( $key eq '.' ) { next; }
|
1730
|
+
if ( !exists($fields{$key}) ) { push @out,$field; next; }
|
1731
|
+
}
|
1732
|
+
|
1733
|
+
# Now add the new values and remove the unwanted ones
|
1734
|
+
while (my ($key,$value)=each %fields)
|
1735
|
+
{
|
1736
|
+
if ( !defined($value) ) { next; } # this one should be removed
|
1737
|
+
if ( $value eq '' ) { push @out,$key; } # this one is of the form HM2 in contrast to DP=3
|
1738
|
+
else { push @out,"$key=$value"; } # this is the standard key=value pair
|
1739
|
+
}
|
1740
|
+
if ( !@out ) { push @out,'.'; }
|
1741
|
+
return join(';',@out);
|
1742
|
+
}
|
1743
|
+
|
1744
|
+
|
1745
|
+
=head2 add_filter
|
1746
|
+
|
1747
|
+
Usage : $x=$vcf->next_data_array(); $$x[6]=$vcf->add_filter($$x[6],'SnpCluster'=>1,'q10'=>0); print join("\t",@$x)."\n";
|
1748
|
+
Args : The record obtained by next_data_array or next_data_hash
|
1749
|
+
: The key-value pairs for filter to be added. If value is 1, the filter will be added. If 0, the filter will be removed.
|
1750
|
+
Returns : The formatted filter field.
|
1751
|
+
|
1752
|
+
=cut
|
1753
|
+
|
1754
|
+
sub add_filter
|
1755
|
+
{
|
1756
|
+
my ($self,$filter,%filters) = @_;
|
1757
|
+
|
1758
|
+
my @out = ();
|
1759
|
+
my @filters = ref($filter) eq 'ARRAY' ? @$filter : split(/;/,$filter);
|
1760
|
+
|
1761
|
+
# First handle the existing filters, keep everything unless in %filters
|
1762
|
+
for my $key (@filters)
|
1763
|
+
{
|
1764
|
+
if ( $key eq '.' or $key eq 'PASS' ) { next; }
|
1765
|
+
if ( !exists($filters{$key}) ) { push @out,$key; next; }
|
1766
|
+
}
|
1767
|
+
|
1768
|
+
# Now add the new filters and remove the unwanted ones
|
1769
|
+
while (my ($key,$value)=each %filters)
|
1770
|
+
{
|
1771
|
+
if ( !$value ) { next; } # this one should be removed
|
1772
|
+
push @out,$key; # this one should be added
|
1773
|
+
}
|
1774
|
+
if ( !@out ) { push @out,'PASS'; }
|
1775
|
+
return ref($filter) eq 'ARRAY' ? return \@out : join(';',@out);
|
1776
|
+
}
|
1777
|
+
|
1778
|
+
|
1779
|
+
=head2 validate_filter_field
|
1780
|
+
|
1781
|
+
Usage : my $x = $vcf->next_data_hash(); $vcf->validate_filter_field($$x{FILTER});
|
1782
|
+
Args : The FILTER arrayref
|
1783
|
+
Returns : Error message in case of an error.
|
1784
|
+
|
1785
|
+
=cut
|
1786
|
+
|
1787
|
+
sub validate_filter_field
|
1788
|
+
{
|
1789
|
+
my ($self,$values) = @_;
|
1790
|
+
|
1791
|
+
if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
|
1792
|
+
|
1793
|
+
my @errs;
|
1794
|
+
my @missing;
|
1795
|
+
for my $item (@$values)
|
1796
|
+
{
|
1797
|
+
if ( $item eq $$self{filter_passed} ) { next; }
|
1798
|
+
if ( $item=~/,/ ) { push @errs,"Expected semicolon as a separator."; }
|
1799
|
+
if ( exists($$self{reserved}{FILTER}{$item}) ) { return qq[The filter name "$item" cannot be used, it is a reserved word.]; }
|
1800
|
+
if ( exists($$self{header}{FILTER}{$item}) ) { next; }
|
1801
|
+
push @missing, $item;
|
1802
|
+
$self->add_header_line({key=>'FILTER',ID=>$item,Description=>'No description'});
|
1803
|
+
}
|
1804
|
+
if ( !@errs && !@missing ) { return undef; }
|
1805
|
+
if ( $$self{version}<3.3 ) { return undef; }
|
1806
|
+
return join(',',@errs) .' '. 'The filter(s) [' . join(',',@missing) . '] not listed in the header.';
|
1807
|
+
}
|
1808
|
+
|
1809
|
+
|
1810
|
+
sub _add_unknown_field
|
1811
|
+
{
|
1812
|
+
my ($self,$field,$key,$nargs) = @_;
|
1813
|
+
$self->add_header_line({key=>$field,ID=>$key,Number=>$nargs,Type=>'String',Description=>'No description'});
|
1814
|
+
}
|
1815
|
+
|
1816
|
+
=head2 validate_header
|
1817
|
+
|
1818
|
+
About : Version specific header validation code.
|
1819
|
+
Usage : my $vcf = Vcf->new(); $vcf->parse_header(); $vcf->validate_header();
|
1820
|
+
Args :
|
1821
|
+
|
1822
|
+
=cut
|
1823
|
+
|
1824
|
+
sub validate_header
|
1825
|
+
{
|
1826
|
+
my ($self) = @_;
|
1827
|
+
}
|
1828
|
+
|
1829
|
+
=head2 validate_line
|
1830
|
+
|
1831
|
+
About : Version specific line validation code.
|
1832
|
+
Usage : my $vcf = Vcf->new(); $vcf->parse_header(); $x = $vcf->next_data_hash; $vcf->validate_line($x);
|
1833
|
+
Args :
|
1834
|
+
|
1835
|
+
=cut
|
1836
|
+
|
1837
|
+
sub validate_line
|
1838
|
+
{
|
1839
|
+
my ($self,$x) = @_;
|
1840
|
+
|
1841
|
+
# Is the ID composed of alphanumeric chars
|
1842
|
+
if ( !($$x{ID}=~/^[\w;\.]+$/) ) { $self->warn("Expected alphanumeric ID at $$x{CHROM}:$$x{POS}, but got [$$x{ID}]\n"); }
|
1843
|
+
}
|
1844
|
+
|
1845
|
+
=head2 validate_info_field
|
1846
|
+
|
1847
|
+
Usage : my $x = $vcf->next_data_hash(); $vcf->validate_info_field($$x{INFO},$$x{ALT});
|
1848
|
+
Args : The INFO hashref
|
1849
|
+
Returns : Error message in case of an error.
|
1850
|
+
|
1851
|
+
=cut
|
1852
|
+
|
1853
|
+
sub validate_info_field
|
1854
|
+
{
|
1855
|
+
my ($self,$values,$alts) = @_;
|
1856
|
+
|
1857
|
+
if ( !defined $values ) { return 'Empty INFO field.'; }
|
1858
|
+
|
1859
|
+
# First handle the empty INFO field (.)
|
1860
|
+
if ( scalar keys %$values == 1 && exists($$values{'.'}) ) { return undef; }
|
1861
|
+
|
1862
|
+
# Expected numbers
|
1863
|
+
my $ng = -1;
|
1864
|
+
my $na = -1;
|
1865
|
+
if ( $$self{version}>4.0 )
|
1866
|
+
{
|
1867
|
+
if ( $$alts[0] eq '.' ) { $ng=1; $na=1; }
|
1868
|
+
else
|
1869
|
+
{
|
1870
|
+
$na = @$alts;
|
1871
|
+
$ng = (1+$na+1)*($na+1)/2;
|
1872
|
+
}
|
1873
|
+
}
|
1874
|
+
|
1875
|
+
my @errs;
|
1876
|
+
while (my ($key,$value) = each %$values)
|
1877
|
+
{
|
1878
|
+
if ( !exists($$self{header}{INFO}{$key}) )
|
1879
|
+
{
|
1880
|
+
push @errs, "INFO tag [$key] not listed in the header" unless $$self{version}<3.3;
|
1881
|
+
my $nargs = defined $value ? -1 : 0;
|
1882
|
+
$self->_add_unknown_field('INFO',$key,$nargs);
|
1883
|
+
next;
|
1884
|
+
}
|
1885
|
+
my $type = $$self{header}{INFO}{$key};
|
1886
|
+
|
1887
|
+
my @vals = defined $value ? split(/,/, $value) : ();
|
1888
|
+
if ( $$type{Number} eq 'G' )
|
1889
|
+
{
|
1890
|
+
if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $ng, found ".scalar @vals.")"; }
|
1891
|
+
}
|
1892
|
+
elsif ( $$type{Number} eq 'A' )
|
1893
|
+
{
|
1894
|
+
if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $na, found ".scalar @vals.")"; }
|
1895
|
+
}
|
1896
|
+
elsif ( $$type{Number}==0 )
|
1897
|
+
{
|
1898
|
+
if ( defined($value) ) { push @errs, "INFO tag [$key] did not expect any parameters, got [$value]"; }
|
1899
|
+
next;
|
1900
|
+
}
|
1901
|
+
elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} )
|
1902
|
+
{
|
1903
|
+
push @errs, "INFO tag [$key=$value] expected different number of values ($$type{Number})";
|
1904
|
+
}
|
1905
|
+
if ( !$$type{handler} ) { next; }
|
1906
|
+
for my $val (@vals)
|
1907
|
+
{
|
1908
|
+
my $err = &{$$type{handler}}($self,$val,$$type{default});
|
1909
|
+
if ( $err ) { push @errs, $err; }
|
1910
|
+
}
|
1911
|
+
}
|
1912
|
+
if ( !@errs ) { return undef; }
|
1913
|
+
return join(',',@errs);
|
1914
|
+
}
|
1915
|
+
|
1916
|
+
=head2 validate_gtype_field
|
1917
|
+
|
1918
|
+
Usage : my $x = $vcf->next_data_hash(); $vcf->validate_gtype_field($$x{gtypes}{NA00001},$$x{ALT},$$x{FORMAT});
|
1919
|
+
Args : The genotype data hashref
|
1920
|
+
The ALT arrayref
|
1921
|
+
Returns : Error message in case of an error.
|
1922
|
+
|
1923
|
+
=cut
|
1924
|
+
|
1925
|
+
sub validate_gtype_field
|
1926
|
+
{
|
1927
|
+
my ($self,$data,$alts,$format) = @_;
|
1928
|
+
|
1929
|
+
# Expected numbers
|
1930
|
+
my $ng = -1;
|
1931
|
+
my $na = -1;
|
1932
|
+
if ( $$self{version}>4.0 )
|
1933
|
+
{
|
1934
|
+
if ( $$alts[0] eq '.' ) { $ng=1; $na=1; }
|
1935
|
+
else
|
1936
|
+
{
|
1937
|
+
$na = @$alts;
|
1938
|
+
$ng = (1+$na+1)*($na+1)/2;
|
1939
|
+
}
|
1940
|
+
}
|
1941
|
+
|
1942
|
+
my @errs;
|
1943
|
+
while (my ($key,$value) = each %$data)
|
1944
|
+
{
|
1945
|
+
if ( !exists($$self{header}{FORMAT}{$key}) )
|
1946
|
+
{
|
1947
|
+
push @errs, "FORMAT tag [$key] not listed in the header" unless $$self{version}<3.3;
|
1948
|
+
$self->_add_unknown_field('FORMAT',$key,-1);
|
1949
|
+
next;
|
1950
|
+
}
|
1951
|
+
my $type = $$self{header}{FORMAT}{$key};
|
1952
|
+
|
1953
|
+
my @vals = split(/,/, $value);
|
1954
|
+
if ( $$type{Number} eq 'G' )
|
1955
|
+
{
|
1956
|
+
if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $ng, found ".scalar @vals.")"; }
|
1957
|
+
}
|
1958
|
+
elsif ( $$type{Number} eq 'A' )
|
1959
|
+
{
|
1960
|
+
if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $na, found ".scalar @vals.")"; }
|
1961
|
+
}
|
1962
|
+
elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} )
|
1963
|
+
{
|
1964
|
+
push @errs, "FORMAT tag [$key] expected different number of values ($$type{Number})";
|
1965
|
+
}
|
1966
|
+
if ( !$$type{handler} ) { next; }
|
1967
|
+
for my $val (@vals)
|
1968
|
+
{
|
1969
|
+
my $err = &{$$type{handler}}($self,$val,$$type{default});
|
1970
|
+
if ( $err ) { push @errs, $err; }
|
1971
|
+
}
|
1972
|
+
}
|
1973
|
+
if ( !exists($$data{GT}) ) { push @errs, "The mandatory tag GT not present." unless $$self{ignore_missing_GT}; }
|
1974
|
+
else
|
1975
|
+
{
|
1976
|
+
my $buf = $$data{GT};
|
1977
|
+
while ($buf ne '')
|
1978
|
+
{
|
1979
|
+
my $al = $buf;
|
1980
|
+
if ( $buf=~$$self{regex_gtsep} )
|
1981
|
+
{
|
1982
|
+
$al = $`;
|
1983
|
+
$buf = $';
|
1984
|
+
if ( $buf eq '' ) { push @errs, "Unable to parse the GT field [$$data{GT}]."; last; }
|
1985
|
+
}
|
1986
|
+
else
|
1987
|
+
{
|
1988
|
+
$buf = '';
|
1989
|
+
}
|
1990
|
+
|
1991
|
+
if ( !defined $al ) { push @errs, "Unable to parse the GT field [$$data{GT}]."; last; }
|
1992
|
+
if ( $al eq '.' ) { next; }
|
1993
|
+
if ( $al eq '0' ) { next; }
|
1994
|
+
if ( !($al=~/^[0-9]+$/) ) { push @errs, "Unable to parse the GT field [$$data{GT}], expected integer."; last; }
|
1995
|
+
if ( !exists($$alts[$al-1]) ) { push @errs, "Bad ALT value in the GT field, the index [$al] out of bounds [$$data{GT}]."; last; }
|
1996
|
+
}
|
1997
|
+
}
|
1998
|
+
if ( !@errs ) { return undef; }
|
1999
|
+
return join(',',@errs);
|
2000
|
+
}
|
2001
|
+
|
2002
|
+
|
2003
|
+
sub validate_ref_field
|
2004
|
+
{
|
2005
|
+
my ($self,$ref) = @_;
|
2006
|
+
if ( !($ref=~/^[ACGTN]$/) ) { return "Expected one of A,C,G,T,N, got [$ref]\n"; }
|
2007
|
+
return undef;
|
2008
|
+
}
|
2009
|
+
|
2010
|
+
sub validate_int
|
2011
|
+
{
|
2012
|
+
my ($self,$value,$default) = @_;
|
2013
|
+
|
2014
|
+
if ( defined($default) && $value eq $default ) { return undef; }
|
2015
|
+
if ( $value =~ /^-?\d+$/ ) { return undef; }
|
2016
|
+
return "Could not validate the int [$value]";
|
2017
|
+
}
|
2018
|
+
|
2019
|
+
sub validate_float
|
2020
|
+
{
|
2021
|
+
my ($self,$value,$default) = @_;
|
2022
|
+
if ( defined($default) && $value eq $default ) { return undef; }
|
2023
|
+
if ( $value =~ /^-?\d+(?:\.\d*)$/ ) { return undef; }
|
2024
|
+
if ( $value =~ /^-?\d*(?:\.\d+)$/ ) { return undef; }
|
2025
|
+
if ( $value =~ /^-?\d+$/ ) { return undef; }
|
2026
|
+
if ( $value =~ /^-?\d*(?:\.?\d+)(?:[Ee][-+]?\d+)?$/ ) { return undef; }
|
2027
|
+
return "Could not validate the float [$value]";
|
2028
|
+
}
|
2029
|
+
|
2030
|
+
sub validate_char
|
2031
|
+
{
|
2032
|
+
my ($self,$value,$default) = @_;
|
2033
|
+
|
2034
|
+
if ( defined($default) && $value eq $default ) { return undef; }
|
2035
|
+
if ( length($value)==1) { return undef; }
|
2036
|
+
return "Could not validate the char value [$value]";
|
2037
|
+
}
|
2038
|
+
|
2039
|
+
|
2040
|
+
=head2 run_validation
|
2041
|
+
|
2042
|
+
About : Validates the VCF file.
|
2043
|
+
Usage : my $vcf = Vcf->new(file=>'file.vcf'); $vcf->run_validation('example.vcf.gz');
|
2044
|
+
Args : File name or file handle.
|
2045
|
+
|
2046
|
+
=cut
|
2047
|
+
|
2048
|
+
sub run_validation
|
2049
|
+
{
|
2050
|
+
my ($self) = @_;
|
2051
|
+
|
2052
|
+
$self->parse_header();
|
2053
|
+
$self->validate_header();
|
2054
|
+
|
2055
|
+
if ( !exists($$self{header}) ) { $self->warn(qq[The header not present.\n]); }
|
2056
|
+
elsif ( !exists($$self{header}{fileformat}) )
|
2057
|
+
{
|
2058
|
+
$self->warn(qq[The "fileformat" field not present in the header, assuming VCFv$$self{version}\n]);
|
2059
|
+
}
|
2060
|
+
elsif ( $$self{header_lines}[0]{key} ne 'fileformat' )
|
2061
|
+
{
|
2062
|
+
$self->warn(qq[The "fileformat" not the first line in the header\n]);
|
2063
|
+
}
|
2064
|
+
if ( !exists($$self{columns}) ) { $self->warn("No column descriptions found.\n"); }
|
2065
|
+
|
2066
|
+
my $default_qual = $$self{defaults}{QUAL};
|
2067
|
+
my $warn_sorted=1;
|
2068
|
+
my $warn_duplicates = exists($$self{warn_duplicates}) ? $$self{warn_duplicates} : 1;
|
2069
|
+
my ($prev_chrm,$prev_pos);
|
2070
|
+
while (my $x=$self->next_data_hash())
|
2071
|
+
{
|
2072
|
+
$self->validate_line($x);
|
2073
|
+
|
2074
|
+
# Is the position numeric?
|
2075
|
+
if ( !($$x{POS}=~/^\d+$/) ) { $self->warn("Expected integer for the position at $$x{CHROM}:$$x{POS}\n"); }
|
2076
|
+
|
2077
|
+
if ( $warn_duplicates )
|
2078
|
+
{
|
2079
|
+
if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos eq $$x{POS} )
|
2080
|
+
{
|
2081
|
+
$self->warn("Warning: Duplicate entries, for example $$x{CHROM}:$$x{POS}\n");
|
2082
|
+
$warn_duplicates = 0;
|
2083
|
+
}
|
2084
|
+
}
|
2085
|
+
|
2086
|
+
# Is the file sorted?
|
2087
|
+
if ( $warn_sorted )
|
2088
|
+
{
|
2089
|
+
if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos > $$x{POS} )
|
2090
|
+
{
|
2091
|
+
$self->warn("Warning: The file is not sorted, for example $$x{CHROM}:$$x{POS} comes after $prev_chrm:$prev_pos\n");
|
2092
|
+
$warn_sorted = 0;
|
2093
|
+
}
|
2094
|
+
$prev_chrm = $$x{CHROM};
|
2095
|
+
$prev_pos = $$x{POS};
|
2096
|
+
}
|
2097
|
+
|
2098
|
+
# The reference base: one of A,C,G,T,N, non-empty.
|
2099
|
+
my $err = $self->validate_ref_field($$x{REF});
|
2100
|
+
if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); }
|
2101
|
+
|
2102
|
+
# The ALT field (alternate non-reference base)
|
2103
|
+
$err = $self->validate_alt_field($$x{ALT},$$x{REF});
|
2104
|
+
if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); }
|
2105
|
+
|
2106
|
+
# The QUAL field
|
2107
|
+
my $ret = $self->validate_float($$x{QUAL},$default_qual);
|
2108
|
+
if ( $ret ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} .. $ret\n"); }
|
2109
|
+
elsif ( $$x{QUAL}=~/^-?\d+$/ && $$x{QUAL}<-1 ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} is negative .. $$x{QUAL}\n"); }
|
2110
|
+
|
2111
|
+
# The FILTER field
|
2112
|
+
$err = $self->validate_filter_field($$x{FILTER});
|
2113
|
+
if ( $err ) { $self->warn("FILTER field at $$x{CHROM}:$$x{POS} .. $err\n"); }
|
2114
|
+
|
2115
|
+
# The INFO field
|
2116
|
+
$err = $self->validate_info_field($$x{INFO},$$x{ALT});
|
2117
|
+
if ( $err ) { $self->warn("INFO field at $$x{CHROM}:$$x{POS} .. $err\n"); }
|
2118
|
+
|
2119
|
+
while (my ($gt,$data) = each %{$$x{gtypes}})
|
2120
|
+
{
|
2121
|
+
$err = $self->validate_gtype_field($data,$$x{ALT},$$x{FORMAT});
|
2122
|
+
if ( $err ) { $self->warn("column $gt at $$x{CHROM}:$$x{POS} .. $err\n"); }
|
2123
|
+
}
|
2124
|
+
|
2125
|
+
if ( scalar keys %{$$x{gtypes}} && (exists($$x{INFO}{AN}) || exists($$x{INFO}{AC})) )
|
2126
|
+
{
|
2127
|
+
my $nalt = scalar @{$$x{ALT}};
|
2128
|
+
if ( $nalt==1 && $$x{ALT}[0] eq '.' ) { $nalt=0; }
|
2129
|
+
my ($an,$ac) = $self->calc_an_ac($$x{gtypes},$nalt); # Allow alleles in ALT which are absent in samples
|
2130
|
+
if ( exists($$x{INFO}{AN}) && $an ne $$x{INFO}{AN} )
|
2131
|
+
{
|
2132
|
+
$self->warn("$$x{CHROM}:$$x{POS} .. AN is $$x{INFO}{AN}, should be $an\n");
|
2133
|
+
}
|
2134
|
+
if ( exists($$x{INFO}{AC}) && $ac ne $$x{INFO}{AC} )
|
2135
|
+
{
|
2136
|
+
$self->warn("$$x{CHROM}:$$x{POS} .. AC is $$x{INFO}{AC}, should be $ac\n");
|
2137
|
+
}
|
2138
|
+
}
|
2139
|
+
}
|
2140
|
+
}
|
2141
|
+
|
2142
|
+
|
2143
|
+
=head2 get_chromosomes
|
2144
|
+
|
2145
|
+
About : Get list of chromosomes from the VCF file. Must be bgzipped and tabix indexed.
|
2146
|
+
Usage : my $vcf = Vcf->new(); $vcf->get_chromosomes();
|
2147
|
+
Args : none
|
2148
|
+
|
2149
|
+
=cut
|
2150
|
+
|
2151
|
+
sub get_chromosomes
|
2152
|
+
{
|
2153
|
+
my ($self) = @_;
|
2154
|
+
if ( !$$self{file} ) { $self->throw(qq[The parameter "file" not set.\n]); }
|
2155
|
+
my (@out) = `tabix -l $$self{file}`;
|
2156
|
+
if ( $? )
|
2157
|
+
{
|
2158
|
+
$self->throw(qq[The command "tabix -l $$self{file}" exited with an error. Is the file tabix indexed?\n]);
|
2159
|
+
}
|
2160
|
+
for (my $i=0; $i<@out; $i++) { chomp($out[$i]); }
|
2161
|
+
return \@out;
|
2162
|
+
}
|
2163
|
+
|
2164
|
+
|
2165
|
+
=head2 get_samples
|
2166
|
+
|
2167
|
+
About : Get list of samples.
|
2168
|
+
Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my (@samples) = $vcf->get_samples();
|
2169
|
+
Args : none
|
2170
|
+
|
2171
|
+
=cut
|
2172
|
+
|
2173
|
+
sub get_samples
|
2174
|
+
{
|
2175
|
+
my ($self) = @_;
|
2176
|
+
my $n = @{$$self{columns}} - 1;
|
2177
|
+
return (@{$$self{columns}}[9..$n]);
|
2178
|
+
}
|
2179
|
+
|
2180
|
+
|
2181
|
+
#------------------------------------------------
|
2182
|
+
# Version 3.2 specific functions
|
2183
|
+
|
2184
|
+
package Vcf3_2;
|
2185
|
+
use base qw(VcfReader);
|
2186
|
+
|
2187
|
+
sub new
|
2188
|
+
{
|
2189
|
+
my ($class,@args) = @_;
|
2190
|
+
my $self = $class->SUPER::new(@args);
|
2191
|
+
bless $self, ref($class) || $class;
|
2192
|
+
|
2193
|
+
$$self{_defaults} =
|
2194
|
+
{
|
2195
|
+
version => '3.2',
|
2196
|
+
drop_trailings => 1,
|
2197
|
+
filter_passed => 0,
|
2198
|
+
|
2199
|
+
defaults =>
|
2200
|
+
{
|
2201
|
+
QUAL => '-1',
|
2202
|
+
default => '.',
|
2203
|
+
Flag => undef,
|
2204
|
+
GT => '.',
|
2205
|
+
},
|
2206
|
+
|
2207
|
+
handlers =>
|
2208
|
+
{
|
2209
|
+
Integer => \&VcfReader::validate_int,
|
2210
|
+
Float => \&VcfReader::validate_float,
|
2211
|
+
Character => \&VcfReader::validate_char,
|
2212
|
+
String => undef,
|
2213
|
+
Flag => undef,
|
2214
|
+
},
|
2215
|
+
|
2216
|
+
regex_snp => qr/^[ACGTN]$/i,
|
2217
|
+
regex_ins => qr/^I[ACGTN]+$/,
|
2218
|
+
regex_del => qr/^D\d+$/,
|
2219
|
+
regex_gtsep => qr{[\\|/]},
|
2220
|
+
regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$},
|
2221
|
+
regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)},
|
2222
|
+
};
|
2223
|
+
|
2224
|
+
for my $key (keys %{$$self{_defaults}})
|
2225
|
+
{
|
2226
|
+
$$self{$key}=$$self{_defaults}{$key};
|
2227
|
+
}
|
2228
|
+
|
2229
|
+
|
2230
|
+
return $self;
|
2231
|
+
}
|
2232
|
+
|
2233
|
+
|
2234
|
+
#------------------------------------------------
|
2235
|
+
# Version 3.3 specific functions
|
2236
|
+
|
2237
|
+
package Vcf3_3;
|
2238
|
+
use base qw(VcfReader);
|
2239
|
+
|
2240
|
+
sub new
|
2241
|
+
{
|
2242
|
+
my ($class,@args) = @_;
|
2243
|
+
my $self = $class->SUPER::new(@args);
|
2244
|
+
bless $self, ref($class) || $class;
|
2245
|
+
|
2246
|
+
$$self{_defaults} =
|
2247
|
+
{
|
2248
|
+
version => '3.3',
|
2249
|
+
drop_trailings => 0,
|
2250
|
+
filter_passed => 0,
|
2251
|
+
|
2252
|
+
defaults =>
|
2253
|
+
{
|
2254
|
+
QUAL => '-1',
|
2255
|
+
Integer => '-1',
|
2256
|
+
Float => '-1',
|
2257
|
+
Character => '.',
|
2258
|
+
String => '.',
|
2259
|
+
Flag => undef,
|
2260
|
+
GT => './.',
|
2261
|
+
default => '.',
|
2262
|
+
},
|
2263
|
+
|
2264
|
+
handlers =>
|
2265
|
+
{
|
2266
|
+
Integer => \&VcfReader::validate_int,
|
2267
|
+
Float => \&VcfReader::validate_float,
|
2268
|
+
Character => \&VcfReader::validate_char,
|
2269
|
+
String => undef,
|
2270
|
+
Flag => undef,
|
2271
|
+
},
|
2272
|
+
|
2273
|
+
regex_snp => qr/^[ACGTN]$/i,
|
2274
|
+
regex_ins => qr/^I[ACGTN]+$/,
|
2275
|
+
regex_del => qr/^D\d+$/,
|
2276
|
+
regex_gtsep => qr{[\\|/]},
|
2277
|
+
regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$},
|
2278
|
+
regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)}, # . 0/1 0|1 A/A A|A D4/IACGT
|
2279
|
+
};
|
2280
|
+
|
2281
|
+
for my $key (keys %{$$self{_defaults}})
|
2282
|
+
{
|
2283
|
+
$$self{$key}=$$self{_defaults}{$key};
|
2284
|
+
}
|
2285
|
+
|
2286
|
+
return $self;
|
2287
|
+
}
|
2288
|
+
|
2289
|
+
|
2290
|
+
#------------------------------------------------
|
2291
|
+
# Version 4.0 specific functions
|
2292
|
+
|
2293
|
+
=head1 VCFv4.0
|
2294
|
+
|
2295
|
+
VCFv4.0 specific functions
|
2296
|
+
|
2297
|
+
=cut
|
2298
|
+
|
2299
|
+
package Vcf4_0;
|
2300
|
+
use base qw(VcfReader);
|
2301
|
+
|
2302
|
+
sub new
|
2303
|
+
{
|
2304
|
+
my ($class,@args) = @_;
|
2305
|
+
my $self = $class->SUPER::new(@args);
|
2306
|
+
bless $self, ref($class) || $class;
|
2307
|
+
|
2308
|
+
$$self{_defaults} =
|
2309
|
+
{
|
2310
|
+
version => '4.0',
|
2311
|
+
drop_trailings => 1,
|
2312
|
+
filter_passed => 'PASS',
|
2313
|
+
|
2314
|
+
defaults =>
|
2315
|
+
{
|
2316
|
+
QUAL => '.',
|
2317
|
+
Flag => undef,
|
2318
|
+
GT => '.',
|
2319
|
+
default => '.',
|
2320
|
+
},
|
2321
|
+
reserved =>
|
2322
|
+
{
|
2323
|
+
FILTER => { 0=>1 },
|
2324
|
+
},
|
2325
|
+
|
2326
|
+
handlers =>
|
2327
|
+
{
|
2328
|
+
Integer => \&VcfReader::validate_int,
|
2329
|
+
Float => \&VcfReader::validate_float,
|
2330
|
+
Character => \&VcfReader::validate_char,
|
2331
|
+
String => undef,
|
2332
|
+
Flag => undef,
|
2333
|
+
},
|
2334
|
+
|
2335
|
+
regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i,
|
2336
|
+
regex_ins => qr/^[ACGTN]+$/,
|
2337
|
+
regex_del => qr/^[ACGTN]+$/,
|
2338
|
+
regex_gtsep => qr{[|/]}, # | /
|
2339
|
+
regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1
|
2340
|
+
regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 0/1 0|1 A/A A|A 0|<DEL:ME:ALU>
|
2341
|
+
};
|
2342
|
+
|
2343
|
+
for my $key (keys %{$$self{_defaults}})
|
2344
|
+
{
|
2345
|
+
$$self{$key}=$$self{_defaults}{$key};
|
2346
|
+
}
|
2347
|
+
|
2348
|
+
return $self;
|
2349
|
+
}
|
2350
|
+
|
2351
|
+
sub Vcf4_0::format_header_line
|
2352
|
+
{
|
2353
|
+
my ($self,$rec) = @_;
|
2354
|
+
|
2355
|
+
my %tmp_rec = ( %$rec );
|
2356
|
+
if ( exists($tmp_rec{Number}) && $tmp_rec{Number} eq '-1' ) { $tmp_rec{Number} = '.' }
|
2357
|
+
my $value;
|
2358
|
+
if ( exists($tmp_rec{ID}) or $tmp_rec{key} eq 'PEDIGREE' )
|
2359
|
+
{
|
2360
|
+
my %has = ( key=>1, handler=>1, default=>1 ); # Internal keys not to be output
|
2361
|
+
my @items;
|
2362
|
+
for my $key (qw(ID Number Type Description), sort keys %tmp_rec)
|
2363
|
+
{
|
2364
|
+
if ( !exists($tmp_rec{$key}) or $has{$key} ) { next; }
|
2365
|
+
my $quote = ($key eq 'Description' or $tmp_rec{$key}=~/\s/) ? '"' : '';
|
2366
|
+
push @items, "$key=$quote$tmp_rec{$key}$quote";
|
2367
|
+
$has{$key}=1;
|
2368
|
+
}
|
2369
|
+
$value = '<' .join(',',@items). '>';
|
2370
|
+
}
|
2371
|
+
else { $value = $tmp_rec{value}; }
|
2372
|
+
|
2373
|
+
my $line = "##$tmp_rec{key}=".$value."\n";
|
2374
|
+
return $line;
|
2375
|
+
}
|
2376
|
+
|
2377
|
+
=head2 parse_header_line
|
2378
|
+
|
2379
|
+
Usage : $vcf->parse_header_line(q[##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">])
|
2380
|
+
$vcf->parse_header_line(q[reference=1000GenomesPilot-NCBI36])
|
2381
|
+
Args :
|
2382
|
+
Returns :
|
2383
|
+
|
2384
|
+
=cut
|
2385
|
+
|
2386
|
+
sub Vcf4_0::parse_header_line
|
2387
|
+
{
|
2388
|
+
my ($self,$line) = @_;
|
2389
|
+
|
2390
|
+
chomp($line);
|
2391
|
+
$line =~ s/^##//;
|
2392
|
+
|
2393
|
+
if ( !($line=~/^([^=]+)=/) ) { $self->throw("Expected key=value pair in the header: $line\n"); }
|
2394
|
+
my $key = $1;
|
2395
|
+
my $value = $';
|
2396
|
+
|
2397
|
+
if ( !($value=~/^<(.+)>\s*$/) )
|
2398
|
+
{
|
2399
|
+
# Simple sanity check for subtle typos
|
2400
|
+
if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' or $key eq 'ALT' )
|
2401
|
+
{
|
2402
|
+
$self->throw("Hmm, is this a typo? [$key] [$value]");
|
2403
|
+
}
|
2404
|
+
return { key=>$key, value=>$value };
|
2405
|
+
}
|
2406
|
+
|
2407
|
+
my $rec = { key=>$key };
|
2408
|
+
my $tmp = $1;
|
2409
|
+
my ($attr_key,$attr_value,$quoted);
|
2410
|
+
while ($tmp ne '')
|
2411
|
+
{
|
2412
|
+
if ( !defined $attr_key )
|
2413
|
+
{
|
2414
|
+
if ( $tmp=~/^([^=]+)="/ ) { $attr_key=$1; $quoted=1; $tmp=$'; next; }
|
2415
|
+
elsif ( $tmp=~/^([^=]+)=/ ) { $attr_key=$1; $quoted=0; $tmp=$'; next; }
|
2416
|
+
else { $self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]); }
|
2417
|
+
}
|
2418
|
+
|
2419
|
+
if ( $tmp=~/^[^,\\"]+/ ) { $attr_value .= $&; $tmp = $'; }
|
2420
|
+
if ( $tmp=~/^\\\\/ ) { $attr_value .= '\\\\'; $tmp = $'; next; }
|
2421
|
+
if ( $tmp=~/^\\"/ ) { $attr_value .= '\\"'; $tmp = $'; next; }
|
2422
|
+
if ( $tmp eq '' or ($tmp=~/^,/ && !$quoted) or $tmp=~/^"/ )
|
2423
|
+
{
|
2424
|
+
if ( $attr_key=~/^\s+/ or $attr_key=~/\s+$/ or $attr_value=~/^\s+/ or $attr_value=~/\s+$/ )
|
2425
|
+
{
|
2426
|
+
$self->warn("Leading or trailing space in attr_key-attr_value pairs is discouraged:\n\t[$attr_key] [$attr_value]\n\t$line\n");
|
2427
|
+
$attr_key =~ s/^\s+//;
|
2428
|
+
$attr_key =~ s/\s+$//;
|
2429
|
+
$attr_value =~ s/^\s+//;
|
2430
|
+
$attr_value =~ s/\s+$//;
|
2431
|
+
}
|
2432
|
+
$$rec{$attr_key} = $attr_value;
|
2433
|
+
$tmp = $';
|
2434
|
+
if ( $quoted && $tmp=~/^,/ ) { $tmp = $'; }
|
2435
|
+
$attr_key = $attr_value = $quoted = undef;
|
2436
|
+
next;
|
2437
|
+
}
|
2438
|
+
if ( $tmp=~/^,/ ) { $attr_value .= $&; $tmp = $'; next; }
|
2439
|
+
$self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]);
|
2440
|
+
}
|
2441
|
+
|
2442
|
+
if ( $key ne 'PEDIGREE' && !exists($$rec{ID}) ) { $self->throw("Missing the ID tag in $line\n"); }
|
2443
|
+
if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' )
|
2444
|
+
{
|
2445
|
+
if ( !exists($$rec{Description}) ) { $self->throw("Missing the Description tag in $line\n"); }
|
2446
|
+
}
|
2447
|
+
if ( exists($$rec{Number}) && $$rec{Number} eq '-1' ) { $self->warn("The use of -1 for unknown number of values is deprecated, please use '.' instead.\n\t$line\n"); }
|
2448
|
+
if ( exists($$rec{Number}) && $$rec{Number} eq '.' ) { $$rec{Number}=-1; }
|
2449
|
+
|
2450
|
+
return $rec;
|
2451
|
+
}
|
2452
|
+
|
2453
|
+
sub Vcf4_0::validate_ref_field
|
2454
|
+
{
|
2455
|
+
my ($self,$ref) = @_;
|
2456
|
+
if ( !($ref=~/^[ACGTN]+$/) )
|
2457
|
+
{
|
2458
|
+
my $offending = $ref;
|
2459
|
+
$offending =~ s/[ACGTN]+//g;
|
2460
|
+
return "Expected combination of A,C,G,T,N for REF, got [$ref], the offending chars were [$offending]\n";
|
2461
|
+
}
|
2462
|
+
return undef;
|
2463
|
+
}
|
2464
|
+
|
2465
|
+
sub Vcf4_0::validate_alt_field
|
2466
|
+
{
|
2467
|
+
my ($self,$values,$ref) = @_;
|
2468
|
+
|
2469
|
+
if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
|
2470
|
+
|
2471
|
+
my $ret = $self->_validate_alt_field($values,$ref);
|
2472
|
+
if ( $ret ) { return $ret; }
|
2473
|
+
|
2474
|
+
my $ref_len = length($ref);
|
2475
|
+
my $ref1 = substr($ref,0,1);
|
2476
|
+
|
2477
|
+
my @err;
|
2478
|
+
my $msg = '';
|
2479
|
+
for my $item (@$values)
|
2480
|
+
{
|
2481
|
+
if ( !($item=~/^[ACTGN]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; }
|
2482
|
+
if ( $item=~/^<[^<>\s]+>$/ ) { next; }
|
2483
|
+
if ( $ref_len==length($item) ) { next; }
|
2484
|
+
if ( substr($item,0,1) ne $ref1 ) { $msg=', first base does not match the reference.'; push @err,$item; next; }
|
2485
|
+
}
|
2486
|
+
if ( !@err ) { return undef; }
|
2487
|
+
return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg;
|
2488
|
+
}
|
2489
|
+
|
2490
|
+
|
2491
|
+
=head2 fill_ref_alt_mapping
|
2492
|
+
|
2493
|
+
About : A tool for merging VCFv4.0 records. The subroutine unifies the REFs and creates a mapping
|
2494
|
+
from the original haplotypes to the haplotypes based on the new REF. Consider the following
|
2495
|
+
example:
|
2496
|
+
REF ALT
|
2497
|
+
G GA
|
2498
|
+
GT G
|
2499
|
+
GT GA
|
2500
|
+
GT GAA
|
2501
|
+
GTC G
|
2502
|
+
G <DEL>
|
2503
|
+
my $map={G=>{GA=>1},GT=>{G=>1,GA=>1,GAA=>1},GTC=>{G=>1},G=>{'<DEL>'=>1}};
|
2504
|
+
my $new_ref=$vcf->fill_ref_alt_mapping($map);
|
2505
|
+
|
2506
|
+
The call returns GTC and $map is now
|
2507
|
+
G GA -> GTC GATC
|
2508
|
+
GT G -> GTC GC
|
2509
|
+
GT GA -> GTC GAC
|
2510
|
+
GT GAA -> GTC GAAC
|
2511
|
+
GTC G -> GTC G
|
2512
|
+
G <DEL> -> GTC <DEL>
|
2513
|
+
Args :
|
2514
|
+
Returns : New REF string and fills the hash with appropriate ALT.
|
2515
|
+
|
2516
|
+
=cut
|
2517
|
+
|
2518
|
+
sub Vcf4_0::fill_ref_alt_mapping
|
2519
|
+
{
|
2520
|
+
my ($self,$map) = @_;
|
2521
|
+
|
2522
|
+
my $max_len = 0;
|
2523
|
+
my $new_ref;
|
2524
|
+
for my $ref (keys %$map)
|
2525
|
+
{
|
2526
|
+
my $len = length($ref);
|
2527
|
+
if ( $max_len<$len )
|
2528
|
+
{
|
2529
|
+
$max_len = $len;
|
2530
|
+
$new_ref = $ref;
|
2531
|
+
}
|
2532
|
+
$$map{$ref}{$ref} = 1;
|
2533
|
+
}
|
2534
|
+
for my $ref (keys %$map)
|
2535
|
+
{
|
2536
|
+
my $rlen = length($ref);
|
2537
|
+
if ( substr($new_ref,0,$rlen) ne $ref ) { $self->throw("The reference prefixes do not agree: $ref vs $new_ref\n"); }
|
2538
|
+
for my $alt (keys %{$$map{$ref}})
|
2539
|
+
{
|
2540
|
+
# The second part of the regex is for VCF>4.0, but does no harm for v<=4.0
|
2541
|
+
if ( $alt=~/^<.+>$/ or $alt=~/\[|\]/ ) { $$map{$ref}{$alt} = $alt; next; }
|
2542
|
+
my $new = $alt;
|
2543
|
+
if ( $rlen<$max_len ) { $new .= substr($new_ref,$rlen); }
|
2544
|
+
$$map{$ref}{$alt} = $new;
|
2545
|
+
}
|
2546
|
+
}
|
2547
|
+
return $new_ref;
|
2548
|
+
}
|
2549
|
+
|
2550
|
+
|
2551
|
+
|
2552
|
+
sub Vcf4_0::event_type
|
2553
|
+
{
|
2554
|
+
my ($self,$rec,$allele) = @_;
|
2555
|
+
|
2556
|
+
my $ref = $rec;
|
2557
|
+
if ( ref($rec) eq 'HASH' )
|
2558
|
+
{
|
2559
|
+
if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
|
2560
|
+
$ref = $$rec{REF};
|
2561
|
+
}
|
2562
|
+
|
2563
|
+
if ( $allele=~/^<[^>]+>$/ )
|
2564
|
+
{
|
2565
|
+
if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['u',0,$allele]; }
|
2566
|
+
return ('u',0,$allele);
|
2567
|
+
}
|
2568
|
+
if ( $allele eq '.' )
|
2569
|
+
{
|
2570
|
+
if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['r',0,$ref]; }
|
2571
|
+
return ('r',0,$ref);
|
2572
|
+
}
|
2573
|
+
|
2574
|
+
my $reflen = length($ref);
|
2575
|
+
my $len = length($allele);
|
2576
|
+
|
2577
|
+
my $ht;
|
2578
|
+
my $type;
|
2579
|
+
if ( $len==$reflen )
|
2580
|
+
{
|
2581
|
+
# This can be a reference, a SNP, or multiple SNPs
|
2582
|
+
my $mism = 0;
|
2583
|
+
for (my $i=0; $i<$len; $i++)
|
2584
|
+
{
|
2585
|
+
if ( substr($ref,$i,1) ne substr($allele,$i,1) ) { $mism++; }
|
2586
|
+
}
|
2587
|
+
if ( $mism==0 ) { $type='r'; $len=0; }
|
2588
|
+
else { $type='s'; $len=$mism; }
|
2589
|
+
}
|
2590
|
+
else
|
2591
|
+
{
|
2592
|
+
($len,$ht)=is_indel($ref,$allele);
|
2593
|
+
if ( $len )
|
2594
|
+
{
|
2595
|
+
# Indel
|
2596
|
+
$type = 'i';
|
2597
|
+
$allele = $ht;
|
2598
|
+
}
|
2599
|
+
else
|
2600
|
+
{
|
2601
|
+
$type = 'o'; $len = $len>$reflen ? $len-1 : $reflen-1;
|
2602
|
+
}
|
2603
|
+
}
|
2604
|
+
|
2605
|
+
if ( ref($rec) eq 'HASH' )
|
2606
|
+
{
|
2607
|
+
$$rec{_cached_events}{$allele} = [$type,$len,$allele];
|
2608
|
+
}
|
2609
|
+
return ($type,$len,$allele);
|
2610
|
+
}
|
2611
|
+
|
2612
|
+
# The sequences start at the same position, which simplifies things greatly.
|
2613
|
+
sub is_indel
|
2614
|
+
{
|
2615
|
+
my ($seq1,$seq2) = @_;
|
2616
|
+
|
2617
|
+
my $len1 = length($seq1);
|
2618
|
+
my $len2 = length($seq2);
|
2619
|
+
if ( $len1 eq $len2 ) { return (0,''); }
|
2620
|
+
|
2621
|
+
my ($del,$len,$LEN);
|
2622
|
+
if ( $len1<$len2 )
|
2623
|
+
{
|
2624
|
+
$len = $len1;
|
2625
|
+
$LEN = $len2;
|
2626
|
+
$del = 1;
|
2627
|
+
}
|
2628
|
+
else
|
2629
|
+
{
|
2630
|
+
$len = $len2;
|
2631
|
+
$LEN = $len1;
|
2632
|
+
$del = -1;
|
2633
|
+
my $tmp=$seq1; $seq1=$seq2; $seq2=$tmp;
|
2634
|
+
}
|
2635
|
+
|
2636
|
+
my $ileft;
|
2637
|
+
for ($ileft=0; $ileft<$len; $ileft++)
|
2638
|
+
{
|
2639
|
+
if ( substr($seq1,$ileft,1) ne substr($seq2,$ileft,1) ) { last; }
|
2640
|
+
}
|
2641
|
+
if ( $ileft==$len )
|
2642
|
+
{
|
2643
|
+
return ($del*($LEN-$len), substr($seq2,$ileft));
|
2644
|
+
}
|
2645
|
+
|
2646
|
+
my $iright;
|
2647
|
+
for ($iright=0; $iright<$len; $iright++)
|
2648
|
+
{
|
2649
|
+
if ( substr($seq1,$len-$iright,1) ne substr($seq2,$LEN-$iright,1) ) { last; }
|
2650
|
+
}
|
2651
|
+
if ( $iright+$ileft<=$len ) { return (0,''); }
|
2652
|
+
|
2653
|
+
return ($del*($LEN-$len),substr($seq2,$ileft,$LEN-$len));
|
2654
|
+
}
|
2655
|
+
|
2656
|
+
|
2657
|
+
#------------------------------------------------
|
2658
|
+
# Version 4.1 specific functions
|
2659
|
+
|
2660
|
+
=head1 VCFv4.1
|
2661
|
+
|
2662
|
+
VCFv4.1 specific functions
|
2663
|
+
|
2664
|
+
=cut
|
2665
|
+
|
2666
|
+
package Vcf4_1;
|
2667
|
+
use base qw(Vcf4_0);
|
2668
|
+
|
2669
|
+
sub new
|
2670
|
+
{
|
2671
|
+
my ($class,@args) = @_;
|
2672
|
+
my $self = $class->SUPER::new(@args);
|
2673
|
+
bless $self, ref($class) || $class;
|
2674
|
+
|
2675
|
+
$$self{_defaults} =
|
2676
|
+
{
|
2677
|
+
version => '4.1',
|
2678
|
+
drop_trailings => 1,
|
2679
|
+
filter_passed => 'PASS',
|
2680
|
+
|
2681
|
+
defaults =>
|
2682
|
+
{
|
2683
|
+
QUAL => '.',
|
2684
|
+
Flag => undef,
|
2685
|
+
GT => '.',
|
2686
|
+
default => '.',
|
2687
|
+
},
|
2688
|
+
reserved =>
|
2689
|
+
{
|
2690
|
+
FILTER => { 0=>1 },
|
2691
|
+
},
|
2692
|
+
|
2693
|
+
handlers =>
|
2694
|
+
{
|
2695
|
+
Integer => \&VcfReader::validate_int,
|
2696
|
+
Float => \&VcfReader::validate_float,
|
2697
|
+
Character => \&VcfReader::validate_char,
|
2698
|
+
String => undef,
|
2699
|
+
Flag => undef,
|
2700
|
+
},
|
2701
|
+
|
2702
|
+
regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i,
|
2703
|
+
regex_ins => qr/^[ACGTN]+$/i,
|
2704
|
+
regex_del => qr/^[ACGTN]+$/i,
|
2705
|
+
regex_gtsep => qr{[|/]}, # | /
|
2706
|
+
regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1
|
2707
|
+
regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 0/1 0|1 A/A A|A 0|<DEL:ME:ALU>
|
2708
|
+
};
|
2709
|
+
|
2710
|
+
$$self{ignore_missing_GT} = 1;
|
2711
|
+
|
2712
|
+
for my $key (keys %{$$self{_defaults}})
|
2713
|
+
{
|
2714
|
+
$$self{$key}=$$self{_defaults}{$key};
|
2715
|
+
}
|
2716
|
+
|
2717
|
+
return $self;
|
2718
|
+
}
|
2719
|
+
|
2720
|
+
sub Vcf4_1::validate_header
|
2721
|
+
{
|
2722
|
+
my ($self) = @_;
|
2723
|
+
my $lines = $self->get_header_line(key=>'reference');
|
2724
|
+
if ( !@$lines ) { $self->warn("The header tag 'reference' not present. (Not required but highly recommended.)\n"); }
|
2725
|
+
}
|
2726
|
+
|
2727
|
+
sub Vcf4_1::validate_line
|
2728
|
+
{
|
2729
|
+
my ($self,$line) = @_;
|
2730
|
+
|
2731
|
+
if ( !$$self{_contig_validated}{$$line{CHROM}} )
|
2732
|
+
{
|
2733
|
+
my $lines = $self->get_header_line(key=>'contig',ID=>$$line{CHROM});
|
2734
|
+
if ( !@$lines ) { $self->warn("The header tag 'contig' not present for CHROM=$$line{CHROM}. (Not required but highly recommended.)\n"); }
|
2735
|
+
$$self{_contig_validated}{$$line{CHROM}} = 1;
|
2736
|
+
}
|
2737
|
+
|
2738
|
+
# Is the ID composed of alphanumeric chars
|
2739
|
+
if ( !($$line{ID}=~/^\S+$/) ) { $self->warn("Expected non-whitespace ID at $$line{CHROM}:$$line{POS}, but got [$$line{ID}]\n"); }
|
2740
|
+
}
|
2741
|
+
|
2742
|
+
sub Vcf4_1::validate_alt_field
|
2743
|
+
{
|
2744
|
+
my ($self,$values,$ref) = @_;
|
2745
|
+
|
2746
|
+
if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
|
2747
|
+
|
2748
|
+
my $ret = $self->_validate_alt_field($values,$ref);
|
2749
|
+
if ( $ret ) { return $ret; }
|
2750
|
+
|
2751
|
+
my $ref_len = length($ref);
|
2752
|
+
my $ref1 = substr($ref,0,1);
|
2753
|
+
|
2754
|
+
my @err;
|
2755
|
+
my $msg = '';
|
2756
|
+
for my $item (@$values)
|
2757
|
+
{
|
2758
|
+
if ( $item=~/^(.*)\[(.+)\[(.*)$/ or $item=~/^(.*)\](.+)\](.*)$/ )
|
2759
|
+
{
|
2760
|
+
if ( $1 ne '' && $3 ne '' ) { $msg=', two replacement strings given (expected one)'; push @err,$item; next; }
|
2761
|
+
my $rpl;
|
2762
|
+
if ( $1 ne '' )
|
2763
|
+
{
|
2764
|
+
$rpl = $1;
|
2765
|
+
if ( $rpl ne '.' )
|
2766
|
+
{
|
2767
|
+
my $rref = substr($rpl,0,1);
|
2768
|
+
if ( $rref ne $ref1 ) { $msg=', the first base of the replacement string does not match the reference'; push @err,$item; next; }
|
2769
|
+
}
|
2770
|
+
}
|
2771
|
+
else
|
2772
|
+
{
|
2773
|
+
$rpl = $3;
|
2774
|
+
if ( $rpl ne '.' )
|
2775
|
+
{
|
2776
|
+
my $rref = substr($rpl,-1,1);
|
2777
|
+
if ( $rref ne $ref1 ) { $msg=', the last base of the replacement string does not match the reference'; push @err,$item; next; }
|
2778
|
+
}
|
2779
|
+
}
|
2780
|
+
my $pos = $2;
|
2781
|
+
if ( !($rpl=~/^[ACTGNacgtn]+$/) && $rpl ne '.' ) { $msg=', replacement string not valid (expected [ACTGNacgtn]+)'; push @err,$item; next; }
|
2782
|
+
if ( !($pos=~/^\S+:\d+$/) ) { $msg=', cannot parse sequence:position'; push @err,$item; next; }
|
2783
|
+
next;
|
2784
|
+
}
|
2785
|
+
if ( $item=~/^\.([ACTGNactgn])[ACTGNactgn]*$/ )
|
2786
|
+
{
|
2787
|
+
if ( $ref1 ne $1 ) { $msg=', first base does not match the reference'; push @err,$item; }
|
2788
|
+
next;
|
2789
|
+
}
|
2790
|
+
elsif ( $item=~/^[ACTGNactgn]*([ACTGNactgn])\.$/ )
|
2791
|
+
{
|
2792
|
+
if ( substr($ref,-1,1) ne $1 ) { $msg=', last base does not match the reference'; push @err,$item; }
|
2793
|
+
next;
|
2794
|
+
}
|
2795
|
+
if ( !($item=~/^[ACTGNactgn]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; }
|
2796
|
+
if ( $item=~/^<[^<>\s]+>$/ ) { next; }
|
2797
|
+
if ( $ref_len==length($item) ) { next; }
|
2798
|
+
if ( substr($item,0,1) ne $ref1 ) { $msg=', first base does not match the reference'; push @err,$item; next; }
|
2799
|
+
}
|
2800
|
+
if ( !@err ) { return undef; }
|
2801
|
+
return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg;
|
2802
|
+
}
|
2803
|
+
|
2804
|
+
sub Vcf4_1::next_data_hash
|
2805
|
+
{
|
2806
|
+
my ($self,@args) = @_;
|
2807
|
+
|
2808
|
+
my $out = $self->SUPER::next_data_hash(@args);
|
2809
|
+
if ( !defined $out ) { return $out; }
|
2810
|
+
|
2811
|
+
# Case-insensitive ALT and REF bases
|
2812
|
+
$$out{REF} = uc($$out{REF});
|
2813
|
+
my $nalt = @{$$out{ALT}};
|
2814
|
+
for (my $i=0; $i<$nalt; $i++)
|
2815
|
+
{
|
2816
|
+
if ( $$out{ALT}[$i]=~/^</ ) { next; }
|
2817
|
+
$$out{ALT}[$i] = uc($$out{ALT}[$i]);
|
2818
|
+
}
|
2819
|
+
|
2820
|
+
return $out;
|
2821
|
+
}
|
2822
|
+
|
2823
|
+
sub Vcf4_1::next_data_array
|
2824
|
+
{
|
2825
|
+
my ($self,@args) = @_;
|
2826
|
+
|
2827
|
+
my $out = $self->SUPER::next_data_array(@args);
|
2828
|
+
if ( !defined $out or $$self{assume_uppercase} ) { return $out; }
|
2829
|
+
|
2830
|
+
# Case-insensitive ALT and REF bases
|
2831
|
+
$$out[3] = uc($$out[3]);
|
2832
|
+
my $alt = $$out[4];
|
2833
|
+
$$out[4] = '';
|
2834
|
+
while ($alt=~/[^<>]+/)
|
2835
|
+
{
|
2836
|
+
$$out[4] .= $`;
|
2837
|
+
$$out[4] .= (length($`) && substr($`,-1,1) eq '<' ) ? $& : uc($&);
|
2838
|
+
$alt = $';
|
2839
|
+
if ( $alt=~/^<[^<>]+>/ ) { $$out[4] .= $&; $alt=$'; }
|
2840
|
+
}
|
2841
|
+
|
2842
|
+
return $out;
|
2843
|
+
}
|
2844
|
+
|
2845
|
+
sub Vcf4_1::event_type
|
2846
|
+
{
|
2847
|
+
my ($self,$rec,$allele) = @_;
|
2848
|
+
if ( $allele=~/\[|\]|^\..+|.+\.$/ ) { return 'b'; }
|
2849
|
+
return $self->SUPER::event_type($rec,$allele);
|
2850
|
+
}
|
2851
|
+
|
2852
|
+
1;
|
2853
|
+
|