ngs_server 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/ngs_server +72 -50
- data/ext/bamtools/extconf.rb +3 -3
- data/ext/vcftools/Makefile +28 -0
- data/ext/vcftools/README.txt +36 -0
- data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
- data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
- data/ext/vcftools/cpp/.svn/entries +708 -0
- data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
- data/ext/vcftools/cpp/Makefile +46 -0
- data/ext/vcftools/cpp/dgeev.cpp +146 -0
- data/ext/vcftools/cpp/dgeev.h +43 -0
- data/ext/vcftools/cpp/output_log.cpp +79 -0
- data/ext/vcftools/cpp/output_log.h +34 -0
- data/ext/vcftools/cpp/parameters.cpp +535 -0
- data/ext/vcftools/cpp/parameters.h +154 -0
- data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
- data/ext/vcftools/cpp/vcf_entry.h +190 -0
- data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
- data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
- data/ext/vcftools/cpp/vcf_file.cpp +495 -0
- data/ext/vcftools/cpp/vcf_file.h +184 -0
- data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
- data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
- data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
- data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
- data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
- data/ext/vcftools/cpp/vcftools.cpp +107 -0
- data/ext/vcftools/cpp/vcftools.h +25 -0
- data/ext/vcftools/examples/.svn/all-wcprops +185 -0
- data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
- data/ext/vcftools/examples/.svn/entries +1048 -0
- data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
- data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
- data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
- data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
- data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
- data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
- data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
- data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
- data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
- data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
- data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
- data/ext/vcftools/examples/annotate-test.vcf +37 -0
- data/ext/vcftools/examples/annotate.out +23 -0
- data/ext/vcftools/examples/annotate.txt +7 -0
- data/ext/vcftools/examples/annotate2.out +52 -0
- data/ext/vcftools/examples/annotate3.out +23 -0
- data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
- data/ext/vcftools/examples/cmp-test.out +53 -0
- data/ext/vcftools/examples/concat-a.vcf +21 -0
- data/ext/vcftools/examples/concat-b.vcf +13 -0
- data/ext/vcftools/examples/concat-c.vcf +19 -0
- data/ext/vcftools/examples/concat.out +39 -0
- data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
- data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
- data/ext/vcftools/examples/merge-test-a.vcf +17 -0
- data/ext/vcftools/examples/merge-test-b.vcf +17 -0
- data/ext/vcftools/examples/merge-test-c.vcf +15 -0
- data/ext/vcftools/examples/merge-test.vcf.out +31 -0
- data/ext/vcftools/examples/perl-api-1.pl +46 -0
- data/ext/vcftools/examples/query-test.out +6 -0
- data/ext/vcftools/examples/shuffle-test.vcf +12 -0
- data/ext/vcftools/examples/subset.SNPs.out +10 -0
- data/ext/vcftools/examples/subset.indels.out +18 -0
- data/ext/vcftools/examples/subset.vcf +21 -0
- data/ext/vcftools/examples/valid-3.3.vcf +30 -0
- data/ext/vcftools/examples/valid-4.0.vcf +34 -0
- data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
- data/ext/vcftools/examples/valid-4.1.vcf +37 -0
- data/ext/vcftools/extconf.rb +2 -0
- data/ext/vcftools/perl/.svn/all-wcprops +149 -0
- data/ext/vcftools/perl/.svn/entries +844 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
- data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
- data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
- data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
- data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
- data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
- data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
- data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
- data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
- data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
- data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
- data/ext/vcftools/perl/ChangeLog +84 -0
- data/ext/vcftools/perl/FaSlice.pm +214 -0
- data/ext/vcftools/perl/Makefile +12 -0
- data/ext/vcftools/perl/Vcf.pm +2853 -0
- data/ext/vcftools/perl/VcfStats.pm +681 -0
- data/ext/vcftools/perl/fill-aa +103 -0
- data/ext/vcftools/perl/fill-an-ac +56 -0
- data/ext/vcftools/perl/fill-ref-md5 +204 -0
- data/ext/vcftools/perl/tab-to-vcf +92 -0
- data/ext/vcftools/perl/test.t +376 -0
- data/ext/vcftools/perl/vcf-annotate +1099 -0
- data/ext/vcftools/perl/vcf-compare +1193 -0
- data/ext/vcftools/perl/vcf-concat +310 -0
- data/ext/vcftools/perl/vcf-convert +180 -0
- data/ext/vcftools/perl/vcf-fix-newlines +97 -0
- data/ext/vcftools/perl/vcf-isec +660 -0
- data/ext/vcftools/perl/vcf-merge +577 -0
- data/ext/vcftools/perl/vcf-query +286 -0
- data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
- data/ext/vcftools/perl/vcf-sort +79 -0
- data/ext/vcftools/perl/vcf-stats +160 -0
- data/ext/vcftools/perl/vcf-subset +206 -0
- data/ext/vcftools/perl/vcf-to-tab +112 -0
- data/ext/vcftools/perl/vcf-validator +145 -0
- data/ext/vcftools/website/.svn/all-wcprops +41 -0
- data/ext/vcftools/website/.svn/entries +238 -0
- data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
- data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
- data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
- data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
- data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
- data/ext/vcftools/website/Makefile +6 -0
- data/ext/vcftools/website/README +2 -0
- data/ext/vcftools/website/VCF-poster.pdf +0 -0
- data/ext/vcftools/website/default.css +250 -0
- data/ext/vcftools/website/favicon.ico +0 -0
- data/ext/vcftools/website/favicon.png +0 -0
- data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/img/.svn/entries +300 -0
- data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
- data/ext/vcftools/website/img/bg.gif +0 -0
- data/ext/vcftools/website/img/bgcode.gif +0 -0
- data/ext/vcftools/website/img/bgcontainer.gif +0 -0
- data/ext/vcftools/website/img/bgul.gif +0 -0
- data/ext/vcftools/website/img/header.gif +0 -0
- data/ext/vcftools/website/img/li.gif +0 -0
- data/ext/vcftools/website/img/quote.gif +0 -0
- data/ext/vcftools/website/img/search.gif +0 -0
- data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/src/.svn/entries +300 -0
- data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
- data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
- data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
- data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
- data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
- data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
- data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
- data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
- data/ext/vcftools/website/src/docs.inc +202 -0
- data/ext/vcftools/website/src/index.inc +52 -0
- data/ext/vcftools/website/src/index.php +80 -0
- data/ext/vcftools/website/src/license.inc +27 -0
- data/ext/vcftools/website/src/links.inc +13 -0
- data/ext/vcftools/website/src/options.inc +654 -0
- data/ext/vcftools/website/src/perl_module.inc +249 -0
- data/ext/vcftools/website/src/specs.inc +18 -0
- data/lib/config.ru +9 -0
- data/lib/ngs_server/add.rb +9 -0
- data/lib/ngs_server/version.rb +1 -1
- data/lib/ngs_server.rb +55 -3
- data/ngs_server.gemspec +5 -2
- metadata +296 -6
|
@@ -0,0 +1,1099 @@
|
|
|
1
|
+
#!/usr/bin/env perl
|
|
2
|
+
#
|
|
3
|
+
# Author: petr.danecek@sanger
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
use strict;
|
|
7
|
+
use warnings;
|
|
8
|
+
use Carp;
|
|
9
|
+
use Vcf;
|
|
10
|
+
|
|
11
|
+
my %filters =
|
|
12
|
+
(
|
|
13
|
+
MinAB => { dflt=>2, usage=>'INT', desc=>'Minimum number of alternate bases (INFO/DP4)', nick=>'a' },
|
|
14
|
+
SnpCluster => { dflt=>undef, usage=>'INT1,INT2', desc=>"Filters clusters of 'INT1' or more SNPs within a run of 'INT2' bases", nick=>'c' },
|
|
15
|
+
MinDP => { dflt=>2, usage=>'INT', desc=>"Minimum read depth (INFO/DP or INFO/DP4)", nick=>'d' },
|
|
16
|
+
MaxDP => { dflt=>10_000_000, usage=>'INT', desc=>"Maximum read depth (INFO/DP or INFO/DP4)", nick=>'D' },
|
|
17
|
+
MinMQ => { dflt=>10, usage=>'INT', desc=>"Minimum RMS mapping quality for SNPs (INFO/MQ)", nick=>'q' },
|
|
18
|
+
SnpGap => { dflt=>10, usage=>'INT', desc=>"SNP within INT bp around a gap to be filtered", nick=>'w' },
|
|
19
|
+
GapWin => { dflt=>3, usage=>'INT', desc=>"Window size for filtering adjacent gaps", nick=>'W' },
|
|
20
|
+
StrandBias => { dflt=>1e-4, usage=>'FLOAT', desc=>"Min P-value for strand bias (INFO/PV4)", nick=>'1' },
|
|
21
|
+
BaseQualBias => { dflt=>0, usage=>'FLOAT', desc=>"Min P-value for baseQ bias (INFO/PV4)", nick=>'2' },
|
|
22
|
+
MapQualBias => { dflt=>0, usage=>'FLOAT', desc=>"Min P-value for mapQ bias (INFO/PV4)", nick=>'3' },
|
|
23
|
+
EndDistBias => { dflt=>1e-4, usage=>'FLOAT', desc=>"Min P-value for end distance bias (INFO/PV4)", nick=>'4' },
|
|
24
|
+
RefN => { dflt=>'', usage=>'', desc=>"Reference base is N", nick=>'r' },
|
|
25
|
+
Qual => { dflt=>'10', usage=>'INT', desc=>"Minimum value of the QUAL field", nick=>'Q' },
|
|
26
|
+
VDB => { dflt=>'0.015', usage=>'FLOAT', desc=>"Minimum Variant Distance Bias (INFO/VDB)", nick=>'v' },
|
|
27
|
+
HWE => { dflt=>1e-4, usage=>'FLOAT', desc=>"Minimum P-value for HWE (plus F<0) (INFO/HWE and INFO/G3)", nick=>'H' },
|
|
28
|
+
);
|
|
29
|
+
|
|
30
|
+
my $opts = parse_params();
|
|
31
|
+
annotate($opts);
|
|
32
|
+
|
|
33
|
+
exit;
|
|
34
|
+
|
|
35
|
+
#--------------------------------
|
|
36
|
+
|
|
37
|
+
sub error
|
|
38
|
+
{
|
|
39
|
+
my (@msg) = @_;
|
|
40
|
+
if ( scalar @msg ) { confess @msg; }
|
|
41
|
+
|
|
42
|
+
my @filters;
|
|
43
|
+
for my $key (sort {lc($filters{$a}{nick}) cmp lc($filters{$b}{nick})} keys %filters)
|
|
44
|
+
{
|
|
45
|
+
push @filters, sprintf("\t%s, %-25s\t\t%s [%s]\n", $filters{$key}{nick},$key.' '.$filters{$key}{usage},$filters{$key}{desc},defined($filters{$key}{dflt})? $filters{$key}{dflt} : '');
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
print
|
|
49
|
+
"About: Annotates VCF file, adding filters or custom annotations. Requires tabix indexed file with annotations.\n",
|
|
50
|
+
" Currently annotates only the ID and INFO column, but it will be extended on popular demand.\n",
|
|
51
|
+
"Usage: cat in.vcf | vcf-annotate [OPTIONS] > out.vcf\n",
|
|
52
|
+
"Options:\n",
|
|
53
|
+
" -a, --annotations <file.gz> The tabix indexed file with the annotations: CHR\\tFROM[\\tTO][\\tVALUE]+.\n",
|
|
54
|
+
" -c, --columns <list> The list of columns in the annotation file, e.g. CHROM,FROM,TO,-,INFO/STR,INFO/GN. The dash\n",
|
|
55
|
+
" in this example indicates that the third column should be ignored. If TO is not\n",
|
|
56
|
+
" present, it is assumed that TO equals to FROM. When REF and ALT columns are present, only\n",
|
|
57
|
+
" matching lines are annotated.\n",
|
|
58
|
+
" -d, --description <file|string> Header annotation, e.g. key=INFO,ID=HM2,Number=0,Type=Flag,Description='HapMap2 membership'.\n",
|
|
59
|
+
" The descriptions can be read from a file, one annotation per line.\n",
|
|
60
|
+
" -f, --filter <file|list> Apply filters, list is in the format flt1=value/flt2/flt3=value/etc. If argument to -f is a file,\n",
|
|
61
|
+
" user-defined filters be applied. See User Defined Filters below.\n",
|
|
62
|
+
" -h, -?, --help This help message.\n",
|
|
63
|
+
"Filters:\n",
|
|
64
|
+
sprintf("\t+ %-25s\t\tApply all filters with default values (can be overriden, see the example below).\n",''),
|
|
65
|
+
sprintf("\t-X %-25s\t\tExclude the filter X\n",''),
|
|
66
|
+
join('',@filters),
|
|
67
|
+
"Examples:\n",
|
|
68
|
+
" zcat in.vcf.gz | vcf-annotate -a annotations.gz -d descriptions.txt -c FROM,TO,CHROM,ID,INFO/DP | bgzip -c >out.vcf.gz \n",
|
|
69
|
+
" zcat in.vcf.gz | vcf-annotate -f +/-a/c=3,10/q=3/d=5/-D -a annotations.gz -d descriptions.txt | bgzip -c >out.vcf.gz \n",
|
|
70
|
+
" zcat in.vcf.gz | vcf-annotate -a dbSNPv132.tab.gz -c CHROM,FROM,REF,ALT,ID,-,-,- | bgzip -c >out.vcf.gz \n",
|
|
71
|
+
"Where descriptions.txt contains:\n",
|
|
72
|
+
" key=INFO,ID=GN,Number=1,Type=String,Description='Gene Name'\n",
|
|
73
|
+
" key=INFO,ID=STR,Number=1,Type=Integer,Description='Strand'\n",
|
|
74
|
+
"The file dbSNPv132.tab.gz with dbSNP IDs can be downloaded from\n",
|
|
75
|
+
" ftp://ftp.sanger.ac.uk/pub/1000genomes/pd3/dbSNP/\n",
|
|
76
|
+
"\n",
|
|
77
|
+
"User-defined filters:\n",
|
|
78
|
+
" # The examples below are self-explanatory. Notice the use of the predefined\n",
|
|
79
|
+
" # variables (\$MATCH, \$PASS, \$FAIL) and methods (error).\n",
|
|
80
|
+
" #\n",
|
|
81
|
+
" # In this example, a minimum value of AF1=0.1 is required\n",
|
|
82
|
+
" {\n",
|
|
83
|
+
" tag => 'INFO/AF1', # The VCF tag to apply this filter on\n",
|
|
84
|
+
" name => 'MinAF', # The filter ID\n",
|
|
85
|
+
" desc => 'Minimum AF1 [0.01]', # Description for the VCF header\n",
|
|
86
|
+
" test => sub { return \$MATCH < 0.01 ? \$FAIL : \$PASS },\n",
|
|
87
|
+
" },\n",
|
|
88
|
+
" # Filter all indels (presence of INDEL tag is tested)\n",
|
|
89
|
+
" {\n",
|
|
90
|
+
" tag => 'INFO/INDEL',\n",
|
|
91
|
+
" apply_to => 'indels', # Can be one of SNPs, indels, all. Default: [All]\n",
|
|
92
|
+
" name => 'Indel',\n",
|
|
93
|
+
" desc => 'INDEL tag present',\n",
|
|
94
|
+
" test => sub { return \$FAIL },\n",
|
|
95
|
+
" },\n",
|
|
96
|
+
" # Only loci with enough reads supporting the variant will pass the filter\n",
|
|
97
|
+
" {\n",
|
|
98
|
+
" tag => 'INFO/DP4',\n",
|
|
99
|
+
" name => 'FewAlts',\n",
|
|
100
|
+
" desc => 'Too few reads supporting the variant',\n",
|
|
101
|
+
" apply_to => 'SNPs',\n",
|
|
102
|
+
" test => sub {\n",
|
|
103
|
+
" if ( !(\$MATCH =~ /^([^,]+),([^,]+),([^,]+),(.+)\$/) )\n",
|
|
104
|
+
" {\n",
|
|
105
|
+
" error(\"Could not parse INFO/DP4: \$CHROM:\$POS [\$MATCH]\");\n",
|
|
106
|
+
" }\n",
|
|
107
|
+
" if ( 0.1*(\$1+\$2) > \$3+\$4 ) { return \$PASS; }\n",
|
|
108
|
+
" return \$FAIL;\n",
|
|
109
|
+
" },\n",
|
|
110
|
+
" },\n",
|
|
111
|
+
"\n";
|
|
112
|
+
exit -1;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
sub parse_params
|
|
116
|
+
{
|
|
117
|
+
my $opts = { args=>[$0, @ARGV], };
|
|
118
|
+
while (defined(my $arg=shift(@ARGV)))
|
|
119
|
+
{
|
|
120
|
+
if ( $arg eq '-d' || $arg eq '--description' )
|
|
121
|
+
{
|
|
122
|
+
my $desc = shift(@ARGV);
|
|
123
|
+
if ( -e $desc )
|
|
124
|
+
{
|
|
125
|
+
open(my $fh,'<',$desc) or error("$desc: $!");
|
|
126
|
+
while (my $line=<$fh>)
|
|
127
|
+
{
|
|
128
|
+
if ( $line=~/^\s*$/ or $line=~/^#/ ) { next; }
|
|
129
|
+
chomp($line);
|
|
130
|
+
push @{$$opts{desc}}, $line;
|
|
131
|
+
}
|
|
132
|
+
close($fh);
|
|
133
|
+
}
|
|
134
|
+
else
|
|
135
|
+
{
|
|
136
|
+
push @{$$opts{desc}}, $desc;
|
|
137
|
+
}
|
|
138
|
+
next;
|
|
139
|
+
}
|
|
140
|
+
if ( $arg eq '-f' || $arg eq '--filter' )
|
|
141
|
+
{
|
|
142
|
+
my $filter = shift(@ARGV);
|
|
143
|
+
parse_filters($opts,$filter);
|
|
144
|
+
next;
|
|
145
|
+
}
|
|
146
|
+
if ( $arg eq '-c' || $arg eq '--columns' )
|
|
147
|
+
{
|
|
148
|
+
my $cols = shift(@ARGV);
|
|
149
|
+
$$opts{cols} = [ split(/,/,$cols) ];
|
|
150
|
+
next;
|
|
151
|
+
}
|
|
152
|
+
if ( $arg eq '-a' || $arg eq '--annotations' ) { $$opts{annotations} = shift(@ARGV); next }
|
|
153
|
+
if ( $arg eq '-t' || $arg eq '--tag' ) { $$opts{tag} = shift(@ARGV); next }
|
|
154
|
+
if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
|
|
155
|
+
if ( -e $arg ) { $$opts{file}=$arg; next; }
|
|
156
|
+
error("Unknown parameter \"$arg\". Run -h for help.\n");
|
|
157
|
+
}
|
|
158
|
+
if ( !exists($$opts{filters}) && !exists($$opts{udef_filters}) )
|
|
159
|
+
{
|
|
160
|
+
if ( !exists($$opts{annotations}) ) { error("Missing the -a or -f option.\n") }
|
|
161
|
+
}
|
|
162
|
+
if ( exists($$opts{annotations}) && !exists($$opts{cols}) ) { error("Missing the -c option.\n"); }
|
|
163
|
+
return $opts;
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
sub parse_user_defined_filters
|
|
167
|
+
{
|
|
168
|
+
my ($opts,$str) = @_;
|
|
169
|
+
my $filters = [ do $str ];
|
|
170
|
+
for my $filter (@$filters)
|
|
171
|
+
{
|
|
172
|
+
if ( !exists($$filter{tag}) ) { error("Missing 'tag' key for one of the filters in $str\n"); }
|
|
173
|
+
if ( !($$filter{tag}=~m{^INFO/(.+)$}) ) { error("Currently only INFO tags supported. Could not parse the tag [$$filter{tag}]\n"); }
|
|
174
|
+
my $ID = $1;
|
|
175
|
+
|
|
176
|
+
$$filter{ID} = $ID;
|
|
177
|
+
$$filter{tag_re} = qr{$ID=?([^\t;]*)};
|
|
178
|
+
|
|
179
|
+
if ( !exists($$filter{name}) ) { error("Missing 'name' key for the filter [$$filter{tag}]\n"); }
|
|
180
|
+
if ( !exists($$filter{desc}) ) { error("Missing 'desc' key for the filter [$$filter{tag}]\n"); }
|
|
181
|
+
|
|
182
|
+
my $name = $$filter{name};
|
|
183
|
+
push @{$$opts{desc}}, "key=FILTER,ID=$name,Description='$$filter{desc}";
|
|
184
|
+
if ( !exists($$filter{apply_to}) or lc($$filter{apply_to}) eq 'all' )
|
|
185
|
+
{
|
|
186
|
+
$$opts{udef_filters}{'all'}{$name} = $filter;
|
|
187
|
+
$$opts{udef_filters}{'s'}{$name} = $filter;
|
|
188
|
+
$$opts{udef_filters}{'i'}{$name} = $filter;
|
|
189
|
+
}
|
|
190
|
+
elsif ( exists($$filter{apply_to}) and lc($$filter{apply_to}) eq 'snps' )
|
|
191
|
+
{
|
|
192
|
+
$$opts{udef_filters}{'s'}{$name} = $filter;
|
|
193
|
+
$$opts{udef_filters_typecheck_needed} = 1;
|
|
194
|
+
}
|
|
195
|
+
elsif ( exists($$filter{apply_to}) and lc($$filter{apply_to}) eq 'indels' )
|
|
196
|
+
{
|
|
197
|
+
$$opts{udef_filters}{'i'}{$name} = $filter;
|
|
198
|
+
$$opts{udef_filters_typecheck_needed} = 1;
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
sub parse_filters
|
|
204
|
+
{
|
|
205
|
+
my ($opts,$str) = @_;
|
|
206
|
+
|
|
207
|
+
if ( -e $str )
|
|
208
|
+
{
|
|
209
|
+
parse_user_defined_filters($opts,$str);
|
|
210
|
+
return;
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
my $has_filters = 0;
|
|
214
|
+
my $set_defaults = 0;
|
|
215
|
+
my @filters = split(m{/},$str);
|
|
216
|
+
for my $fltr (@filters)
|
|
217
|
+
{
|
|
218
|
+
if ( $fltr eq '+' ) { $set_defaults=1; last; }
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
my %mapping;
|
|
222
|
+
for my $flt (keys %filters)
|
|
223
|
+
{
|
|
224
|
+
if ( exists($mapping{$filters{$flt}{nick}}) ) { error("FIXME: the nick $filters{$flt}{nick} is not unique.\n"); }
|
|
225
|
+
$mapping{$filters{$flt}{nick}} = $flt;
|
|
226
|
+
|
|
227
|
+
if ( !defined($filters{$flt}{dflt}) ) { next; }
|
|
228
|
+
if ( $set_defaults )
|
|
229
|
+
{
|
|
230
|
+
$$opts{filters}{$flt} = $filters{$flt}{dflt};
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
for my $filter (@filters)
|
|
235
|
+
{
|
|
236
|
+
my ($key,$val) = split(/=/,$filter);
|
|
237
|
+
if ( $key eq '+' ) { next; }
|
|
238
|
+
my $to_be_deleted = 0;
|
|
239
|
+
if ( $key=~/^-(.+)$/ ) { $to_be_deleted=1; $key = $1; }
|
|
240
|
+
if ( !exists($filters{$key}) ) { $key = $mapping{$key}; }
|
|
241
|
+
if ( !exists($filters{$key}) && !exists($mapping{$key}) ) { error("The filter [$key] not recognised.\n"); }
|
|
242
|
+
if ( $to_be_deleted ) { delete($$opts{filters}{$key}); next; }
|
|
243
|
+
|
|
244
|
+
if ( $key eq 'c' || $key eq 'SnpCluster' )
|
|
245
|
+
{
|
|
246
|
+
($$opts{SnpCluster_count},$$opts{SnpCluster_win}) = split(/,/,$val);
|
|
247
|
+
|
|
248
|
+
# Simple sanity check
|
|
249
|
+
if ( $$opts{SnpCluster_count}>$$opts{SnpCluster_win} )
|
|
250
|
+
{
|
|
251
|
+
error("Did you really mean snpCluster=$$opts{SnpCluster_count},$$opts{SnpCluster_win}? The win (INT2) must be bigger or equal to count (INT1).");
|
|
252
|
+
}
|
|
253
|
+
$$opts{SnpCluster_buffer} = [];
|
|
254
|
+
push @{$$opts{desc}}, "key=FILTER,ID=SnpCluster,Description='$filters{SnpCluster}{desc} [win=$$opts{SnpCluster_win},count=$$opts{SnpCluster_count}]'";
|
|
255
|
+
$has_filters = 1;
|
|
256
|
+
next;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
$$opts{filters}{$key} = $val;
|
|
260
|
+
$has_filters = 1;
|
|
261
|
+
}
|
|
262
|
+
for my $key (keys %{$$opts{filters}})
|
|
263
|
+
{
|
|
264
|
+
push @{$$opts{desc}}, "key=FILTER,ID=$key,Description='$filters{$key}{desc}" . (defined $$opts{filters}{$key} ? " [$$opts{filters}{$key}]'" : "'");
|
|
265
|
+
}
|
|
266
|
+
if ( !$has_filters && !scalar keys %{$$opts{filters}} ) { delete($$opts{filters}); }
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
# Convert text descriptions given on command line to hashes which will be
|
|
271
|
+
# passed to Vcf::add_header_line
|
|
272
|
+
sub parse_descriptions
|
|
273
|
+
{
|
|
274
|
+
my ($descs) = @_;
|
|
275
|
+
my @out;
|
|
276
|
+
for my $str (@$descs)
|
|
277
|
+
{
|
|
278
|
+
my $desc = {};
|
|
279
|
+
my $tmp = $str;
|
|
280
|
+
while ($tmp)
|
|
281
|
+
{
|
|
282
|
+
my ($key,$value);
|
|
283
|
+
if ( $tmp=~/^([^=]+)=["']([^\"]+)["']/ ) { $key=$1; $value=$2; }
|
|
284
|
+
elsif ( $tmp=~/^([^=]+)=([^,"]+)/ && $1 eq 'Description' )
|
|
285
|
+
{
|
|
286
|
+
# The command line eats the quotes
|
|
287
|
+
$key=$1; $value=$2.$';
|
|
288
|
+
$$desc{$key} = $value;
|
|
289
|
+
last;
|
|
290
|
+
}
|
|
291
|
+
elsif ( $tmp=~/^([^=]+)=([^,"]+)/ )
|
|
292
|
+
{
|
|
293
|
+
$key=$1; $value=$2;
|
|
294
|
+
}
|
|
295
|
+
else { error(qq[Could not parse the description: [$str]\n]); }
|
|
296
|
+
$$desc{$key} = $value;
|
|
297
|
+
|
|
298
|
+
$tmp = $';
|
|
299
|
+
if ( $tmp=~/^,/ ) { $tmp = $'; }
|
|
300
|
+
|
|
301
|
+
}
|
|
302
|
+
if ( !exists($$desc{ID}) ) { error("No ID in description? [$str]\n"); }
|
|
303
|
+
push @out, $desc;
|
|
304
|
+
}
|
|
305
|
+
return \@out;
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
# Create mapping from the annotation IDs to column indexes. The mapping is used
|
|
309
|
+
# to determine which columns should be used from the annotation file. The
|
|
310
|
+
# following structure is returned:
|
|
311
|
+
# {
|
|
312
|
+
# CHROM => col_idx,
|
|
313
|
+
# FROM => col_idx,
|
|
314
|
+
# TO => col_idx,
|
|
315
|
+
# annots =>
|
|
316
|
+
# [
|
|
317
|
+
# { col=>col_idx, id=>annot_id, vcf_col=>vcf_column, is_flag=>0 },
|
|
318
|
+
# ]
|
|
319
|
+
# }
|
|
320
|
+
# If {annots}{is_flag} is nonzero, "annot_id" will be written to VCF instead of "annot_id=value".
|
|
321
|
+
# Currently only one VCF column (INFO) is supported.
|
|
322
|
+
#
|
|
323
|
+
sub parse_columns
|
|
324
|
+
{
|
|
325
|
+
my ($cols,$descriptions) = @_;
|
|
326
|
+
|
|
327
|
+
my %desc = ();
|
|
328
|
+
my %out = ( annots=>[] );
|
|
329
|
+
|
|
330
|
+
if ( !defined $cols ) { return \%out; }
|
|
331
|
+
|
|
332
|
+
for my $d (@$descriptions)
|
|
333
|
+
{
|
|
334
|
+
$desc{$$d{key}.'/'.$$d{ID}} = $d;
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
for (my $i=0; $i<@$cols; $i++)
|
|
338
|
+
{
|
|
339
|
+
my $col = $$cols[$i];
|
|
340
|
+
|
|
341
|
+
if ( $col eq '-' ) { next; }
|
|
342
|
+
elsif ( $col eq 'CHROM' ) { $out{$col}=$i; }
|
|
343
|
+
elsif ( $col eq 'FROM' ) { $out{$col}=$i; }
|
|
344
|
+
elsif ( $col eq 'POS' ) { $out{'FROM'}=$i; }
|
|
345
|
+
elsif ( $col eq 'TO' ) { $out{$col}=$i; }
|
|
346
|
+
elsif ( $col eq 'ID' ) { $out{$col}=$i; }
|
|
347
|
+
elsif ( $col eq 'REF' ) { $out{$col}=$i; }
|
|
348
|
+
elsif ( $col eq 'ALT' ) { $out{$col}=$i; }
|
|
349
|
+
else
|
|
350
|
+
{
|
|
351
|
+
if ( !exists($desc{$col}) && exists($desc{"INFO/$col"}) )
|
|
352
|
+
{
|
|
353
|
+
print STDERR qq[The description for "$col" does not exist, assuming "INFO/$col"\n];
|
|
354
|
+
$col = "INFO/$col";
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
if ( !exists($desc{$col}))
|
|
358
|
+
{
|
|
359
|
+
error("Missing the -d parameter for the column [$col]\n");
|
|
360
|
+
}
|
|
361
|
+
if ( !($col=~m{^(.+)/(.+)$}) ) { error("Could not parse the column [$col].\n"); }
|
|
362
|
+
my $key = $1;
|
|
363
|
+
my $id = $2;
|
|
364
|
+
my $rec = { col=>$i, id=>$id, vcf_col=>$key, is_flag=>($desc{$col}{Type} eq 'Flag' ? 1 : 0) };
|
|
365
|
+
push @{$out{annots}}, $rec;
|
|
366
|
+
if ( $key ne 'INFO' ) { error("TODO: other than INFO columns\n"); }
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
if ( !exists($out{CHROM}) ) { $out{CHROM}=0; }
|
|
370
|
+
if ( !exists($out{FROM}) ) { $out{FROM}=1; }
|
|
371
|
+
if ( !exists($out{TO}) ) { $out{TO}=$out{FROM}; }
|
|
372
|
+
if ( exists($out{REF}) && !exists($out{ALT}) or !exists($out{REF}) && exists($out{ALT}) ) { error("Expected both REF and ALT columns in the annotation file.\n"); }
|
|
373
|
+
return \%out;
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
sub annotate
|
|
377
|
+
{
|
|
378
|
+
my ($opts) = @_;
|
|
379
|
+
|
|
380
|
+
# Init the variables
|
|
381
|
+
my $descs = parse_descriptions($$opts{desc});
|
|
382
|
+
my $cols = parse_columns($$opts{cols},$descs);
|
|
383
|
+
|
|
384
|
+
# Open VCF file and add all required header lines
|
|
385
|
+
my $vcf = $$opts{vcf} = exists($$opts{file}) ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN);
|
|
386
|
+
$vcf->parse_header();
|
|
387
|
+
for my $desc (@$descs)
|
|
388
|
+
{
|
|
389
|
+
$vcf->add_header_line($desc,silent=>1);
|
|
390
|
+
}
|
|
391
|
+
$vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp');
|
|
392
|
+
print $vcf->format_header();
|
|
393
|
+
|
|
394
|
+
my ($prev_chr,$prev_pos,$annot_from,$annot_to,$annot_line);
|
|
395
|
+
my @annots = @{$$cols{annots}};
|
|
396
|
+
my $id_col = exists($$cols{ID}) ? $$cols{ID} : undef;
|
|
397
|
+
my $from_col = $$cols{FROM};
|
|
398
|
+
my $to_col = $$cols{TO};
|
|
399
|
+
my $ref_col = exists($$cols{REF}) ? $$cols{REF} : undef;
|
|
400
|
+
my $alt_col = exists($$cols{ALT}) ? $$cols{ALT} : undef;
|
|
401
|
+
|
|
402
|
+
# Initialize the annotation reader
|
|
403
|
+
my $reader;
|
|
404
|
+
if ( exists($$opts{annotations}) )
|
|
405
|
+
{
|
|
406
|
+
$reader = Reader->new(file=>$$opts{annotations});
|
|
407
|
+
my $line = $vcf->next_line();
|
|
408
|
+
if ( !defined $line )
|
|
409
|
+
{
|
|
410
|
+
# VCF file is empty
|
|
411
|
+
undef $reader;
|
|
412
|
+
}
|
|
413
|
+
else
|
|
414
|
+
{
|
|
415
|
+
my @rec = split(/\t/,$line);
|
|
416
|
+
$prev_chr = $rec[0];
|
|
417
|
+
$prev_pos = $rec[1];
|
|
418
|
+
$vcf->_unread_line($line);
|
|
419
|
+
$reader->open(region=>$prev_chr);
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
while (defined $reader)
|
|
424
|
+
{
|
|
425
|
+
# Read next annotation group, i.e. all records with the same position (or overlapping in case of intervals)
|
|
426
|
+
my (@annot_lines,$annot_prev_from,$annot_prev_to);
|
|
427
|
+
while ($reader)
|
|
428
|
+
{
|
|
429
|
+
my $annot_line = $reader->next_line();
|
|
430
|
+
if ( !defined $annot_line ) { last; }
|
|
431
|
+
my $annot_from = $$annot_line[$from_col];
|
|
432
|
+
my $annot_to = $$annot_line[$to_col];
|
|
433
|
+
if ( !@annot_lines )
|
|
434
|
+
{
|
|
435
|
+
push @annot_lines, $annot_line;
|
|
436
|
+
$annot_prev_from = $annot_from;
|
|
437
|
+
$annot_prev_to = $annot_to;
|
|
438
|
+
next;
|
|
439
|
+
}
|
|
440
|
+
if ( $annot_from <= $annot_prev_to or $annot_to <= $annot_prev_to )
|
|
441
|
+
{
|
|
442
|
+
push @annot_lines, $annot_line;
|
|
443
|
+
if ( $annot_prev_to < $annot_to ) { $annot_prev_to = $annot_to; }
|
|
444
|
+
next;
|
|
445
|
+
}
|
|
446
|
+
$reader->unread_line($annot_line);
|
|
447
|
+
last;
|
|
448
|
+
}
|
|
449
|
+
|
|
450
|
+
# Now loop through the VCF records
|
|
451
|
+
my $line;
|
|
452
|
+
while ($line = $vcf->next_line())
|
|
453
|
+
{
|
|
454
|
+
my @rec = split(/\t/,$line);
|
|
455
|
+
my $chr = $rec[0];
|
|
456
|
+
my $pos = $rec[1];
|
|
457
|
+
chomp($rec[-1]);
|
|
458
|
+
|
|
459
|
+
if ( $chr ne $prev_chr )
|
|
460
|
+
{
|
|
461
|
+
$vcf->_unread_line($line);
|
|
462
|
+
$prev_chr = $chr;
|
|
463
|
+
$reader->open(region=>$prev_chr);
|
|
464
|
+
last;
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
# Quick position-based check: Is there an annotation for this record?
|
|
468
|
+
if ( !defined $annot_prev_from or $pos < $annot_prev_from )
|
|
469
|
+
{
|
|
470
|
+
output_line($opts,\@rec);
|
|
471
|
+
next;
|
|
472
|
+
}
|
|
473
|
+
if ( $pos > $annot_prev_to )
|
|
474
|
+
{
|
|
475
|
+
$vcf->_unread_line($line);
|
|
476
|
+
last;
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
# Initialize the REF,ALT-based check
|
|
480
|
+
my ($ref,%alt);
|
|
481
|
+
if ( defined $alt_col )
|
|
482
|
+
{
|
|
483
|
+
$ref = $rec[3];
|
|
484
|
+
%alt = map { $_=>1 } split(/,/,$rec[4]);
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
# Now fill the annotations; Existing annotations with the same tag will be overwritten
|
|
488
|
+
my %values = ();
|
|
489
|
+
my %ids = ();
|
|
490
|
+
for my $annot_line (@annot_lines)
|
|
491
|
+
{
|
|
492
|
+
# Skip annotation lines which are not relevant to this VCF record
|
|
493
|
+
if ( $$annot_line[$from_col] > $pos or $$annot_line[$to_col] < $pos ) { next; }
|
|
494
|
+
if ( defined $alt_col && $$annot_line[$ref_col] ne '.' )
|
|
495
|
+
{
|
|
496
|
+
if ( $$annot_line[$ref_col] ne $ref ) { next; }
|
|
497
|
+
if ( !exists($alt{$$annot_line[$alt_col]}) ) { next; }
|
|
498
|
+
}
|
|
499
|
+
for my $info (@annots)
|
|
500
|
+
{
|
|
501
|
+
my $val = $$annot_line[$$info{col}];
|
|
502
|
+
chomp($val);
|
|
503
|
+
|
|
504
|
+
if ( $val eq '' or $val eq '.' ) { $val=undef; } # Existing annotation should be removed
|
|
505
|
+
elsif ( $$info{is_flag} )
|
|
506
|
+
{
|
|
507
|
+
if ( $val ) { $val=''; } # Flag annotation should be added
|
|
508
|
+
else { $val=undef; } # Flag annotation should be removed
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
# Only single undef values can be present are overriden if defined in other records
|
|
512
|
+
if ( !defined $val && exists($values{$$info{id}}) ) { next; }
|
|
513
|
+
elsif ( exists($values{$$info{id}}) && !defined $values{$$info{id}}[0] )
|
|
514
|
+
{
|
|
515
|
+
$values{$$info{id}}[0] = $val;
|
|
516
|
+
next;
|
|
517
|
+
}
|
|
518
|
+
push @{$values{$$info{id}}}, $val;
|
|
519
|
+
}
|
|
520
|
+
if ( defined $id_col && $$annot_line[$id_col] ne '' ) { $ids{$$annot_line[$id_col]} = 1; }
|
|
521
|
+
}
|
|
522
|
+
if ( scalar keys %ids ) { $rec[2] = join(';', keys %ids); }
|
|
523
|
+
if ( scalar keys %values )
|
|
524
|
+
{
|
|
525
|
+
for my $key (keys %values)
|
|
526
|
+
{
|
|
527
|
+
# Cannot use join on undef values
|
|
528
|
+
$values{$key} = scalar @{$values{$key}} == 1 ? $values{$key}[0] : join(',', @{$values{$key}});
|
|
529
|
+
}
|
|
530
|
+
$rec[7] = $vcf->add_info_field($rec[7],%values);
|
|
531
|
+
}
|
|
532
|
+
output_line($opts,\@rec);
|
|
533
|
+
}
|
|
534
|
+
if ( !defined $line ) { last; }
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
# Finish the VCF, no annotations for this part
|
|
538
|
+
while (my $line=$vcf->next_line)
|
|
539
|
+
{
|
|
540
|
+
my @rec = split(/\t/,$line);
|
|
541
|
+
chomp($rec[-1]);
|
|
542
|
+
output_line($opts,\@rec);
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
# Output any lines left in the buffer
|
|
546
|
+
output_line($opts);
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
# Stage the lines and then apply filtering if requested, otherwise just print the line
|
|
551
|
+
sub output_line
|
|
552
|
+
{
|
|
553
|
+
my ($opts,$line) = @_;
|
|
554
|
+
|
|
555
|
+
if ( !exists($$opts{filters}) && !exists($$opts{udef_filters}) )
|
|
556
|
+
{
|
|
557
|
+
# No filters requested, print the line
|
|
558
|
+
print_line($line);
|
|
559
|
+
return;
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
if ( defined $line )
|
|
563
|
+
{
|
|
564
|
+
# Local filters return the line back immediately
|
|
565
|
+
if ( scalar keys %{$$opts{filters}} )
|
|
566
|
+
{
|
|
567
|
+
$line = apply_local_filters($opts,$line);
|
|
568
|
+
}
|
|
569
|
+
if ( exists($$opts{udef_filters}) )
|
|
570
|
+
{
|
|
571
|
+
$line = apply_user_defined_filters($opts,$line);
|
|
572
|
+
}
|
|
573
|
+
}
|
|
574
|
+
|
|
575
|
+
# Staging filters may return nothing or multiple lines. If $line is not defined, they will
|
|
576
|
+
# empty the buffers
|
|
577
|
+
my @lines;
|
|
578
|
+
if ( exists($$opts{filters}{SnpGap}) )
|
|
579
|
+
{
|
|
580
|
+
@lines = apply_snpgap_filter($opts,$line);
|
|
581
|
+
if ( defined $line && !scalar @lines ) { return; }
|
|
582
|
+
}
|
|
583
|
+
elsif ( defined $line ) { @lines=($line); }
|
|
584
|
+
|
|
585
|
+
if ( exists($$opts{filters}{GapWin}) )
|
|
586
|
+
{
|
|
587
|
+
my @tmp;
|
|
588
|
+
if ( !defined $line ) { push @lines,undef; }
|
|
589
|
+
for my $line (@lines)
|
|
590
|
+
{
|
|
591
|
+
push @tmp, apply_gapwin_filter($opts,$line);
|
|
592
|
+
}
|
|
593
|
+
@lines = @tmp;
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
if ( exists($$opts{SnpCluster_count}) )
|
|
597
|
+
{
|
|
598
|
+
my @tmp;
|
|
599
|
+
if ( !defined $line ) { push @lines,undef; }
|
|
600
|
+
for my $line (@lines)
|
|
601
|
+
{
|
|
602
|
+
push @tmp, apply_snpcluster_filter($opts,$line);
|
|
603
|
+
}
|
|
604
|
+
@lines = @tmp;
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
for my $line (@lines)
|
|
608
|
+
{
|
|
609
|
+
print_line($line);
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
sub apply_user_defined_filters
|
|
614
|
+
{
|
|
615
|
+
my ($opts,$line) = @_;
|
|
616
|
+
|
|
617
|
+
our($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF);
|
|
618
|
+
$CHROM = $$line[0];
|
|
619
|
+
$POS = $$line[1];
|
|
620
|
+
$FAIL = 1;
|
|
621
|
+
$PASS = 0;
|
|
622
|
+
$RECORD = $line;
|
|
623
|
+
$VCF = $$opts{vcf};
|
|
624
|
+
|
|
625
|
+
my %filters = ();
|
|
626
|
+
if ( $$opts{udef_filters_typecheck_needed} )
|
|
627
|
+
{
|
|
628
|
+
# Check if the line has an indel, SNP or both
|
|
629
|
+
for my $alt (split(/,/,$$line[4]))
|
|
630
|
+
{
|
|
631
|
+
my ($type,$len,$ht) = $$opts{vcf}->event_type($$line[3],$alt);
|
|
632
|
+
if ( exists($$opts{udef_filters}{$type}) )
|
|
633
|
+
{
|
|
634
|
+
%filters = ( %filters, %{$$opts{udef_filters}{$type}} );
|
|
635
|
+
}
|
|
636
|
+
}
|
|
637
|
+
# Return if the line does not have the wanted variant type
|
|
638
|
+
if ( !scalar %filters ) { return $line; }
|
|
639
|
+
}
|
|
640
|
+
else
|
|
641
|
+
{
|
|
642
|
+
%filters = %{$$opts{udef_filters}{all}};
|
|
643
|
+
}
|
|
644
|
+
|
|
645
|
+
my %apply;
|
|
646
|
+
for my $filter (values %filters)
|
|
647
|
+
{
|
|
648
|
+
if ( !($$line[7]=~$$filter{tag_re}) ) { next; }
|
|
649
|
+
$MATCH = $1;
|
|
650
|
+
$apply{ $$filter{name} } = &{$$filter{test}} == $PASS ? 0 : 1;
|
|
651
|
+
}
|
|
652
|
+
if ( scalar keys %apply )
|
|
653
|
+
{
|
|
654
|
+
$$line[6] = $$opts{vcf}->add_filter($$line[6],%apply);
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
return $line;
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
sub apply_local_filters
|
|
661
|
+
{
|
|
662
|
+
my ($opts,$line) = @_;
|
|
663
|
+
|
|
664
|
+
if ( !defined $line ) { return; }
|
|
665
|
+
|
|
666
|
+
my $filters = $$opts{filters};
|
|
667
|
+
my %apply;
|
|
668
|
+
|
|
669
|
+
my ($dp,$dp_alt,$mq,$vdb);
|
|
670
|
+
if ( $$line[7]=~/DP4=(\d+),(\d+),(\d+),(\d+)/i )
|
|
671
|
+
{
|
|
672
|
+
$dp = $1 + $2 + $3 + $4;
|
|
673
|
+
$dp_alt = $3 + $4;
|
|
674
|
+
}
|
|
675
|
+
if ( $$line[7]=~/DP=(\d+)/i ) { $dp = $1; }
|
|
676
|
+
if ( $$line[7]=~/MQ=(\d+)/i ) { $mq = $1; }
|
|
677
|
+
if ( $$line[7]=~/VDB=([^;,\t]+)/i ) { $vdb = $1; }
|
|
678
|
+
|
|
679
|
+
if ( exists($$filters{RefN}) )
|
|
680
|
+
{
|
|
681
|
+
$apply{RefN} = ($$line[3]=~/N/) ? 1 : 0;
|
|
682
|
+
}
|
|
683
|
+
if ( exists($$filters{Qual}) && $$line[5] ne '.' )
|
|
684
|
+
{
|
|
685
|
+
$apply{Qual} = $$line[5] < $$filters{Qual} ? 1 : 0;
|
|
686
|
+
}
|
|
687
|
+
if ( exists($$filters{HWE}) && $$line[7]=~/G3=([^,]+),([^,]+),/ )
|
|
688
|
+
{
|
|
689
|
+
my ($rr,$ra);
|
|
690
|
+
$rr = $1;
|
|
691
|
+
$ra = $2;
|
|
692
|
+
$apply{HWE} = 0;
|
|
693
|
+
if ( $$line[7]=~/HWE=([^;\t]+)/ && $1<$$filters{HWE} )
|
|
694
|
+
{
|
|
695
|
+
my $p = 2*$rr + $ra;
|
|
696
|
+
if ( $p>0 && $p<1 && $ra/($p*(1-$p))>1 )
|
|
697
|
+
{
|
|
698
|
+
$apply{HWE} = 1;
|
|
699
|
+
}
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
if ( defined $dp_alt )
|
|
703
|
+
{
|
|
704
|
+
if ( exists($$filters{MinAB}) )
|
|
705
|
+
{
|
|
706
|
+
$apply{MinAB} = $dp_alt < $$filters{MinAB} ? 1 : 0;
|
|
707
|
+
}
|
|
708
|
+
}
|
|
709
|
+
if ( defined $vdb )
|
|
710
|
+
{
|
|
711
|
+
$apply{VDB} = $vdb < $$filters{VDB} ? 1 : 0;
|
|
712
|
+
}
|
|
713
|
+
if ( defined $dp )
|
|
714
|
+
{
|
|
715
|
+
if ( exists($$filters{MinDP}) )
|
|
716
|
+
{
|
|
717
|
+
$apply{MinDP} = $dp < $$filters{MinDP} ? 1 : 0;
|
|
718
|
+
}
|
|
719
|
+
if ( exists($$filters{MaxDP}) )
|
|
720
|
+
{
|
|
721
|
+
$apply{MaxDP} = $dp > $$filters{MaxDP} ? 1 : 0;
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
if ( defined $mq )
|
|
725
|
+
{
|
|
726
|
+
if ( exists($$filters{MinMQ}) )
|
|
727
|
+
{
|
|
728
|
+
$apply{MinMQ} = $mq < $$filters{MinMQ} ? 1 : 0;
|
|
729
|
+
}
|
|
730
|
+
}
|
|
731
|
+
if ( $$line[7]=~/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ )
|
|
732
|
+
{
|
|
733
|
+
if ( exists($$filters{StrandBias}) )
|
|
734
|
+
{
|
|
735
|
+
$apply{StrandBias} = $1 < $$filters{StrandBias} ? 1 : 0;
|
|
736
|
+
}
|
|
737
|
+
if ( exists($$filters{BaseQualBias}) )
|
|
738
|
+
{
|
|
739
|
+
$apply{BaseQualBias} = $2 < $$filters{BaseQualBias} ? 1 : 0;
|
|
740
|
+
}
|
|
741
|
+
if ( exists($$filters{MapQualBias}) )
|
|
742
|
+
{
|
|
743
|
+
$apply{MapQualBias} = $3 < $$filters{MapQualBias} ? 1 : 0;
|
|
744
|
+
}
|
|
745
|
+
if ( exists($$filters{EndDistBias}) )
|
|
746
|
+
{
|
|
747
|
+
$apply{EndDistBias} = $4 < $$filters{EndDistBias} ? 1 : 0;
|
|
748
|
+
}
|
|
749
|
+
}
|
|
750
|
+
if ( scalar keys %apply )
|
|
751
|
+
{
|
|
752
|
+
$$line[6] = $$opts{vcf}->add_filter($$line[6],%apply);
|
|
753
|
+
}
|
|
754
|
+
return $line;
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
sub apply_snpgap_filter
|
|
758
|
+
{
|
|
759
|
+
my ($opts,$line) = @_;
|
|
760
|
+
if ( !exists($$opts{SnpGap_buffer}) ) { $$opts{SnpGap_buffer}=[]; }
|
|
761
|
+
|
|
762
|
+
my $vcf = $$opts{vcf};
|
|
763
|
+
my $win = $$opts{filters}{SnpGap};
|
|
764
|
+
my $buffer = $$opts{SnpGap_buffer};
|
|
765
|
+
my ($indel_chr,$indel_pos,$to);
|
|
766
|
+
|
|
767
|
+
if ( defined $line )
|
|
768
|
+
{
|
|
769
|
+
# There may be multiple variants, look for an indel. Anything what is not ref can be filtered.
|
|
770
|
+
my $is_indel = 0;
|
|
771
|
+
my $can_be_filtered = 0;
|
|
772
|
+
for my $alt (split(/,/,$$line[4]))
|
|
773
|
+
{
|
|
774
|
+
my ($type,$len,$ht) = $vcf->event_type($$line[3],$alt);
|
|
775
|
+
if ( $type eq 'i' )
|
|
776
|
+
{
|
|
777
|
+
$is_indel = 1;
|
|
778
|
+
$indel_chr = $$line[0];
|
|
779
|
+
$indel_pos = $$line[1]+1;
|
|
780
|
+
}
|
|
781
|
+
elsif ( $type ne 'r' ) { $can_be_filtered = 1; }
|
|
782
|
+
}
|
|
783
|
+
# The indel boundaries are based on REF (POS+1,POS+rlen-1). This is not
|
|
784
|
+
# correct as the indel can begin anywhere in the VCF4.x record with
|
|
785
|
+
# respect to POS. Specifically mpileup likes to write REF=CAGAGAGAGA
|
|
786
|
+
# ALT=CAGAGAGAGAGA. Thus this filtering is more strict and may remove
|
|
787
|
+
# some valid SNPs.
|
|
788
|
+
$to = $is_indel ? $$line[1]+length($$line[3])-1 : $$line[1];
|
|
789
|
+
push @$buffer, { line=>$line, chr=>$$line[0], from=>defined $indel_pos ? $indel_pos : $$line[1], to=>$to, exclude=>0, can_be_filtered=>$can_be_filtered, is_indel=>$is_indel };
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
my $n = @$buffer;
|
|
793
|
+
|
|
794
|
+
# Is the new line an indel? If yes, check the distance to all previous lines
|
|
795
|
+
if ( defined $indel_chr )
|
|
796
|
+
{
|
|
797
|
+
for (my $i=0; $i<$n-1; $i++)
|
|
798
|
+
{
|
|
799
|
+
my $buf = $$buffer[$i];
|
|
800
|
+
if ( $$buf{chr} ne $indel_chr ) { next; }
|
|
801
|
+
if ( !$$buf{can_be_filtered} ) { next; }
|
|
802
|
+
if ( $$buf{is_indel} ) { next; }
|
|
803
|
+
if ( $$buf{to}>=$indel_pos-$win ) { $$buf{exclude}=1; }
|
|
804
|
+
}
|
|
805
|
+
}
|
|
806
|
+
|
|
807
|
+
if ( defined $line && $$buffer[0]{chr} eq $$buffer[-1]{chr} && $win+$$buffer[0]{to}>=$$buffer[-1]{from} )
|
|
808
|
+
{
|
|
809
|
+
# There are not enough rows in the buffer: the SnpGap window spans them all. Wait until there is more rows
|
|
810
|
+
# or a new chromosome
|
|
811
|
+
return ();
|
|
812
|
+
}
|
|
813
|
+
|
|
814
|
+
# 'Look-behind' filtering was done above, now comes 'look-ahead' filtering
|
|
815
|
+
my $indel_to;
|
|
816
|
+
for (my $i=0; $i<$n; $i++)
|
|
817
|
+
{
|
|
818
|
+
my $buf = $$buffer[$i];
|
|
819
|
+
if ( $$buf{is_indel} )
|
|
820
|
+
{
|
|
821
|
+
$indel_to = $$buf{to};
|
|
822
|
+
$indel_chr = $$buf{chr};
|
|
823
|
+
next;
|
|
824
|
+
}
|
|
825
|
+
if ( !defined $indel_to ) { next; }
|
|
826
|
+
if ( !$$buf{can_be_filtered} ) { next; }
|
|
827
|
+
if ( $$buf{chr} ne $indel_chr )
|
|
828
|
+
{
|
|
829
|
+
undef $indel_to;
|
|
830
|
+
next;
|
|
831
|
+
}
|
|
832
|
+
if ( $$buf{from}<=$indel_to+$win ) { $$buf{exclude}=1; }
|
|
833
|
+
}
|
|
834
|
+
|
|
835
|
+
# Output. If no $line was given, output everything
|
|
836
|
+
$to = $$buffer[-1]{from}-$win;
|
|
837
|
+
my $chr = $$buffer[-1]{chr};
|
|
838
|
+
my @out;
|
|
839
|
+
while (@$buffer)
|
|
840
|
+
{
|
|
841
|
+
if ( $$buffer[0]{chr} eq $chr && $$buffer[0]{to}+$win>=$to && defined $line ) { last; }
|
|
842
|
+
|
|
843
|
+
my $buf = shift(@$buffer);
|
|
844
|
+
if ( $$buf{exclude} )
|
|
845
|
+
{
|
|
846
|
+
$$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpGap'=>1);
|
|
847
|
+
}
|
|
848
|
+
else
|
|
849
|
+
{
|
|
850
|
+
$$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpGap'=>0);
|
|
851
|
+
}
|
|
852
|
+
push @out,$$buf{line};
|
|
853
|
+
}
|
|
854
|
+
return @out;
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
|
|
858
|
+
sub apply_gapwin_filter
|
|
859
|
+
{
|
|
860
|
+
my ($opts,$line) = @_;
|
|
861
|
+
if ( !exists($$opts{GapWin_buffer}) ) { $$opts{GapWin_buffer}=[]; }
|
|
862
|
+
|
|
863
|
+
my $vcf = $$opts{vcf};
|
|
864
|
+
my $win = $$opts{filters}{GapWin};
|
|
865
|
+
my $buffer = $$opts{GapWin_buffer};
|
|
866
|
+
my ($indel_chr,$indel_pos,$to);
|
|
867
|
+
|
|
868
|
+
if ( defined $line )
|
|
869
|
+
{
|
|
870
|
+
# There may be multiple variants, only indels can be filtered
|
|
871
|
+
my $is_indel = 0;
|
|
872
|
+
for my $alt (split(/,/,$$line[4]))
|
|
873
|
+
{
|
|
874
|
+
my ($type,$len,$ht) = $vcf->event_type($$line[3],$alt);
|
|
875
|
+
if ( $type eq 'i' )
|
|
876
|
+
{
|
|
877
|
+
$is_indel = 1;
|
|
878
|
+
$indel_chr = $$line[0];
|
|
879
|
+
$indel_pos = $$line[1] + 1;
|
|
880
|
+
}
|
|
881
|
+
}
|
|
882
|
+
# The indel boundaries are based on REF (POS+1,POS+rlen-1). This is not
|
|
883
|
+
# correct as the indel can begin anywhere in the VCF4.x record with
|
|
884
|
+
# respect to POS. Specifically mpileup likes to write REF=CAGAGAGAGA
|
|
885
|
+
# ALT=CAGAGAGAGAGA. Thus this filtering is more strict and may remove
|
|
886
|
+
# some valid SNPs.
|
|
887
|
+
$to = $is_indel ? $$line[1]+length($$line[3])-1 : $$line[1];
|
|
888
|
+
push @$buffer, { line=>$line, chr=>$$line[0], from=>defined $indel_pos ? $indel_pos : $$line[1], to=>$to, exclude=>0, is_indel=>$is_indel };
|
|
889
|
+
}
|
|
890
|
+
|
|
891
|
+
my $n = @$buffer;
|
|
892
|
+
|
|
893
|
+
# Is the new line an indel? If yes, check the distance to all previous lines
|
|
894
|
+
if ( defined $indel_chr )
|
|
895
|
+
{
|
|
896
|
+
for (my $i=0; $i<$n-1; $i++)
|
|
897
|
+
{
|
|
898
|
+
my $buf = $$buffer[$i];
|
|
899
|
+
if ( $$buf{chr} ne $indel_chr ) { next; }
|
|
900
|
+
if ( !$$buf{is_indel} ) { next; }
|
|
901
|
+
if ( $$buf{to}>=$indel_pos-$win ) { $$buf{exclude}=1; $$buffer[-1]{exclude}=1; }
|
|
902
|
+
}
|
|
903
|
+
}
|
|
904
|
+
|
|
905
|
+
if ( defined $line && $$buffer[0]{chr} eq $$buffer[-1]{chr} && $win+$$buffer[0]{to}>=$$buffer[-1]{from} )
|
|
906
|
+
{
|
|
907
|
+
# There are not enough rows in the buffer: the GapWin window spans them all. Wait until there is more rows
|
|
908
|
+
# or a new chromosome
|
|
909
|
+
return ();
|
|
910
|
+
}
|
|
911
|
+
|
|
912
|
+
# Output. If no $line was given, output everything
|
|
913
|
+
$to = $$buffer[-1]{from}-$win;
|
|
914
|
+
my $chr = $$buffer[-1]{chr};
|
|
915
|
+
my @out;
|
|
916
|
+
while (@$buffer)
|
|
917
|
+
{
|
|
918
|
+
if ( $$buffer[0]{chr} eq $chr && $$buffer[0]{to}+$win>=$to && defined $line ) { last; }
|
|
919
|
+
|
|
920
|
+
my $buf = shift(@$buffer);
|
|
921
|
+
if ( $$buf{exclude} )
|
|
922
|
+
{
|
|
923
|
+
$$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'GapWin'=>1);
|
|
924
|
+
}
|
|
925
|
+
else
|
|
926
|
+
{
|
|
927
|
+
$$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'GapWin'=>0);
|
|
928
|
+
}
|
|
929
|
+
push @out,$$buf{line};
|
|
930
|
+
}
|
|
931
|
+
return @out;
|
|
932
|
+
}
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
sub apply_snpcluster_filter
|
|
936
|
+
{
|
|
937
|
+
my ($opts,$line) = @_;
|
|
938
|
+
|
|
939
|
+
my $buffer = $$opts{SnpCluster_buffer};
|
|
940
|
+
my $n = @$buffer;
|
|
941
|
+
|
|
942
|
+
# The buffer is empty and the line contains only reference alleles, print directly
|
|
943
|
+
if ( $n==0 && defined $line && $$line[4] eq '.' )
|
|
944
|
+
{
|
|
945
|
+
$$line[6] = $$opts{vcf}->add_filter($$line[6],'SnpCluster'=>0);
|
|
946
|
+
return $line;
|
|
947
|
+
}
|
|
948
|
+
|
|
949
|
+
# Store the line in buffer and check how many lines can be printed
|
|
950
|
+
my $to; # All lines up to and including this index will be printed
|
|
951
|
+
my $win = $$opts{SnpCluster_win};
|
|
952
|
+
if ( defined $line )
|
|
953
|
+
{
|
|
954
|
+
# Exclude REF (and maybe also other filters?) form SnpCluster
|
|
955
|
+
my $can_be_filtered = $$line[4] eq '.' ? 0 : 1;
|
|
956
|
+
push @$buffer, { line=>$line, chr=>$$line[0], pos=>$$line[1], can_be_filtered=>$can_be_filtered, in_cluster=>0 };
|
|
957
|
+
$n++;
|
|
958
|
+
|
|
959
|
+
# Does the buffer hold enough lines now?
|
|
960
|
+
my $last_chr = $$buffer[-1]{chr};
|
|
961
|
+
my $last_pos = $$buffer[-1]{pos};
|
|
962
|
+
for (my $i=$n-1; $i>=0; $i--)
|
|
963
|
+
{
|
|
964
|
+
my $buf = $$buffer[$i];
|
|
965
|
+
if ( $$buf{chr} ne $last_chr ) { $to=$i; last; }
|
|
966
|
+
if ( $last_pos - $$buf{pos} >= $win ) { $to=$i; last; }
|
|
967
|
+
}
|
|
968
|
+
|
|
969
|
+
if ( !defined $to ) { return; }
|
|
970
|
+
}
|
|
971
|
+
if ( !defined $to ) { $to=$n-1; }
|
|
972
|
+
|
|
973
|
+
# Calculate the number of variants within the window
|
|
974
|
+
my $count = 0;
|
|
975
|
+
my $max_count = $$opts{SnpCluster_count};
|
|
976
|
+
my $start_chr = $$buffer[0]{chr};
|
|
977
|
+
my $start_pos = $$buffer[0]{pos};
|
|
978
|
+
my $idx;
|
|
979
|
+
for ($idx=0; $idx<$n; $idx++)
|
|
980
|
+
{
|
|
981
|
+
my $buf = $$buffer[$idx];
|
|
982
|
+
if ( $$buf{chr} ne $start_chr ) { last; }
|
|
983
|
+
if ( $$buf{pos} - $win >= $start_pos ) { last; }
|
|
984
|
+
if ( $$buf{can_be_filtered} ) { $count++; }
|
|
985
|
+
}
|
|
986
|
+
|
|
987
|
+
# If a SNP cluster was found, set the in_cluster flag for all relevant sites.
|
|
988
|
+
# The buffer will be flushed and the orphans would pass unnoticed.
|
|
989
|
+
if ( $count>=$max_count )
|
|
990
|
+
{
|
|
991
|
+
for (my $i=0; $i<$idx; $i++)
|
|
992
|
+
{
|
|
993
|
+
if ( $$buffer[$i]{can_be_filtered} ) { $$buffer[$i]{in_cluster}=1; }
|
|
994
|
+
}
|
|
995
|
+
}
|
|
996
|
+
|
|
997
|
+
# Now output the lines, adding or removing the filter
|
|
998
|
+
my @out = ();
|
|
999
|
+
for (my $i=0; $i<=$to; $i++)
|
|
1000
|
+
{
|
|
1001
|
+
my $buf = shift(@$buffer);
|
|
1002
|
+
if ( $$buf{in_cluster} )
|
|
1003
|
+
{
|
|
1004
|
+
$$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>1);
|
|
1005
|
+
}
|
|
1006
|
+
else
|
|
1007
|
+
{
|
|
1008
|
+
$$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>0);
|
|
1009
|
+
}
|
|
1010
|
+
push @out,$$buf{line};
|
|
1011
|
+
}
|
|
1012
|
+
|
|
1013
|
+
# Output all non-variant lines at the beggining of the buffer
|
|
1014
|
+
while (@$buffer)
|
|
1015
|
+
{
|
|
1016
|
+
if ( $$buffer[0]{can_be_filtered} ) { last; }
|
|
1017
|
+
my $buf = shift(@$buffer);
|
|
1018
|
+
$$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>0);
|
|
1019
|
+
push @out,$$buf{line};
|
|
1020
|
+
}
|
|
1021
|
+
return @out;
|
|
1022
|
+
}
|
|
1023
|
+
|
|
1024
|
+
sub print_line
|
|
1025
|
+
{
|
|
1026
|
+
my ($line) = @_;
|
|
1027
|
+
if ( defined $line ) { print join("\t",@$line) . "\n"; }
|
|
1028
|
+
}
|
|
1029
|
+
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
#---------------------------------
|
|
1033
|
+
|
|
1034
|
+
package Reader;
|
|
1035
|
+
|
|
1036
|
+
use strict;
|
|
1037
|
+
use warnings;
|
|
1038
|
+
use Carp;
|
|
1039
|
+
|
|
1040
|
+
sub new
|
|
1041
|
+
{
|
|
1042
|
+
my ($class,@args) = @_;
|
|
1043
|
+
my $self = @args ? {@args} : {};
|
|
1044
|
+
bless $self, ref($class) || $class;
|
|
1045
|
+
if ( !$$self{delim} ) { $$self{delim} = qr/\t/; }
|
|
1046
|
+
if ( !$$self{chr} ) { $$self{chr} = 0; } # the index of the chromosome column (indexed from 0)
|
|
1047
|
+
if ( !$$self{from} ) { $$self{from} = 1; } # the index of the from column
|
|
1048
|
+
if ( !$$self{to} ) { $$self{to} = 2; } # the index of the to column
|
|
1049
|
+
return $self;
|
|
1050
|
+
}
|
|
1051
|
+
|
|
1052
|
+
sub throw
|
|
1053
|
+
{
|
|
1054
|
+
my ($self,@msg) = @_;
|
|
1055
|
+
confess @msg;
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
sub open
|
|
1059
|
+
{
|
|
1060
|
+
my ($self,%args) = @_;
|
|
1061
|
+
if ( !$$self{file} ) { return; }
|
|
1062
|
+
$self->close();
|
|
1063
|
+
open($$self{fh},"tabix $$self{file} $args{region} |") or $self->throw("tabix $$self{file}: $!");
|
|
1064
|
+
}
|
|
1065
|
+
|
|
1066
|
+
sub close
|
|
1067
|
+
{
|
|
1068
|
+
my ($self) = @_;
|
|
1069
|
+
if ( !$$self{fh} ) { return; }
|
|
1070
|
+
close($$self{fh});
|
|
1071
|
+
delete($$self{fh});
|
|
1072
|
+
delete($$self{buffer});
|
|
1073
|
+
}
|
|
1074
|
+
|
|
1075
|
+
sub unread_line
|
|
1076
|
+
{
|
|
1077
|
+
my ($self,$line) = @_;
|
|
1078
|
+
unshift @{$$self{buffer}}, $line;
|
|
1079
|
+
return;
|
|
1080
|
+
}
|
|
1081
|
+
|
|
1082
|
+
sub next_line
|
|
1083
|
+
{
|
|
1084
|
+
my ($self) = @_;
|
|
1085
|
+
if ( !$$self{fh} ) { return undef; } # Run in dummy mode
|
|
1086
|
+
if ( $$self{buffer} && @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); }
|
|
1087
|
+
my $line;
|
|
1088
|
+
# Skip comments
|
|
1089
|
+
while (1)
|
|
1090
|
+
{
|
|
1091
|
+
$line = readline($$self{fh});
|
|
1092
|
+
if ( !defined $line ) { return undef; }
|
|
1093
|
+
if ( $line=~/^#/ ) { next; }
|
|
1094
|
+
last;
|
|
1095
|
+
}
|
|
1096
|
+
my @items = split($$self{delim},$line);
|
|
1097
|
+
return \@items;
|
|
1098
|
+
}
|
|
1099
|
+
|