ngs_server 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/ngs_server +72 -50
- data/ext/bamtools/extconf.rb +3 -3
- data/ext/vcftools/Makefile +28 -0
- data/ext/vcftools/README.txt +36 -0
- data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
- data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
- data/ext/vcftools/cpp/.svn/entries +708 -0
- data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
- data/ext/vcftools/cpp/Makefile +46 -0
- data/ext/vcftools/cpp/dgeev.cpp +146 -0
- data/ext/vcftools/cpp/dgeev.h +43 -0
- data/ext/vcftools/cpp/output_log.cpp +79 -0
- data/ext/vcftools/cpp/output_log.h +34 -0
- data/ext/vcftools/cpp/parameters.cpp +535 -0
- data/ext/vcftools/cpp/parameters.h +154 -0
- data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
- data/ext/vcftools/cpp/vcf_entry.h +190 -0
- data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
- data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
- data/ext/vcftools/cpp/vcf_file.cpp +495 -0
- data/ext/vcftools/cpp/vcf_file.h +184 -0
- data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
- data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
- data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
- data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
- data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
- data/ext/vcftools/cpp/vcftools.cpp +107 -0
- data/ext/vcftools/cpp/vcftools.h +25 -0
- data/ext/vcftools/examples/.svn/all-wcprops +185 -0
- data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
- data/ext/vcftools/examples/.svn/entries +1048 -0
- data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
- data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
- data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
- data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
- data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
- data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
- data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
- data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
- data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
- data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
- data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
- data/ext/vcftools/examples/annotate-test.vcf +37 -0
- data/ext/vcftools/examples/annotate.out +23 -0
- data/ext/vcftools/examples/annotate.txt +7 -0
- data/ext/vcftools/examples/annotate2.out +52 -0
- data/ext/vcftools/examples/annotate3.out +23 -0
- data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
- data/ext/vcftools/examples/cmp-test.out +53 -0
- data/ext/vcftools/examples/concat-a.vcf +21 -0
- data/ext/vcftools/examples/concat-b.vcf +13 -0
- data/ext/vcftools/examples/concat-c.vcf +19 -0
- data/ext/vcftools/examples/concat.out +39 -0
- data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
- data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
- data/ext/vcftools/examples/merge-test-a.vcf +17 -0
- data/ext/vcftools/examples/merge-test-b.vcf +17 -0
- data/ext/vcftools/examples/merge-test-c.vcf +15 -0
- data/ext/vcftools/examples/merge-test.vcf.out +31 -0
- data/ext/vcftools/examples/perl-api-1.pl +46 -0
- data/ext/vcftools/examples/query-test.out +6 -0
- data/ext/vcftools/examples/shuffle-test.vcf +12 -0
- data/ext/vcftools/examples/subset.SNPs.out +10 -0
- data/ext/vcftools/examples/subset.indels.out +18 -0
- data/ext/vcftools/examples/subset.vcf +21 -0
- data/ext/vcftools/examples/valid-3.3.vcf +30 -0
- data/ext/vcftools/examples/valid-4.0.vcf +34 -0
- data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
- data/ext/vcftools/examples/valid-4.1.vcf +37 -0
- data/ext/vcftools/extconf.rb +2 -0
- data/ext/vcftools/perl/.svn/all-wcprops +149 -0
- data/ext/vcftools/perl/.svn/entries +844 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
- data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
- data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
- data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
- data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
- data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
- data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
- data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
- data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
- data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
- data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
- data/ext/vcftools/perl/ChangeLog +84 -0
- data/ext/vcftools/perl/FaSlice.pm +214 -0
- data/ext/vcftools/perl/Makefile +12 -0
- data/ext/vcftools/perl/Vcf.pm +2853 -0
- data/ext/vcftools/perl/VcfStats.pm +681 -0
- data/ext/vcftools/perl/fill-aa +103 -0
- data/ext/vcftools/perl/fill-an-ac +56 -0
- data/ext/vcftools/perl/fill-ref-md5 +204 -0
- data/ext/vcftools/perl/tab-to-vcf +92 -0
- data/ext/vcftools/perl/test.t +376 -0
- data/ext/vcftools/perl/vcf-annotate +1099 -0
- data/ext/vcftools/perl/vcf-compare +1193 -0
- data/ext/vcftools/perl/vcf-concat +310 -0
- data/ext/vcftools/perl/vcf-convert +180 -0
- data/ext/vcftools/perl/vcf-fix-newlines +97 -0
- data/ext/vcftools/perl/vcf-isec +660 -0
- data/ext/vcftools/perl/vcf-merge +577 -0
- data/ext/vcftools/perl/vcf-query +286 -0
- data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
- data/ext/vcftools/perl/vcf-sort +79 -0
- data/ext/vcftools/perl/vcf-stats +160 -0
- data/ext/vcftools/perl/vcf-subset +206 -0
- data/ext/vcftools/perl/vcf-to-tab +112 -0
- data/ext/vcftools/perl/vcf-validator +145 -0
- data/ext/vcftools/website/.svn/all-wcprops +41 -0
- data/ext/vcftools/website/.svn/entries +238 -0
- data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
- data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
- data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
- data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
- data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
- data/ext/vcftools/website/Makefile +6 -0
- data/ext/vcftools/website/README +2 -0
- data/ext/vcftools/website/VCF-poster.pdf +0 -0
- data/ext/vcftools/website/default.css +250 -0
- data/ext/vcftools/website/favicon.ico +0 -0
- data/ext/vcftools/website/favicon.png +0 -0
- data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/img/.svn/entries +300 -0
- data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
- data/ext/vcftools/website/img/bg.gif +0 -0
- data/ext/vcftools/website/img/bgcode.gif +0 -0
- data/ext/vcftools/website/img/bgcontainer.gif +0 -0
- data/ext/vcftools/website/img/bgul.gif +0 -0
- data/ext/vcftools/website/img/header.gif +0 -0
- data/ext/vcftools/website/img/li.gif +0 -0
- data/ext/vcftools/website/img/quote.gif +0 -0
- data/ext/vcftools/website/img/search.gif +0 -0
- data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/src/.svn/entries +300 -0
- data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
- data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
- data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
- data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
- data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
- data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
- data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
- data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
- data/ext/vcftools/website/src/docs.inc +202 -0
- data/ext/vcftools/website/src/index.inc +52 -0
- data/ext/vcftools/website/src/index.php +80 -0
- data/ext/vcftools/website/src/license.inc +27 -0
- data/ext/vcftools/website/src/links.inc +13 -0
- data/ext/vcftools/website/src/options.inc +654 -0
- data/ext/vcftools/website/src/perl_module.inc +249 -0
- data/ext/vcftools/website/src/specs.inc +18 -0
- data/lib/config.ru +9 -0
- data/lib/ngs_server/add.rb +9 -0
- data/lib/ngs_server/version.rb +1 -1
- data/lib/ngs_server.rb +55 -3
- data/ngs_server.gemspec +5 -2
- metadata +296 -6
@@ -0,0 +1,1193 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# Author: petr.danecek@sanger
|
4
|
+
#
|
5
|
+
|
6
|
+
use strict;
|
7
|
+
use warnings;
|
8
|
+
use Carp;
|
9
|
+
use Vcf;
|
10
|
+
use FaSlice;
|
11
|
+
|
12
|
+
my $opts = parse_params();
|
13
|
+
if ( exists($$opts{plot}) )
|
14
|
+
{
|
15
|
+
plot_stats($opts);
|
16
|
+
}
|
17
|
+
else
|
18
|
+
{
|
19
|
+
compare_vcfs($opts);
|
20
|
+
}
|
21
|
+
|
22
|
+
exit;
|
23
|
+
|
24
|
+
#--------------------------------
|
25
|
+
|
26
|
+
sub error
|
27
|
+
{
|
28
|
+
my (@msg) = @_;
|
29
|
+
if ( scalar @msg )
|
30
|
+
{
|
31
|
+
croak @msg;
|
32
|
+
}
|
33
|
+
die
|
34
|
+
"About: Compare bgzipped and tabix indexed VCF files. (E.g. bgzip file.vcf; tabix -p vcf file.vcf.gz)\n",
|
35
|
+
"Usage: vcf-compare [OPTIONS] file1.vcf file2.vcf ...\n",
|
36
|
+
" vcf-compare -p plots chr1.cmp chr2.cmp ...\n",
|
37
|
+
"Options:\n",
|
38
|
+
" -c, --chromosomes <list|file> Same as -r, left for backward compatibility. Please do not use as it will be dropped in the future.\n",
|
39
|
+
" -d, --debug Debugging information. Giving the option multiple times increases verbosity\n",
|
40
|
+
" -g, --cmp-genotypes Compare genotypes, not only positions\n",
|
41
|
+
" -m, --name-mapping <list|file> Use with -g when comparing files with differing column names. The argument to this options is a\n",
|
42
|
+
" comma-separated list or one mapping per line in a file. The names are colon separated and must\n",
|
43
|
+
" appear in the same order as the files on the command line.\n",
|
44
|
+
" -p, --plot <prefix> Create plots. Multiple files (e.g. per-chromosome outputs from vcf-compare) can be given.\n",
|
45
|
+
" -R, --refseq <file> Compare the actual sequence, not just positions. Use with -w to compare indels.\n",
|
46
|
+
" -r, --regions <list|file> Process the given regions (comma-separated list or one region per line in a file).\n",
|
47
|
+
" -s, --samples <list|file> Process only the listed samples. Excluding unwanted samples may increase performance considerably.\n",
|
48
|
+
" -t, --title <string> Title for graphs (see also -p)\n",
|
49
|
+
" -w, --win <int> In repetitive sequences, the same indel can be called at different positions. Consider\n",
|
50
|
+
" records this far apart as matching (be it a SNP or an indel).\n",
|
51
|
+
" -h, -?, --help This help message.\n",
|
52
|
+
"\n";
|
53
|
+
}
|
54
|
+
|
55
|
+
|
56
|
+
sub parse_params
|
57
|
+
{
|
58
|
+
my $opts = { positions=>0 };
|
59
|
+
while (my $arg=shift(@ARGV))
|
60
|
+
{
|
61
|
+
if ( $arg eq '--all-samples-af' ) { $$opts{all_samples_af}=1; next; }
|
62
|
+
if ( $arg eq '--INFO/AF1-af' ) { $$opts{INFO_AF1_af}=1; next; }
|
63
|
+
if ( $arg eq '--ignore-indels' ) { $$opts{ignore_indels}=1; next; }
|
64
|
+
if ( $arg eq '--high-conf-gls' ) { $$opts{high_confidence_gls}=shift(@ARGV); next; }
|
65
|
+
if ( $arg eq '-m' || $arg eq '--name-mapping' ) { $$opts{mappings_list}=shift(@ARGV); next; }
|
66
|
+
if ( $arg eq '-R' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; }
|
67
|
+
if ( $arg eq '-c' || $arg eq '--chromosomes' ) { $$opts{regions_list}=shift(@ARGV); next; }
|
68
|
+
if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{regions_list}=shift(@ARGV); next; }
|
69
|
+
if ( $arg eq '-g' || $arg eq '--cmp-genotypes' ) { $$opts{cmp_genotypes}=1; next; }
|
70
|
+
if ( $arg eq '-s' || $arg eq '--samples' )
|
71
|
+
{
|
72
|
+
my $samples = shift(@ARGV);
|
73
|
+
my @samples = ( -e $samples ) ? read_list($samples) : split(/,/,$samples);
|
74
|
+
$$opts{samples} = \@samples;
|
75
|
+
next;
|
76
|
+
}
|
77
|
+
if ( $arg eq '-d' || $arg eq '--debug' ) { $$opts{debug}++; next; }
|
78
|
+
if ( $arg eq '-w' || $arg eq '--win' ) { $$opts{win}=shift(@ARGV); next; }
|
79
|
+
if ( $arg eq '-p' || $arg eq '--plot' ) { $$opts{plot}=shift(@ARGV); next; }
|
80
|
+
if ( $arg eq '-t' || $arg eq '--title' ) { $$opts{title}=shift(@ARGV); next; }
|
81
|
+
if ( -e $arg ) { push @{$$opts{files}}, $arg; next }
|
82
|
+
if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
|
83
|
+
error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
|
84
|
+
}
|
85
|
+
if ( !exists($$opts{files}) ) { error("What files should be compared?\n") }
|
86
|
+
return $opts;
|
87
|
+
}
|
88
|
+
|
89
|
+
sub read_list
|
90
|
+
{
|
91
|
+
my ($fname) = @_;
|
92
|
+
my @regions;
|
93
|
+
if ( -e $fname )
|
94
|
+
{
|
95
|
+
open(my $rgs,'<',$fname) or error("$fname: $!");
|
96
|
+
while (my $line=<$rgs>)
|
97
|
+
{
|
98
|
+
chomp($line);
|
99
|
+
push @regions, $line;
|
100
|
+
}
|
101
|
+
close($rgs);
|
102
|
+
}
|
103
|
+
else
|
104
|
+
{
|
105
|
+
@regions = split(/,/,$fname);
|
106
|
+
}
|
107
|
+
return (@regions);
|
108
|
+
}
|
109
|
+
|
110
|
+
sub read_mappings_list
|
111
|
+
{
|
112
|
+
my ($fname,$files) = @_;
|
113
|
+
my @maps = read_list($fname);
|
114
|
+
my %mapping;
|
115
|
+
for my $map (@maps)
|
116
|
+
{
|
117
|
+
my @items = split(/:/,$map);
|
118
|
+
if ( scalar @items != scalar @$files ) { error(sprintf "Expected %d column names, found [$map].\n", scalar @$files); }
|
119
|
+
for (my $i=1; $i<@$files; $i++)
|
120
|
+
{
|
121
|
+
$mapping{$$files[$i]}{$items[$i]} = $items[0];
|
122
|
+
warn("Using column name '$items[0]' for $$files[$i]:$items[$i]\n");
|
123
|
+
}
|
124
|
+
}
|
125
|
+
return \%mapping;
|
126
|
+
}
|
127
|
+
|
128
|
+
sub compare_vcfs
|
129
|
+
{
|
130
|
+
my ($opts) = @_;
|
131
|
+
|
132
|
+
$$opts{match} = {};
|
133
|
+
$$opts{hapls} = {};
|
134
|
+
|
135
|
+
# Open the VCF files and initialize the list of chromosomes
|
136
|
+
my @vcfs;
|
137
|
+
my (@regions,%has_chrom,$mappings);
|
138
|
+
if ( exists($$opts{regions_list}) ) { @regions = read_list($$opts{regions_list}); }
|
139
|
+
if ( exists($$opts{mappings_list}) ) { $mappings = read_mappings_list($$opts{mappings_list},$$opts{files}); }
|
140
|
+
|
141
|
+
print "# This file was generated by vcf-compare.\n#\n";
|
142
|
+
if ( $$opts{debug} )
|
143
|
+
{
|
144
|
+
print
|
145
|
+
"#SD Site discordance. Use `grep ^SD | cut -f 2-` to extract this part.\n",
|
146
|
+
"#SD The columns are: \n",
|
147
|
+
"#SD 1 .. chromosome\n",
|
148
|
+
"#SD 2 .. position\n",
|
149
|
+
"#SD 3 .. indicates matching (+) or mismatching (-) site\n",
|
150
|
+
"#SD 4 .. number of Hom_RR mismatches\n",
|
151
|
+
"#SD 5 .. number of Het_RA mismatches\n",
|
152
|
+
"#SD 6 .. number of Hom_AA mismatches\n";
|
153
|
+
}
|
154
|
+
|
155
|
+
my $ifile = 0;
|
156
|
+
for my $file (@{$$opts{files}})
|
157
|
+
{
|
158
|
+
my $vcf = Vcf->new(file=>$file);
|
159
|
+
$$vcf{vcf_compare_ID} = $ifile++;
|
160
|
+
$vcf->parse_header();
|
161
|
+
$vcf->close();
|
162
|
+
$$vcf{nread} = 0;
|
163
|
+
push @vcfs, $vcf;
|
164
|
+
|
165
|
+
# Update the list of known chromosomes
|
166
|
+
if ( !exists($$opts{regions_list}) )
|
167
|
+
{
|
168
|
+
my $chrms = $vcf->get_chromosomes();
|
169
|
+
for my $chr (@$chrms)
|
170
|
+
{
|
171
|
+
if ( exists($has_chrom{$chr}) ) { next; }
|
172
|
+
$has_chrom{$chr} = 1;
|
173
|
+
push @regions, $chr;
|
174
|
+
}
|
175
|
+
}
|
176
|
+
|
177
|
+
# Check if column names need to be renamed
|
178
|
+
if ( defined $mappings && exists($$mappings{$$vcf{file}}) )
|
179
|
+
{
|
180
|
+
$$vcf{_col_mapping} = $$mappings{$$vcf{file}};
|
181
|
+
for my $name (keys %{$$vcf{_col_mapping}})
|
182
|
+
{
|
183
|
+
if ( !exists($$vcf{has_column}{$name}) ) { error("No such column [$name] in the file $$vcf{file}\n"); }
|
184
|
+
my $new_name = $$vcf{_col_mapping}{$name};
|
185
|
+
$$vcf{_col_mapping_rev}{$new_name} = $name;
|
186
|
+
}
|
187
|
+
}
|
188
|
+
}
|
189
|
+
|
190
|
+
# Include only matching samples in haplotype comparison
|
191
|
+
if ( $$opts{cmp_genotypes} )
|
192
|
+
{
|
193
|
+
my %all_samples;
|
194
|
+
for my $vcf (@vcfs)
|
195
|
+
{
|
196
|
+
if ( exists $$opts{samples} )
|
197
|
+
{
|
198
|
+
for my $sample (@{$$opts{samples}})
|
199
|
+
{
|
200
|
+
if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; }
|
201
|
+
if ( exists($$vcf{has_column}{$sample}) ) { $all_samples{$sample}++ }
|
202
|
+
}
|
203
|
+
}
|
204
|
+
else
|
205
|
+
{
|
206
|
+
my @samples = $vcf->get_samples();
|
207
|
+
for my $sample (@samples)
|
208
|
+
{
|
209
|
+
if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; }
|
210
|
+
$all_samples{$sample}++
|
211
|
+
}
|
212
|
+
}
|
213
|
+
}
|
214
|
+
my @include_samples;
|
215
|
+
while (my ($sample,$count)=each %all_samples)
|
216
|
+
{
|
217
|
+
if ( $count != scalar @vcfs ) { next; }
|
218
|
+
push @include_samples, $sample;
|
219
|
+
}
|
220
|
+
if ( !@include_samples )
|
221
|
+
{
|
222
|
+
error("Error: There is no overlap between any of the samples, yet haplotype comparison was requested.\n");
|
223
|
+
}
|
224
|
+
$$opts{gt_samples_compared} = scalar @include_samples;
|
225
|
+
for my $vcf (@vcfs)
|
226
|
+
{
|
227
|
+
my @include;
|
228
|
+
if ( !exists($$vcf{_col_mapping}) ) { @include=@include_samples; }
|
229
|
+
else
|
230
|
+
{
|
231
|
+
for my $sample (@include_samples)
|
232
|
+
{
|
233
|
+
push @include, exists($$vcf{_col_mapping_rev}{$sample}) ? $$vcf{_col_mapping_rev}{$sample} : $sample
|
234
|
+
}
|
235
|
+
}
|
236
|
+
$vcf->set_samples(include=>\@include);
|
237
|
+
}
|
238
|
+
}
|
239
|
+
|
240
|
+
# Go through all the files simultaneously and get the stats.
|
241
|
+
for my $region (@regions)
|
242
|
+
{
|
243
|
+
# Open files
|
244
|
+
for my $vcf (@vcfs)
|
245
|
+
{
|
246
|
+
delete($$vcf{last_line});
|
247
|
+
$vcf->open(region=>$region,parse_header=>1);
|
248
|
+
delete($$vcf{eof});
|
249
|
+
}
|
250
|
+
do_region_stats($opts,\@vcfs);
|
251
|
+
}
|
252
|
+
|
253
|
+
report_stats($opts,\@vcfs);
|
254
|
+
|
255
|
+
for my $vcf (@vcfs)
|
256
|
+
{
|
257
|
+
if ( !$$vcf{nread} ) { warn("Warning: Read 0 lines from $$vcf{file}, the tabix index may be broken.\n"); }
|
258
|
+
}
|
259
|
+
}
|
260
|
+
|
261
|
+
sub report_stats
|
262
|
+
{
|
263
|
+
my ($opts,$vcfs) = @_;
|
264
|
+
|
265
|
+
# if ( $$opts{debug} )
|
266
|
+
# {
|
267
|
+
# use Data::Dumper; print Dumper($opts);
|
268
|
+
# }
|
269
|
+
|
270
|
+
my (@counts,%totals);
|
271
|
+
while (my ($key,$num) = each %{$$opts{match}})
|
272
|
+
{
|
273
|
+
my @files = split(/\s+/,$key);
|
274
|
+
for my $file (@files)
|
275
|
+
{
|
276
|
+
$totals{$file} += $num;
|
277
|
+
}
|
278
|
+
push @counts, {count=>$num, files=>[@files]};
|
279
|
+
}
|
280
|
+
|
281
|
+
print
|
282
|
+
"#VN 'Venn-Diagram Numbers'. Use `grep ^VN | cut -f 2-` to extract this part.\n",
|
283
|
+
"#VN The columns are: \n",
|
284
|
+
"#VN 1 .. number of sites unique to this particular combination of files\n",
|
285
|
+
"#VN 2- .. combination of files and space-separated number, a fraction of sites in the file\n";
|
286
|
+
for my $rec (sort {$$a{count}<=>$$b{count}} @counts)
|
287
|
+
{
|
288
|
+
my $num = $$rec{count};
|
289
|
+
my $files = $$rec{files};
|
290
|
+
|
291
|
+
print "VN\t$num";
|
292
|
+
for my $file (@$files)
|
293
|
+
{
|
294
|
+
printf "\t$file (%.1f%%)", $num*100./$totals{$file};
|
295
|
+
}
|
296
|
+
print "\n";
|
297
|
+
}
|
298
|
+
|
299
|
+
if ( $$opts{refseq} && $$opts{indels} )
|
300
|
+
{
|
301
|
+
print
|
302
|
+
"#IN Indel Numbers. Use `grep ^IN | cut -f 2-` to extract this part.\n",
|
303
|
+
"#IN .. todo\n",
|
304
|
+
"#IN Number of matching indel haplotypes shared across:\n";
|
305
|
+
while (my ($file,$stat) = each %{$$opts{indels}})
|
306
|
+
{
|
307
|
+
print "IN\t$file\n";
|
308
|
+
my $match = $$stat{match} ? $$stat{match} : 0;
|
309
|
+
my $mismatch = $$stat{mismatch} ? $$stat{mismatch} : 0;
|
310
|
+
printf "\t\tNumber of matches: %d\n", $match;
|
311
|
+
printf "\t\t mismatches: %d\n", $mismatch;
|
312
|
+
printf "\t\t error rate: %.1f%%\n", 100*$mismatch/($match+$mismatch);
|
313
|
+
}
|
314
|
+
}
|
315
|
+
|
316
|
+
print "#SN Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n";
|
317
|
+
printf "SN\tNumber of REF matches:\t%d\n", exists($$opts{ref_match}) ? $$opts{ref_match} : 0;
|
318
|
+
printf "SN\tNumber of ALT matches:\t%d\n", exists($$opts{alt_match}) ? $$opts{alt_match} : 0;
|
319
|
+
printf "SN\tNumber of REF mismatches:\t%d\n", exists($$opts{ref_mismatch}) ? $$opts{ref_mismatch} : 0;
|
320
|
+
printf "SN\tNumber of ALT mismatches:\t%d\n", exists($$opts{alt_mismatch}) ? $$opts{alt_mismatch} : 0;
|
321
|
+
printf "SN\tNumber of samples in GT comparison:\t%d\n", $$opts{gt_samples_compared} ? $$opts{gt_samples_compared} : 0;
|
322
|
+
|
323
|
+
my $out;
|
324
|
+
for my $vcf (@$vcfs)
|
325
|
+
{
|
326
|
+
if ( !exists($totals{$$vcf{file}}) ) { $totals{$$vcf{file}}=0; }
|
327
|
+
if ( $totals{$$vcf{file}} == $$vcf{nread} ) { next; }
|
328
|
+
|
329
|
+
my $diff = $$vcf{nread}-$totals{$$vcf{file}};
|
330
|
+
my $reported = $totals{$$vcf{file}};
|
331
|
+
my $total = $$vcf{nread};
|
332
|
+
$out .= sprintf "SN\t%d (%.1f%%) .. read %d, reported %d\t%s\n", $diff,$diff*100./$total,$total,$reported,$$vcf{file};
|
333
|
+
}
|
334
|
+
if ( $out )
|
335
|
+
{
|
336
|
+
print "SN\tNumber of sites lost due to grouping (e.g. duplicate sites)\n";
|
337
|
+
print $out;
|
338
|
+
}
|
339
|
+
|
340
|
+
|
341
|
+
if ( !$$opts{cmp_genotypes} ) { return; }
|
342
|
+
|
343
|
+
my %summary;
|
344
|
+
for my $id (keys %{$$opts{hapls}})
|
345
|
+
{
|
346
|
+
for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
|
347
|
+
{
|
348
|
+
if ( !exists($$opts{hapls}{$id}{$key.'gtype_mismatch'}) ) { $$opts{hapls}{$id}{$key.'gtype_mismatch'}=0; }
|
349
|
+
$$opts{hapls}{$id}{total_gtype_mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'};
|
350
|
+
|
351
|
+
if ( !exists($$opts{hapls}{$id}{$key.'gtype_match'}) ) { $$opts{hapls}{$id}{$key.'gtype_match'}=0; }
|
352
|
+
$$opts{hapls}{$id}{total_gtype_match} += $$opts{hapls}{$id}{$key.'gtype_match'};
|
353
|
+
|
354
|
+
if ( !exists($$opts{hapls}{$id}{$key.'gtype_lost'}) ) { $$opts{hapls}{$id}{$key.'gtype_lost'}=0; }
|
355
|
+
$$opts{hapls}{$id}{total_gtype_lost} += $$opts{hapls}{$id}{$key.'gtype_lost'};
|
356
|
+
|
357
|
+
if ( !exists($$opts{hapls}{$id}{$key.'gtype_gained'}) ) { $$opts{hapls}{$id}{$key.'gtype_gained'}=0; }
|
358
|
+
$$opts{hapls}{$id}{total_gtype_gained} += $$opts{hapls}{$id}{$key.'gtype_gained'};
|
359
|
+
|
360
|
+
$summary{$key}{match} += $$opts{hapls}{$id}{$key.'gtype_match'};
|
361
|
+
$summary{$key}{mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'};
|
362
|
+
}
|
363
|
+
for my $key qw(het_RA_ het_AA_)
|
364
|
+
{
|
365
|
+
if ( !exists($$opts{hapls}{$id}{$key.'phase_match'}) ) { $$opts{hapls}{$id}{$key.'phase_match'}=0; }
|
366
|
+
$$opts{hapls}{$id}{total_phase_match} += $$opts{hapls}{$id}{$key.'phase_match'};
|
367
|
+
|
368
|
+
if ( !exists($$opts{hapls}{$id}{$key.'phase_mismatch'}) ) { $$opts{hapls}{$id}{$key.'phase_mismatch'}=0; }
|
369
|
+
$$opts{hapls}{$id}{total_phase_mismatch} += $$opts{hapls}{$id}{$key.'phase_mismatch'};
|
370
|
+
|
371
|
+
if ( !exists($$opts{hapls}{$id}{$key.'phase_lost'}) ) { $$opts{hapls}{$id}{$key.'phase_lost'}=0; }
|
372
|
+
$$opts{hapls}{$id}{total_phase_lost} += $$opts{hapls}{$id}{$key.'phase_lost'};
|
373
|
+
}
|
374
|
+
}
|
375
|
+
print
|
376
|
+
"#GS Genotype Comparison Summary. Use `grep ^GS | cut -f 2-` to extract this part.\n",
|
377
|
+
"#GS The columns are:\n",
|
378
|
+
"#GS 1 .. variant type\n",
|
379
|
+
"#GS 2 .. number of mismatches\n",
|
380
|
+
"#GS 3 .. number of matches\n",
|
381
|
+
"#GS 4 .. discordance\n";
|
382
|
+
print_gs($opts,\%summary);
|
383
|
+
|
384
|
+
print
|
385
|
+
"\n",
|
386
|
+
"#GC Genotype Comparison. Use `grep ^GC | cut -f 2-` to extract this part.\n",
|
387
|
+
"#GC The columns are:\n",
|
388
|
+
"#GC 1 .. Sample\n",
|
389
|
+
"#GC 2-6 .. Gtype mismatches: total hom_RR hom_AA het_RA het_AA \n",
|
390
|
+
"#GC 7-9 .. Gtype lost: total het_RA het_AA \n",
|
391
|
+
"#GC 10-14 .. Gtype gained: total hom_RR hom_AA het_RA het_AA \n",
|
392
|
+
"#GC 15-17 .. Phase lost: total het_RA het_AA \n",
|
393
|
+
"#GC 18 .. Phase gained\n",
|
394
|
+
"#GC 19-23 .. Matching sites: total hom_RR hom_AA het_RA het_AA \n",
|
395
|
+
"#GC 24-26 .. Phased matches: total het_RA het_AA \n",
|
396
|
+
"#GC 27-29 .. Misphased matches: total het_RA het_AA \n";
|
397
|
+
|
398
|
+
for my $id (keys %{$$opts{hapls}})
|
399
|
+
{
|
400
|
+
print "GC\t$id";
|
401
|
+
for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_mismatch'}; }
|
402
|
+
for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_lost'}; }
|
403
|
+
for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_gained'}; }
|
404
|
+
for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_lost'}; }
|
405
|
+
if ( !exists($$opts{hapls}{$id}{phase_gained}) ) { $$opts{hapls}{$id}{phase_gained}=0; }
|
406
|
+
print "\t",$$opts{hapls}{$id}{phase_gained};
|
407
|
+
for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_match'}; }
|
408
|
+
for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_match'}; }
|
409
|
+
for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_mismatch'}; }
|
410
|
+
print "\n";
|
411
|
+
}
|
412
|
+
|
413
|
+
print
|
414
|
+
"#AF Number of matching and mismatching genotypes vs non-ref allele frequency. Use `^AF | cut -f 2-` to extract this part.\n",
|
415
|
+
"#AF The columns are:\n",
|
416
|
+
"#AF 1 .. Non-ref allele count\n",
|
417
|
+
"#AF 2 .. Hom(RR) matches\n",
|
418
|
+
"#AF 3 .. Het(RA) matches\n",
|
419
|
+
"#AF 4 .. Hom(AA) matches\n",
|
420
|
+
"#AF 5 .. Het(AA) matches\n",
|
421
|
+
"#AF 6 .. Hom(RR) mismatches\n",
|
422
|
+
"#AF 7 .. Het(RA) mismatches\n",
|
423
|
+
"#AF 8 .. Hom(AA) mismatches\n",
|
424
|
+
"#AF 9 .. Het(AA) mismatches\n";
|
425
|
+
for my $ac (sort {$a<=>$b} keys %{$$opts{counts_by_af}})
|
426
|
+
{
|
427
|
+
print "AF\t$ac";
|
428
|
+
for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
|
429
|
+
{
|
430
|
+
print "\t", $$opts{counts_by_af}{$ac}{$key}{matches} ? $$opts{counts_by_af}{$ac}{$key}{matches} : 0;
|
431
|
+
}
|
432
|
+
for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
|
433
|
+
{
|
434
|
+
print "\t", $$opts{counts_by_af}{$ac}{$key}{mismatches} ? $$opts{counts_by_af}{$ac}{$key}{mismatches} : 0;
|
435
|
+
}
|
436
|
+
print "\n";
|
437
|
+
}
|
438
|
+
|
439
|
+
print "#DP Counts by depth. Use `grep ^DP | cut -f 2-` to extract this part.\n";
|
440
|
+
print "#DP The columns are:\n";
|
441
|
+
print "#DP 1 .. depth\n";
|
442
|
+
print "#DP 2 .. RR matches\n";
|
443
|
+
print "#DP 3 .. RA matches\n";
|
444
|
+
print "#DP 4 .. AA matches\n";
|
445
|
+
print "#DP 5 .. RR -> RA mismatches\n";
|
446
|
+
print "#DP 6 .. RR -> AA mismatches\n";
|
447
|
+
print "#DP 7 .. RA -> RR mismatches\n";
|
448
|
+
print "#DP 8 .. RA -> AA mismatches\n";
|
449
|
+
print "#DP 9 .. AA -> RR mismatches\n";
|
450
|
+
print "#DP 10 .. AA -> RA mismatches\n";
|
451
|
+
for my $dp (sort {$a<=>$b} keys %{$$opts{counts_by_dp}})
|
452
|
+
{
|
453
|
+
print "DP\t$dp";
|
454
|
+
for my $type qw(hom_RR_-hom_RR_ het_RA_-het_RA_ hom_AA_-hom_AA_ hom_RR_-het_RA_ hom_RR_-hom_AA_ het_RA_-hom_RR_ het_RA_-hom_AA_ hom_AA_-hom_RR_ hom_AA_-het_RA_)
|
455
|
+
{
|
456
|
+
printf "\t%d", exists($$opts{counts_by_dp}{$dp}{$type}) ? $$opts{counts_by_dp}{$dp}{$type} : 0;
|
457
|
+
}
|
458
|
+
print "\n";
|
459
|
+
}
|
460
|
+
|
461
|
+
if ( $$opts{debug} )
|
462
|
+
{
|
463
|
+
print "#MT Mismatch Types\n";
|
464
|
+
for my $t1 (keys %{$$opts{mismatch_types}})
|
465
|
+
{
|
466
|
+
for my $t2 (keys %{$$opts{mismatch_types}{$t1}})
|
467
|
+
{
|
468
|
+
print "MT\t$t1\t$t2\t$$opts{mismatch_types}{$t1}{$t2}\n";
|
469
|
+
}
|
470
|
+
}
|
471
|
+
}
|
472
|
+
}
|
473
|
+
|
474
|
+
sub print_gs
|
475
|
+
{
|
476
|
+
my ($opts,$stats) = @_;
|
477
|
+
my ($ndr_ms,$ndr_m);
|
478
|
+
for my $key qw(hom_RR het_RA hom_AA het_AA)
|
479
|
+
{
|
480
|
+
my $m = $$stats{"${key}_"}{match};
|
481
|
+
my $ms = $$stats{"${key}_"}{mismatch};
|
482
|
+
if ( !$m ) { $m=0; }
|
483
|
+
if ( !$ms ) { $ms=0; }
|
484
|
+
printf "GS\t$key\t%d\t%d\t%.2f%%\n", $ms,$m,$m?$ms*100./($m+$ms):0;
|
485
|
+
$ndr_ms += $ms;
|
486
|
+
$ndr_m += $key eq 'hom_RR' ? 0 : $m;
|
487
|
+
}
|
488
|
+
printf
|
489
|
+
"SN\tNon-reference Discordance Rate (NDR):\t%.2f\n", $ndr_m+$ndr_ms ? $ndr_ms*100./($ndr_m+$ndr_ms) : 0;
|
490
|
+
}
|
491
|
+
|
492
|
+
sub read_stats
|
493
|
+
{
|
494
|
+
my ($stats,$file) = @_;
|
495
|
+
open(my $fh,'<',$file) or error("$file: $!");
|
496
|
+
while (my $line=<$fh>)
|
497
|
+
{
|
498
|
+
if ( $line=~/^#/ ) { next; }
|
499
|
+
my @items = split(/\t/,$line);
|
500
|
+
chomp($items[-1]);
|
501
|
+
if ( $items[0] eq 'DP' )
|
502
|
+
{
|
503
|
+
my $dp = $items[1];
|
504
|
+
$$stats{dp}{ndist}{$dp} += $items[2] + $items[3] + $items[4] + $items[5] + $items[6] + $items[7] + $items[8] + $items[9] + $items[10];
|
505
|
+
$$stats{dp}{RR}{RA}{$dp} += $items[5];
|
506
|
+
$$stats{dp}{n}{RR}{RA} += $items[5];
|
507
|
+
$$stats{dp}{RR}{AA}{$dp} += $items[6];
|
508
|
+
$$stats{dp}{n}{RR}{AA} += $items[6];
|
509
|
+
$$stats{dp}{RA}{RR}{$dp} += $items[7];
|
510
|
+
$$stats{dp}{n}{RA}{RR} += $items[7];
|
511
|
+
$$stats{dp}{RA}{AA}{$dp} += $items[8];
|
512
|
+
$$stats{dp}{n}{RA}{AA} += $items[8];
|
513
|
+
$$stats{dp}{AA}{RR}{$dp} += $items[9];
|
514
|
+
$$stats{dp}{n}{AA}{RR} += $items[9];
|
515
|
+
$$stats{dp}{AA}{RA}{$dp} += $items[10];
|
516
|
+
$$stats{dp}{n}{AA}{RA} += $items[10];
|
517
|
+
}
|
518
|
+
if ( $items[0] eq 'AF' )
|
519
|
+
{
|
520
|
+
my $af = $items[1];
|
521
|
+
$$stats{af}{RR}{$af}{matches} += $items[2];
|
522
|
+
$$stats{af}{RA}{$af}{matches} += $items[3];
|
523
|
+
$$stats{af}{AA}{$af}{matches} += $items[4];
|
524
|
+
$$stats{af}{RR}{$af}{mismatches} += $items[6];
|
525
|
+
$$stats{af}{RA}{$af}{mismatches} += $items[7];
|
526
|
+
$$stats{af}{AA}{$af}{mismatches} += $items[8];
|
527
|
+
}
|
528
|
+
if ( $items[0] eq 'GS' )
|
529
|
+
{
|
530
|
+
my $type = $items[1];
|
531
|
+
$$stats{gs}{$type.'_'}{mismatch} += $items[2];
|
532
|
+
$$stats{gs}{$type.'_'}{match} += $items[3];
|
533
|
+
}
|
534
|
+
}
|
535
|
+
close($fh);
|
536
|
+
}
|
537
|
+
|
538
|
+
sub plot_stats
|
539
|
+
{
|
540
|
+
my ($opts) = @_;
|
541
|
+
my $stats = {};
|
542
|
+
for my $file (@{$$opts{files}})
|
543
|
+
{
|
544
|
+
read_stats($stats,$file);
|
545
|
+
}
|
546
|
+
plot_dp($opts,$$stats{dp});
|
547
|
+
plot_af($opts,$$stats{af});
|
548
|
+
print_gs($opts,$$stats{gs});
|
549
|
+
}
|
550
|
+
|
551
|
+
sub plot
|
552
|
+
{
|
553
|
+
my ($file) = @_;
|
554
|
+
system("GDFONTPATH=/usr/share/fonts/truetype/ttf-dejavu/ gnuplot $file");
|
555
|
+
}
|
556
|
+
|
557
|
+
sub plot_dp
|
558
|
+
{
|
559
|
+
my ($opts,$stats) = @_;
|
560
|
+
|
561
|
+
my $out;
|
562
|
+
my @plots;
|
563
|
+
for my $agt (sort keys %$stats)
|
564
|
+
{
|
565
|
+
if ( $agt eq 'n' or $agt eq 'ndist' ) { next; }
|
566
|
+
for my $bgt (sort keys %{$$stats{$agt}})
|
567
|
+
{
|
568
|
+
if ( $bgt eq 'n' ) { next; }
|
569
|
+
for my $dp (sort {$a<=>$b} keys %{$$stats{$agt}{$bgt}})
|
570
|
+
{
|
571
|
+
$out .= $dp . "\t" . ($$stats{n}{$agt}{$bgt} ? $$stats{$agt}{$bgt}{$dp}*100./$$stats{n}{$agt}{$bgt} : 0) . "\n";
|
572
|
+
}
|
573
|
+
$out .= "end\n";
|
574
|
+
push @plots, qq["-" using 1:2 with linespoints pt 12 title "$agt -> $bgt"];
|
575
|
+
}
|
576
|
+
}
|
577
|
+
|
578
|
+
open(my $fh,'>',"$$opts{plot}-dp.gp") or error("$$opts{plot}-dp.gp: $!");
|
579
|
+
print $fh q[
|
580
|
+
set terminal png size 600,400 truecolor font "DejaVuSansMono,9"
|
581
|
+
set output "] . "$$opts{plot}-dp.png" . q["
|
582
|
+
set ylabel 'Fraction of GTs [%]'
|
583
|
+
set y2label 'Number of GTs total'
|
584
|
+
set y2tics
|
585
|
+
set ytics nomirror
|
586
|
+
set xlabel 'Depth'
|
587
|
+
set xrange [:20]
|
588
|
+
];
|
589
|
+
if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; }
|
590
|
+
print $fh "plot ", join(',',@plots), qq[, '-' using 1:2 axes x1y2 with lines lt 0 title "GTs total"\n];
|
591
|
+
print $fh $out;
|
592
|
+
for my $dp (sort {$a<=>$b} keys %{$$stats{ndist}})
|
593
|
+
{
|
594
|
+
print $fh "$dp\t$$stats{ndist}{$dp}\n";
|
595
|
+
}
|
596
|
+
print $fh "end\n";
|
597
|
+
close($fh);
|
598
|
+
|
599
|
+
plot("$$opts{plot}-dp.gp");
|
600
|
+
}
|
601
|
+
|
602
|
+
sub plot_af
|
603
|
+
{
|
604
|
+
my ($opts,$stats) = @_;
|
605
|
+
|
606
|
+
open(my $fh,'>',"$$opts{plot}-af.gp") or error("$$opts{plot}-af.gp: $!");
|
607
|
+
if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; }
|
608
|
+
print $fh q[
|
609
|
+
set terminal png size 550,400 truecolor font "DejaVuSansMono,9"
|
610
|
+
set output "] . "$$opts{plot}-af.png" . q["
|
611
|
+
set grid back lc rgb "#dddddd"
|
612
|
+
set xlabel "Non-reference allele frequency"
|
613
|
+
set ylabel "Concordance"
|
614
|
+
set y2label "Number of genotypes"
|
615
|
+
set yrange [0.0:1.0]
|
616
|
+
set y2tics
|
617
|
+
set key center
|
618
|
+
|
619
|
+
plot '-' axes x1y2 with lines lw 1 lc rgb "red" notitle, \
|
620
|
+
'-' axes x1y2 with lines lw 1 lc rgb "green" notitle, \
|
621
|
+
'-' axes x1y2 with lines lw 1 lc rgb "blue" notitle, \
|
622
|
+
'-' with points pt 20 lc rgb "red" title "HomRef", \
|
623
|
+
'-' with points pt 20 lc rgb "green" title "Het", \
|
624
|
+
'-' with points pt 20 lc rgb "blue" title "HomAlt"
|
625
|
+
];
|
626
|
+
|
627
|
+
for my $type qw(RR RA AA)
|
628
|
+
{
|
629
|
+
for my $af (sort {$a<=>$b} keys %{$$stats{$type}})
|
630
|
+
{
|
631
|
+
print $fh "$af\t" . ($$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches}) . "\n";
|
632
|
+
}
|
633
|
+
print $fh "end\n";
|
634
|
+
}
|
635
|
+
for my $type qw(RR RA AA)
|
636
|
+
{
|
637
|
+
for my $af (sort {$a<=>$b} keys %{$$stats{$type}})
|
638
|
+
{
|
639
|
+
my $n = $$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches};
|
640
|
+
print $fh "$af\t" . ($n ? 1-$$stats{$type}{$af}{mismatches}/$n : -1) . "\n";
|
641
|
+
}
|
642
|
+
print $fh "end\n";
|
643
|
+
}
|
644
|
+
close($fh);
|
645
|
+
|
646
|
+
plot("$$opts{plot}-af.gp");
|
647
|
+
}
|
648
|
+
|
649
|
+
sub do_region_stats
|
650
|
+
{
|
651
|
+
my ($opts,$vcfs) = @_;
|
652
|
+
|
653
|
+
my $refseq;
|
654
|
+
if ( $$opts{refseq} ) { $refseq = FaSlice->new(file=>$$opts{refseq}, size=>1_000_000); }
|
655
|
+
|
656
|
+
my $nvcfs = scalar @$vcfs;
|
657
|
+
my $debug = $$opts{debug} ? $$opts{debug} : 0;
|
658
|
+
my $match = $$opts{match};
|
659
|
+
my $win = $$opts{win} ? $$opts{win} : 0;
|
660
|
+
|
661
|
+
while (1)
|
662
|
+
{
|
663
|
+
my $grp = read_next_group($vcfs,$win);
|
664
|
+
if ( !$grp || !scalar @$grp ) { last }
|
665
|
+
|
666
|
+
if ( $debug>1 )
|
667
|
+
{
|
668
|
+
print "Group:\n";
|
669
|
+
for my $rec (@$grp) { print "$$rec{chr}\t$$rec{pos}\t$$rec{vcf}{file}\n"; }
|
670
|
+
print "\n";
|
671
|
+
}
|
672
|
+
|
673
|
+
my %files;
|
674
|
+
for my $rec (@$grp)
|
675
|
+
{
|
676
|
+
$files{$$rec{vcf}{file}} = 1;
|
677
|
+
}
|
678
|
+
my $key = join(' ',sort(keys %files));
|
679
|
+
$$match{$key}++;
|
680
|
+
|
681
|
+
my $npresent = scalar keys %files;
|
682
|
+
if ( $npresent == $nvcfs )
|
683
|
+
{
|
684
|
+
ref_alt_stats($opts,$grp);
|
685
|
+
}
|
686
|
+
|
687
|
+
if ( $npresent>1 && defined $refseq )
|
688
|
+
{
|
689
|
+
cmp_sequence($opts,$grp,$refseq);
|
690
|
+
}
|
691
|
+
|
692
|
+
if ( $$opts{cmp_genotypes} )
|
693
|
+
{
|
694
|
+
# Check that in the group there is one record for each file
|
695
|
+
if ( $npresent==$nvcfs && scalar @$grp==$nvcfs )
|
696
|
+
{
|
697
|
+
cmp_genotypes($opts,$grp);
|
698
|
+
}
|
699
|
+
}
|
700
|
+
}
|
701
|
+
}
|
702
|
+
|
703
|
+
sub cmp_sequence
|
704
|
+
{
|
705
|
+
my ($opts,$grp,$fa_refseq) = @_;
|
706
|
+
|
707
|
+
# Detailed comparison will be performed only if there are indels or complex
|
708
|
+
# substitutions, SNPs are interesting only in their presence. There can be
|
709
|
+
# more events from the same file present simultaneously and at multiple
|
710
|
+
# positions. They all are treated as separate variants and if any of them
|
711
|
+
# yields a haplotype present in all files, match is reported.
|
712
|
+
# Note that the original version of the code expected all alternate
|
713
|
+
# variants to be present on a single VCF line and was able to compare
|
714
|
+
# consecutive non-overlapping events as one sequence. However, because the
|
715
|
+
# the major producer of indel calls (Dindel) does report one variant per
|
716
|
+
# line, this idea was abandoned.
|
717
|
+
|
718
|
+
# Check if there are any interesting events.
|
719
|
+
my %has_indels;
|
720
|
+
my %events_per_file;
|
721
|
+
my $vcf = $$grp[0]{vcf};
|
722
|
+
for (my $igrp=0; $igrp<@$grp; $igrp++)
|
723
|
+
{
|
724
|
+
my $rec = $$grp[$igrp];
|
725
|
+
my $ifile = $$rec{vcf}{vcf_compare_ID};
|
726
|
+
|
727
|
+
my $ref_len = length($$rec{ref});
|
728
|
+
my @alts = split(/,/,$$rec{alt});
|
729
|
+
for my $alt (@alts)
|
730
|
+
{
|
731
|
+
if ( $alt eq '.' ) { next; }
|
732
|
+
if ( $alt=~/^</ ) { next; }
|
733
|
+
my $alt_len = length($alt);
|
734
|
+
push @{$events_per_file{$ifile}}, { pos=>$$rec{pos}, alt=>$alt, ref_len=>$ref_len };
|
735
|
+
|
736
|
+
# Do complex checking of event type only if it is still not certain if this is waste of time or not
|
737
|
+
if ( exists($has_indels{$ifile}) ) { next; }
|
738
|
+
|
739
|
+
if ( $ref_len!=$alt_len ) { $has_indels{$ifile} = $$rec{vcf}{file}; }
|
740
|
+
elsif ( $ref_len>1 )
|
741
|
+
{
|
742
|
+
my ($type,$len,$ht) = $vcf->event_type($$rec{ref},$alt);
|
743
|
+
if ( $type eq 'o' ) { $has_indels{$ifile} = $$rec{vcf}{file}; }
|
744
|
+
}
|
745
|
+
}
|
746
|
+
}
|
747
|
+
|
748
|
+
# Return if there is nothing interesting
|
749
|
+
if ( scalar keys %has_indels < 2 ) { return; }
|
750
|
+
|
751
|
+
for my $ifile (keys %events_per_file)
|
752
|
+
{
|
753
|
+
if ( !exists($has_indels{$ifile}) ) { delete($events_per_file{$ifile}); }
|
754
|
+
}
|
755
|
+
|
756
|
+
# Cache the reference sequence chunk
|
757
|
+
my $ref_from = $$grp[0]{pos} - $$opts{win};
|
758
|
+
my $ref_to = $$grp[-1]{pos} + $$opts{win};
|
759
|
+
my $refseq = $fa_refseq->get_slice($$grp[0]{chr},$ref_from,$ref_to);
|
760
|
+
|
761
|
+
# For each file get all possible sequences
|
762
|
+
for my $events (values %events_per_file)
|
763
|
+
{
|
764
|
+
for my $variant (@$events)
|
765
|
+
{
|
766
|
+
my $pos = $$variant{pos};
|
767
|
+
my $len = $pos - $ref_from;
|
768
|
+
my $seq = $len>0 ? substr($refseq,0,$len) : '';
|
769
|
+
$seq .= $$variant{alt};
|
770
|
+
|
771
|
+
$pos += $$variant{ref_len};
|
772
|
+
if ( $pos<=$ref_to )
|
773
|
+
{
|
774
|
+
$seq .= substr($refseq,$pos-$ref_from);
|
775
|
+
}
|
776
|
+
|
777
|
+
$$variant{seq} = $seq;
|
778
|
+
$$variant{length} = length($seq);
|
779
|
+
}
|
780
|
+
}
|
781
|
+
|
782
|
+
# Now compare the variants: is there a sequence shared across all files?
|
783
|
+
my $match = 1;
|
784
|
+
my @keys = keys %events_per_file;
|
785
|
+
for (my $ikey=0; $ikey<@keys; $ikey++)
|
786
|
+
{
|
787
|
+
my $ivars = $events_per_file{$ikey};
|
788
|
+
for (my $jkey=0; $jkey<$ikey; $jkey++)
|
789
|
+
{
|
790
|
+
my $jvars = $events_per_file{$jkey};
|
791
|
+
my $found = 0;
|
792
|
+
for my $ivar (@$ivars)
|
793
|
+
{
|
794
|
+
for my $jvar (@$jvars)
|
795
|
+
{
|
796
|
+
if ( $$ivar{length} != $$jvar{length} ) { next; }
|
797
|
+
if ( $$ivar{seq} ne $$jvar{seq} ) { next; }
|
798
|
+
$found=1;
|
799
|
+
last;
|
800
|
+
}
|
801
|
+
}
|
802
|
+
if ( !$found ) { $match=0; last; }
|
803
|
+
}
|
804
|
+
if ( !$match ) { last; }
|
805
|
+
}
|
806
|
+
|
807
|
+
my $key = join(' ',sort(values %has_indels));
|
808
|
+
if ( $match )
|
809
|
+
{
|
810
|
+
$$opts{indels}{$key}{match}++;
|
811
|
+
}
|
812
|
+
else
|
813
|
+
{
|
814
|
+
$$opts{indels}{$key}{mismatch}++;
|
815
|
+
}
|
816
|
+
}
|
817
|
+
|
818
|
+
sub ref_alt_stats
|
819
|
+
{
|
820
|
+
my ($opts,$grp) = @_;
|
821
|
+
|
822
|
+
my $ref = $$grp[0]{ref};
|
823
|
+
my $alt = join(',',sort split(/,/,$$grp[0]{alt}));
|
824
|
+
|
825
|
+
my $alt_mismatch = 0;
|
826
|
+
for (my $i=1; $i<@$grp; $i++)
|
827
|
+
{
|
828
|
+
my $rec = $$grp[$i];
|
829
|
+
|
830
|
+
if ( $ref ne $$rec{ref} )
|
831
|
+
{
|
832
|
+
$$opts{ref_mismatch}++;
|
833
|
+
return;
|
834
|
+
}
|
835
|
+
|
836
|
+
my $tmp = join(',',sort split(/,/,$$rec{alt}));
|
837
|
+
if ( $alt ne $tmp )
|
838
|
+
{
|
839
|
+
$alt_mismatch = 1;
|
840
|
+
}
|
841
|
+
}
|
842
|
+
if ( $alt ne '.' )
|
843
|
+
{
|
844
|
+
if ( $alt_mismatch ) { $$opts{alt_mismatch}++; }
|
845
|
+
else { $$opts{alt_match}++; }
|
846
|
+
}
|
847
|
+
$$opts{ref_match}++;
|
848
|
+
}
|
849
|
+
|
850
|
+
|
851
|
+
sub snp_type
|
852
|
+
{
|
853
|
+
my ($als,$ref) = @_;
|
854
|
+
|
855
|
+
# Determine SNP type: hom(RR),het(RA),hom(AA) or het(AA)
|
856
|
+
if ( $$als[0] eq $$als[1] )
|
857
|
+
{
|
858
|
+
if ( $$als[0] eq $ref ) { return 'hom_RR_'; }
|
859
|
+
else { return 'hom_AA_'; }
|
860
|
+
}
|
861
|
+
else
|
862
|
+
{
|
863
|
+
if ( $$als[0] eq $ref or $$als[1] eq $ref ) { return 'het_RA_'; }
|
864
|
+
else { return 'het_AA_'; }
|
865
|
+
}
|
866
|
+
}
|
867
|
+
|
868
|
+
sub cmp_genotypes
|
869
|
+
{
|
870
|
+
my ($opts,$grp) = @_;
|
871
|
+
my $nrecs = @$grp;
|
872
|
+
my $hapls = $$opts{hapls};
|
873
|
+
|
874
|
+
# Break the VCF lines into hashes (required by parse_haplotype)
|
875
|
+
for my $grp_rec (@$grp)
|
876
|
+
{
|
877
|
+
$$grp_rec{rec} = $$grp_rec{vcf}->next_data_hash($$grp_rec{line});
|
878
|
+
if ( $$opts{ignore_indels} && exists($$grp_rec{rec}{INFO}{INDEL}) ) { return; }
|
879
|
+
if ( exists($$grp_rec{vcf}{_col_mapping}) )
|
880
|
+
{
|
881
|
+
my %new_cols;
|
882
|
+
while (my ($name_ori,$name_new) = each %{$$grp_rec{vcf}{_col_mapping}})
|
883
|
+
{
|
884
|
+
$new_cols{$name_new} = $$grp_rec{rec}{gtypes}{$name_ori};
|
885
|
+
delete($$grp_rec{rec}{gtypes}{$name_ori});
|
886
|
+
}
|
887
|
+
while (my ($name,$hash) = each %new_cols)
|
888
|
+
{
|
889
|
+
$$grp_rec{rec}{gtypes}{$name} = $hash;
|
890
|
+
}
|
891
|
+
}
|
892
|
+
}
|
893
|
+
if ( $$grp[0]{vcf}{vcf_compare_ID} != 0 ) { error("FIXME: different order than expected: $$grp[0]{vcf}{vcf_compare_ID}\n"); }
|
894
|
+
my $ref = $$grp[0]{rec}{REF};
|
895
|
+
|
896
|
+
my %gtype_matches = ();
|
897
|
+
my %gtype_mismatches = ();
|
898
|
+
|
899
|
+
my $min_dp;
|
900
|
+
my $ndp3 = 0;
|
901
|
+
for my $id (keys %{$$grp[0]{rec}{gtypes}})
|
902
|
+
{
|
903
|
+
my (@sorted_als1,$nploid,$type);
|
904
|
+
|
905
|
+
my ($als1,$seps1,$is_phased1,$is_empty1) = $$grp[0]{vcf}->parse_haplotype($$grp[0]{rec},$id);
|
906
|
+
if ( !$is_empty1 )
|
907
|
+
{
|
908
|
+
@sorted_als1 = sort @$als1;
|
909
|
+
$nploid = scalar @sorted_als1;
|
910
|
+
$type = snp_type($als1,$ref);
|
911
|
+
}
|
912
|
+
|
913
|
+
if ( exists($$opts{high_confidence_gls}) )
|
914
|
+
{
|
915
|
+
my @gls = split(/,/,$$grp[1]{rec}{gtypes}{$id}{GL});
|
916
|
+
if ( @gls!=3 or $gls[0] eq '.' ) { next; }
|
917
|
+
@gls = sort {$b<=>$a} @gls;
|
918
|
+
if ( abs($gls[0]-$gls[1])<$$opts{high_confidence_gls} ) { next; }
|
919
|
+
}
|
920
|
+
|
921
|
+
# There may be multiple files entering the comparison. Report match only if all are present and all match.
|
922
|
+
# Report mismatch if all are present and they do not match. Otherwise report lost/gained event.
|
923
|
+
my $phase_match = 1;
|
924
|
+
my $gtype_match = 1;
|
925
|
+
my $gtype_lost = 0;
|
926
|
+
my $gtype_gained = 0;
|
927
|
+
my $phase_lost = 0;
|
928
|
+
my $phase_gained = 0;
|
929
|
+
my $type2;
|
930
|
+
for (my $i=1; $i<$nrecs; $i++)
|
931
|
+
{
|
932
|
+
my ($als2,$seps2,$is_phased2,$is_empty2) = $$grp[$i]{vcf}->parse_haplotype($$grp[$i]{rec},$id);
|
933
|
+
if ( $is_empty1 )
|
934
|
+
{
|
935
|
+
$gtype_match = 0;
|
936
|
+
if ( !$is_empty2 )
|
937
|
+
{
|
938
|
+
$gtype_gained = 1;
|
939
|
+
$type = snp_type($als2,$ref);
|
940
|
+
}
|
941
|
+
if ( !$is_phased1 && $is_phased2 ) { $phase_gained = 1; }
|
942
|
+
last;
|
943
|
+
}
|
944
|
+
elsif ( $is_empty2 )
|
945
|
+
{
|
946
|
+
$gtype_match = 0;
|
947
|
+
$gtype_lost = 1;
|
948
|
+
last;
|
949
|
+
}
|
950
|
+
if ( $is_phased1 )
|
951
|
+
{
|
952
|
+
if ( !$is_phased2 )
|
953
|
+
{
|
954
|
+
$phase_lost = 1;
|
955
|
+
$phase_match = 0;
|
956
|
+
}
|
957
|
+
}
|
958
|
+
elsif ( $is_phased2 )
|
959
|
+
{
|
960
|
+
$phase_gained = 1;
|
961
|
+
$phase_match = 0;
|
962
|
+
}
|
963
|
+
else { $phase_match = 0; }
|
964
|
+
|
965
|
+
# Consider different number of alleles as mismatch (C vs C/C)
|
966
|
+
if ( scalar @$als1 != scalar @$als2 )
|
967
|
+
{
|
968
|
+
$gtype_match = 0;
|
969
|
+
if ( $$opts{debug} ) { $$opts{mismatch_types}{$type}{'Allele_Count'}++ }
|
970
|
+
last;
|
971
|
+
}
|
972
|
+
|
973
|
+
my @sorted_als2 = sort @$als2;
|
974
|
+
for (my $ial=0; $ial<$nploid; $ial++)
|
975
|
+
{
|
976
|
+
if ( $sorted_als1[$ial] ne $sorted_als2[$ial] )
|
977
|
+
{
|
978
|
+
$gtype_match = 0;
|
979
|
+
if ( $$opts{debug} )
|
980
|
+
{
|
981
|
+
my $type2 = snp_type($als2,$ref);
|
982
|
+
$$opts{mismatch_types}{$type}{$type2}++;
|
983
|
+
}
|
984
|
+
last;
|
985
|
+
}
|
986
|
+
}
|
987
|
+
|
988
|
+
if ( !$gtype_match )
|
989
|
+
{
|
990
|
+
if ( !defined $type2 && !$is_empty2 )
|
991
|
+
{
|
992
|
+
$type2 = snp_type($als2,$ref);
|
993
|
+
}
|
994
|
+
last;
|
995
|
+
}
|
996
|
+
|
997
|
+
# They match, check also if their phase agrees
|
998
|
+
if ( $phase_match && $is_phased1 && $is_phased2 )
|
999
|
+
{
|
1000
|
+
for (my $ial=0; $ial<$nploid; $ial++)
|
1001
|
+
{
|
1002
|
+
if ( $$als1[$ial] ne $$als2[$ial] ) { $phase_match=0; last; }
|
1003
|
+
}
|
1004
|
+
}
|
1005
|
+
}
|
1006
|
+
if ( $gtype_gained )
|
1007
|
+
{
|
1008
|
+
$$hapls{$id}{$type.'gtype_gained'}++;
|
1009
|
+
if ( $phase_gained ) { $$hapls{$id}{phased_gtype_gained}++ }
|
1010
|
+
next;
|
1011
|
+
}
|
1012
|
+
if ( $gtype_lost ) { $$hapls{$id}{$type.'gtype_lost'}++; next; }
|
1013
|
+
|
1014
|
+
if ( $phase_gained ) { $$hapls{$id}{phase_gained}++ }
|
1015
|
+
elsif ( $phase_lost ) { $$hapls{$id}{$type.'phase_lost'}++ }
|
1016
|
+
|
1017
|
+
my $dp = exists($$grp[1]{rec}{gtypes}{$id}{DP}) ? $$grp[1]{rec}{gtypes}{$id}{DP} : -1;
|
1018
|
+
if ( $gtype_match )
|
1019
|
+
{
|
1020
|
+
$$hapls{$id}{$type.'gtype_match'}++;
|
1021
|
+
if ( $phase_match ) { $$hapls{$id}{$type.'phase_match'}++ }
|
1022
|
+
$gtype_matches{$type}++;
|
1023
|
+
$$opts{counts_by_dp}{$dp}{$type.'-'.$type}++;
|
1024
|
+
}
|
1025
|
+
elsif ( defined $type )
|
1026
|
+
{
|
1027
|
+
$$hapls{$id}{$type.'gtype_mismatch'}++;
|
1028
|
+
$gtype_mismatches{$type}++;
|
1029
|
+
$$opts{counts_by_dp}{$dp}{$type.'-'.$type2}++;
|
1030
|
+
}
|
1031
|
+
}
|
1032
|
+
$$opts{hapls_ncmp}++;
|
1033
|
+
|
1034
|
+
# Store the number of matching types by AC
|
1035
|
+
my $af;
|
1036
|
+
if ( $$opts{INFO_AF1_af} && exists($$grp[1]{rec}{INFO}{AF1}) )
|
1037
|
+
{
|
1038
|
+
$af = sprintf "%.2f", $$grp[1]{rec}{INFO}{AF1};
|
1039
|
+
}
|
1040
|
+
elsif ( !$$opts{all_samples_af} )
|
1041
|
+
{
|
1042
|
+
my $ac = 0;
|
1043
|
+
my $an = 0;
|
1044
|
+
if ( exists($gtype_matches{hom_AA_}) )
|
1045
|
+
{
|
1046
|
+
$ac += 2*$gtype_matches{hom_AA_};
|
1047
|
+
$an += 2*$gtype_matches{hom_AA_};
|
1048
|
+
}
|
1049
|
+
if ( exists($gtype_mismatches{hom_AA_}) )
|
1050
|
+
{
|
1051
|
+
$ac += 2*$gtype_mismatches{hom_AA_};
|
1052
|
+
$an += 2*$gtype_mismatches{hom_AA_};
|
1053
|
+
}
|
1054
|
+
if ( exists($gtype_matches{het_RA_}) )
|
1055
|
+
{
|
1056
|
+
$ac += $gtype_matches{het_RA_};
|
1057
|
+
$an += 2*$gtype_matches{het_RA_};
|
1058
|
+
}
|
1059
|
+
if ( exists($gtype_mismatches{het_RA_}) )
|
1060
|
+
{
|
1061
|
+
$ac += $gtype_mismatches{het_RA_};
|
1062
|
+
$an += 2*$gtype_mismatches{het_RA_};
|
1063
|
+
}
|
1064
|
+
if ( exists($gtype_matches{hom_RR_}) ) { $an += 2*$gtype_matches{hom_RR_}; }
|
1065
|
+
if ( exists($gtype_mismatches{hom_RR_}) ) { $an += 2*$gtype_mismatches{hom_RR_}; }
|
1066
|
+
$af = sprintf "%.2f", $an>0 ? $ac/$an : 0;
|
1067
|
+
}
|
1068
|
+
else
|
1069
|
+
{
|
1070
|
+
my ($an,$ac) = $$grp[0]{vcf}->calc_an_ac($$grp[0]{rec}{gtypes});
|
1071
|
+
$af = sprintf "%.2f", $an>0 ? $ac/$an : 0;
|
1072
|
+
}
|
1073
|
+
|
1074
|
+
for my $type (keys %gtype_matches)
|
1075
|
+
{
|
1076
|
+
$$opts{counts_by_af}{$af}{$type}{matches} += $gtype_matches{$type};
|
1077
|
+
$$opts{gtypes_cmp_total} += $gtype_matches{$type};
|
1078
|
+
}
|
1079
|
+
for my $type (keys %gtype_mismatches)
|
1080
|
+
{
|
1081
|
+
$$opts{counts_by_af}{$af}{$type}{mismatches} += $gtype_mismatches{$type};
|
1082
|
+
$$opts{gtypes_cmp_total} += $gtype_mismatches{$type};
|
1083
|
+
}
|
1084
|
+
|
1085
|
+
if ( $$opts{debug} )
|
1086
|
+
{
|
1087
|
+
my $match = '?';
|
1088
|
+
if ( scalar keys %gtype_mismatches ) { $match = '-' }
|
1089
|
+
elsif ( scalar keys %gtype_matches ) { $match = '+' }
|
1090
|
+
my $hom_rr_mm = $gtype_mismatches{hom_RR_} ? $gtype_mismatches{hom_RR_} : 0;
|
1091
|
+
my $het_ra_mm = $gtype_mismatches{het_RA_} ? $gtype_mismatches{het_RA_} : 0;
|
1092
|
+
my $hom_aa_mm = $gtype_mismatches{hom_AA_} ? $gtype_mismatches{hom_AA_} : 0;
|
1093
|
+
my $hom_rr_m = $gtype_matches{hom_RR_} ? $gtype_matches{hom_RR_} : 0;
|
1094
|
+
my $het_ra_m = $gtype_matches{het_RA_} ? $gtype_matches{het_RA_} : 0;
|
1095
|
+
my $hom_aa_m = $gtype_matches{hom_AA_} ? $gtype_matches{hom_AA_} : 0;
|
1096
|
+
my $hom_rr_c = sprintf "%.2f", ($hom_rr_mm or $hom_rr_m) ? $hom_rr_m*1./($hom_rr_m + $hom_rr_mm) : -1;
|
1097
|
+
my $het_ra_c = sprintf "%.2f", ($het_ra_mm or $het_ra_m) ? $het_ra_m*1./($het_ra_m + $het_ra_mm) : -1;
|
1098
|
+
my $hom_aa_c = sprintf "%.2f", ($hom_aa_mm or $hom_aa_m) ? $hom_aa_m*1./($hom_aa_m + $hom_aa_mm) : -1;
|
1099
|
+
print "SD\t$$grp[0]{rec}{CHROM}\t$$grp[0]{rec}{POS}\t$match\t$hom_rr_mm\t$het_ra_mm\t$hom_aa_mm\t$hom_rr_c\t$het_ra_c\t$hom_aa_c\n";
|
1100
|
+
}
|
1101
|
+
}
|
1102
|
+
|
1103
|
+
|
1104
|
+
sub read_next_group
|
1105
|
+
{
|
1106
|
+
my ($vcfs,$win) = @_;
|
1107
|
+
|
1108
|
+
my @grp;
|
1109
|
+
my $prev_vcf;
|
1110
|
+
my $start;
|
1111
|
+
|
1112
|
+
while (1)
|
1113
|
+
{
|
1114
|
+
my $min_vcf = get_min_position($vcfs);
|
1115
|
+
if ( !$min_vcf ) { last; }
|
1116
|
+
if ( $prev_vcf && $prev_vcf eq $$min_vcf{buf}[0] ) { last; }
|
1117
|
+
$prev_vcf = $$min_vcf{buf}[0];
|
1118
|
+
|
1119
|
+
if ( !$start or $start+$win >= $$min_vcf{buf}[0]{pos} )
|
1120
|
+
{
|
1121
|
+
my $rec = shift(@{$$min_vcf{buf}});
|
1122
|
+
push @grp,$rec;
|
1123
|
+
|
1124
|
+
$start = $$rec{pos};
|
1125
|
+
next;
|
1126
|
+
}
|
1127
|
+
}
|
1128
|
+
return \@grp;
|
1129
|
+
}
|
1130
|
+
|
1131
|
+
sub get_min_position
|
1132
|
+
{
|
1133
|
+
my ($vcfs) = @_;
|
1134
|
+
|
1135
|
+
my ($min_pos,$min_vcf);
|
1136
|
+
for my $vcf (@$vcfs)
|
1137
|
+
{
|
1138
|
+
# Check if there is a line in the buffer, if not, read. If still empty, the file reached eof
|
1139
|
+
if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { read_line($vcf); }
|
1140
|
+
if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { next; }
|
1141
|
+
|
1142
|
+
my $line = $$vcf{buf}[0];
|
1143
|
+
|
1144
|
+
# Designate this position as the minimum of all the files if:
|
1145
|
+
# .. is this the first file?
|
1146
|
+
if ( !$min_pos )
|
1147
|
+
{
|
1148
|
+
$min_pos = $$line{pos};
|
1149
|
+
$min_vcf = $vcf;
|
1150
|
+
next;
|
1151
|
+
}
|
1152
|
+
|
1153
|
+
# .. has this file lower position?
|
1154
|
+
if ( $min_pos>$$line{pos} )
|
1155
|
+
{
|
1156
|
+
$min_pos = $$line{pos};
|
1157
|
+
$min_vcf = $vcf;
|
1158
|
+
next;
|
1159
|
+
}
|
1160
|
+
}
|
1161
|
+
return $min_vcf;
|
1162
|
+
}
|
1163
|
+
|
1164
|
+
sub read_line
|
1165
|
+
{
|
1166
|
+
my ($vcf) = @_;
|
1167
|
+
|
1168
|
+
if ( $$vcf{eof} ) { return; }
|
1169
|
+
|
1170
|
+
my $line = $vcf->next_line();
|
1171
|
+
if ( !$line )
|
1172
|
+
{
|
1173
|
+
$$vcf{eof} = 1;
|
1174
|
+
return;
|
1175
|
+
}
|
1176
|
+
|
1177
|
+
$$vcf{nread}++;
|
1178
|
+
|
1179
|
+
if ( !($line=~/^(\S+)\t(\S+)\t\S+\t(\S+)\t(\S+)/) ) { error("Could not parse the line: [$line]\n"); }
|
1180
|
+
my $chr = $1;
|
1181
|
+
my $pos = $2;
|
1182
|
+
my $ref = uc($3);
|
1183
|
+
my $alt = uc($4);
|
1184
|
+
if ( $$vcf{buf} && @{$$vcf{buf}} )
|
1185
|
+
{
|
1186
|
+
my $prev = $$vcf{buf}[-1];
|
1187
|
+
if ( $$prev{pos} == $pos ) { warn("Position $chr:$pos appeared twice in $$vcf{file}\n"); }
|
1188
|
+
}
|
1189
|
+
|
1190
|
+
push @{$$vcf{buf}}, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line, vcf=>$vcf };
|
1191
|
+
return;
|
1192
|
+
}
|
1193
|
+
|