ngs_server 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/ngs_server +72 -50
- data/ext/bamtools/extconf.rb +3 -3
- data/ext/vcftools/Makefile +28 -0
- data/ext/vcftools/README.txt +36 -0
- data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
- data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
- data/ext/vcftools/cpp/.svn/entries +708 -0
- data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
- data/ext/vcftools/cpp/Makefile +46 -0
- data/ext/vcftools/cpp/dgeev.cpp +146 -0
- data/ext/vcftools/cpp/dgeev.h +43 -0
- data/ext/vcftools/cpp/output_log.cpp +79 -0
- data/ext/vcftools/cpp/output_log.h +34 -0
- data/ext/vcftools/cpp/parameters.cpp +535 -0
- data/ext/vcftools/cpp/parameters.h +154 -0
- data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
- data/ext/vcftools/cpp/vcf_entry.h +190 -0
- data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
- data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
- data/ext/vcftools/cpp/vcf_file.cpp +495 -0
- data/ext/vcftools/cpp/vcf_file.h +184 -0
- data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
- data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
- data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
- data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
- data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
- data/ext/vcftools/cpp/vcftools.cpp +107 -0
- data/ext/vcftools/cpp/vcftools.h +25 -0
- data/ext/vcftools/examples/.svn/all-wcprops +185 -0
- data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
- data/ext/vcftools/examples/.svn/entries +1048 -0
- data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
- data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
- data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
- data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
- data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
- data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
- data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
- data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
- data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
- data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
- data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
- data/ext/vcftools/examples/annotate-test.vcf +37 -0
- data/ext/vcftools/examples/annotate.out +23 -0
- data/ext/vcftools/examples/annotate.txt +7 -0
- data/ext/vcftools/examples/annotate2.out +52 -0
- data/ext/vcftools/examples/annotate3.out +23 -0
- data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
- data/ext/vcftools/examples/cmp-test.out +53 -0
- data/ext/vcftools/examples/concat-a.vcf +21 -0
- data/ext/vcftools/examples/concat-b.vcf +13 -0
- data/ext/vcftools/examples/concat-c.vcf +19 -0
- data/ext/vcftools/examples/concat.out +39 -0
- data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
- data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
- data/ext/vcftools/examples/merge-test-a.vcf +17 -0
- data/ext/vcftools/examples/merge-test-b.vcf +17 -0
- data/ext/vcftools/examples/merge-test-c.vcf +15 -0
- data/ext/vcftools/examples/merge-test.vcf.out +31 -0
- data/ext/vcftools/examples/perl-api-1.pl +46 -0
- data/ext/vcftools/examples/query-test.out +6 -0
- data/ext/vcftools/examples/shuffle-test.vcf +12 -0
- data/ext/vcftools/examples/subset.SNPs.out +10 -0
- data/ext/vcftools/examples/subset.indels.out +18 -0
- data/ext/vcftools/examples/subset.vcf +21 -0
- data/ext/vcftools/examples/valid-3.3.vcf +30 -0
- data/ext/vcftools/examples/valid-4.0.vcf +34 -0
- data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
- data/ext/vcftools/examples/valid-4.1.vcf +37 -0
- data/ext/vcftools/extconf.rb +2 -0
- data/ext/vcftools/perl/.svn/all-wcprops +149 -0
- data/ext/vcftools/perl/.svn/entries +844 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
- data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
- data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
- data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
- data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
- data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
- data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
- data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
- data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
- data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
- data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
- data/ext/vcftools/perl/ChangeLog +84 -0
- data/ext/vcftools/perl/FaSlice.pm +214 -0
- data/ext/vcftools/perl/Makefile +12 -0
- data/ext/vcftools/perl/Vcf.pm +2853 -0
- data/ext/vcftools/perl/VcfStats.pm +681 -0
- data/ext/vcftools/perl/fill-aa +103 -0
- data/ext/vcftools/perl/fill-an-ac +56 -0
- data/ext/vcftools/perl/fill-ref-md5 +204 -0
- data/ext/vcftools/perl/tab-to-vcf +92 -0
- data/ext/vcftools/perl/test.t +376 -0
- data/ext/vcftools/perl/vcf-annotate +1099 -0
- data/ext/vcftools/perl/vcf-compare +1193 -0
- data/ext/vcftools/perl/vcf-concat +310 -0
- data/ext/vcftools/perl/vcf-convert +180 -0
- data/ext/vcftools/perl/vcf-fix-newlines +97 -0
- data/ext/vcftools/perl/vcf-isec +660 -0
- data/ext/vcftools/perl/vcf-merge +577 -0
- data/ext/vcftools/perl/vcf-query +286 -0
- data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
- data/ext/vcftools/perl/vcf-sort +79 -0
- data/ext/vcftools/perl/vcf-stats +160 -0
- data/ext/vcftools/perl/vcf-subset +206 -0
- data/ext/vcftools/perl/vcf-to-tab +112 -0
- data/ext/vcftools/perl/vcf-validator +145 -0
- data/ext/vcftools/website/.svn/all-wcprops +41 -0
- data/ext/vcftools/website/.svn/entries +238 -0
- data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
- data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
- data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
- data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
- data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
- data/ext/vcftools/website/Makefile +6 -0
- data/ext/vcftools/website/README +2 -0
- data/ext/vcftools/website/VCF-poster.pdf +0 -0
- data/ext/vcftools/website/default.css +250 -0
- data/ext/vcftools/website/favicon.ico +0 -0
- data/ext/vcftools/website/favicon.png +0 -0
- data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/img/.svn/entries +300 -0
- data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
- data/ext/vcftools/website/img/bg.gif +0 -0
- data/ext/vcftools/website/img/bgcode.gif +0 -0
- data/ext/vcftools/website/img/bgcontainer.gif +0 -0
- data/ext/vcftools/website/img/bgul.gif +0 -0
- data/ext/vcftools/website/img/header.gif +0 -0
- data/ext/vcftools/website/img/li.gif +0 -0
- data/ext/vcftools/website/img/quote.gif +0 -0
- data/ext/vcftools/website/img/search.gif +0 -0
- data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/src/.svn/entries +300 -0
- data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
- data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
- data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
- data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
- data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
- data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
- data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
- data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
- data/ext/vcftools/website/src/docs.inc +202 -0
- data/ext/vcftools/website/src/index.inc +52 -0
- data/ext/vcftools/website/src/index.php +80 -0
- data/ext/vcftools/website/src/license.inc +27 -0
- data/ext/vcftools/website/src/links.inc +13 -0
- data/ext/vcftools/website/src/options.inc +654 -0
- data/ext/vcftools/website/src/perl_module.inc +249 -0
- data/ext/vcftools/website/src/specs.inc +18 -0
- data/lib/config.ru +9 -0
- data/lib/ngs_server/add.rb +9 -0
- data/lib/ngs_server/version.rb +1 -1
- data/lib/ngs_server.rb +55 -3
- data/ngs_server.gemspec +5 -2
- metadata +296 -6
@@ -0,0 +1,1193 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# Author: petr.danecek@sanger
|
4
|
+
#
|
5
|
+
|
6
|
+
use strict;
|
7
|
+
use warnings;
|
8
|
+
use Carp;
|
9
|
+
use Vcf;
|
10
|
+
use FaSlice;
|
11
|
+
|
12
|
+
my $opts = parse_params();
|
13
|
+
if ( exists($$opts{plot}) )
|
14
|
+
{
|
15
|
+
plot_stats($opts);
|
16
|
+
}
|
17
|
+
else
|
18
|
+
{
|
19
|
+
compare_vcfs($opts);
|
20
|
+
}
|
21
|
+
|
22
|
+
exit;
|
23
|
+
|
24
|
+
#--------------------------------
|
25
|
+
|
26
|
+
sub error
|
27
|
+
{
|
28
|
+
my (@msg) = @_;
|
29
|
+
if ( scalar @msg )
|
30
|
+
{
|
31
|
+
croak @msg;
|
32
|
+
}
|
33
|
+
die
|
34
|
+
"About: Compare bgzipped and tabix indexed VCF files. (E.g. bgzip file.vcf; tabix -p vcf file.vcf.gz)\n",
|
35
|
+
"Usage: vcf-compare [OPTIONS] file1.vcf file2.vcf ...\n",
|
36
|
+
" vcf-compare -p plots chr1.cmp chr2.cmp ...\n",
|
37
|
+
"Options:\n",
|
38
|
+
" -c, --chromosomes <list|file> Same as -r, left for backward compatibility. Please do not use as it will be dropped in the future.\n",
|
39
|
+
" -d, --debug Debugging information. Giving the option multiple times increases verbosity\n",
|
40
|
+
" -g, --cmp-genotypes Compare genotypes, not only positions\n",
|
41
|
+
" -m, --name-mapping <list|file> Use with -g when comparing files with differing column names. The argument to this options is a\n",
|
42
|
+
" comma-separated list or one mapping per line in a file. The names are colon separated and must\n",
|
43
|
+
" appear in the same order as the files on the command line.\n",
|
44
|
+
" -p, --plot <prefix> Create plots. Multiple files (e.g. per-chromosome outputs from vcf-compare) can be given.\n",
|
45
|
+
" -R, --refseq <file> Compare the actual sequence, not just positions. Use with -w to compare indels.\n",
|
46
|
+
" -r, --regions <list|file> Process the given regions (comma-separated list or one region per line in a file).\n",
|
47
|
+
" -s, --samples <list|file> Process only the listed samples. Excluding unwanted samples may increase performance considerably.\n",
|
48
|
+
" -t, --title <string> Title for graphs (see also -p)\n",
|
49
|
+
" -w, --win <int> In repetitive sequences, the same indel can be called at different positions. Consider\n",
|
50
|
+
" records this far apart as matching (be it a SNP or an indel).\n",
|
51
|
+
" -h, -?, --help This help message.\n",
|
52
|
+
"\n";
|
53
|
+
}
|
54
|
+
|
55
|
+
|
56
|
+
sub parse_params
|
57
|
+
{
|
58
|
+
my $opts = { positions=>0 };
|
59
|
+
while (my $arg=shift(@ARGV))
|
60
|
+
{
|
61
|
+
if ( $arg eq '--all-samples-af' ) { $$opts{all_samples_af}=1; next; }
|
62
|
+
if ( $arg eq '--INFO/AF1-af' ) { $$opts{INFO_AF1_af}=1; next; }
|
63
|
+
if ( $arg eq '--ignore-indels' ) { $$opts{ignore_indels}=1; next; }
|
64
|
+
if ( $arg eq '--high-conf-gls' ) { $$opts{high_confidence_gls}=shift(@ARGV); next; }
|
65
|
+
if ( $arg eq '-m' || $arg eq '--name-mapping' ) { $$opts{mappings_list}=shift(@ARGV); next; }
|
66
|
+
if ( $arg eq '-R' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; }
|
67
|
+
if ( $arg eq '-c' || $arg eq '--chromosomes' ) { $$opts{regions_list}=shift(@ARGV); next; }
|
68
|
+
if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{regions_list}=shift(@ARGV); next; }
|
69
|
+
if ( $arg eq '-g' || $arg eq '--cmp-genotypes' ) { $$opts{cmp_genotypes}=1; next; }
|
70
|
+
if ( $arg eq '-s' || $arg eq '--samples' )
|
71
|
+
{
|
72
|
+
my $samples = shift(@ARGV);
|
73
|
+
my @samples = ( -e $samples ) ? read_list($samples) : split(/,/,$samples);
|
74
|
+
$$opts{samples} = \@samples;
|
75
|
+
next;
|
76
|
+
}
|
77
|
+
if ( $arg eq '-d' || $arg eq '--debug' ) { $$opts{debug}++; next; }
|
78
|
+
if ( $arg eq '-w' || $arg eq '--win' ) { $$opts{win}=shift(@ARGV); next; }
|
79
|
+
if ( $arg eq '-p' || $arg eq '--plot' ) { $$opts{plot}=shift(@ARGV); next; }
|
80
|
+
if ( $arg eq '-t' || $arg eq '--title' ) { $$opts{title}=shift(@ARGV); next; }
|
81
|
+
if ( -e $arg ) { push @{$$opts{files}}, $arg; next }
|
82
|
+
if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
|
83
|
+
error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
|
84
|
+
}
|
85
|
+
if ( !exists($$opts{files}) ) { error("What files should be compared?\n") }
|
86
|
+
return $opts;
|
87
|
+
}
|
88
|
+
|
89
|
+
sub read_list
|
90
|
+
{
|
91
|
+
my ($fname) = @_;
|
92
|
+
my @regions;
|
93
|
+
if ( -e $fname )
|
94
|
+
{
|
95
|
+
open(my $rgs,'<',$fname) or error("$fname: $!");
|
96
|
+
while (my $line=<$rgs>)
|
97
|
+
{
|
98
|
+
chomp($line);
|
99
|
+
push @regions, $line;
|
100
|
+
}
|
101
|
+
close($rgs);
|
102
|
+
}
|
103
|
+
else
|
104
|
+
{
|
105
|
+
@regions = split(/,/,$fname);
|
106
|
+
}
|
107
|
+
return (@regions);
|
108
|
+
}
|
109
|
+
|
110
|
+
sub read_mappings_list
|
111
|
+
{
|
112
|
+
my ($fname,$files) = @_;
|
113
|
+
my @maps = read_list($fname);
|
114
|
+
my %mapping;
|
115
|
+
for my $map (@maps)
|
116
|
+
{
|
117
|
+
my @items = split(/:/,$map);
|
118
|
+
if ( scalar @items != scalar @$files ) { error(sprintf "Expected %d column names, found [$map].\n", scalar @$files); }
|
119
|
+
for (my $i=1; $i<@$files; $i++)
|
120
|
+
{
|
121
|
+
$mapping{$$files[$i]}{$items[$i]} = $items[0];
|
122
|
+
warn("Using column name '$items[0]' for $$files[$i]:$items[$i]\n");
|
123
|
+
}
|
124
|
+
}
|
125
|
+
return \%mapping;
|
126
|
+
}
|
127
|
+
|
128
|
+
sub compare_vcfs
|
129
|
+
{
|
130
|
+
my ($opts) = @_;
|
131
|
+
|
132
|
+
$$opts{match} = {};
|
133
|
+
$$opts{hapls} = {};
|
134
|
+
|
135
|
+
# Open the VCF files and initialize the list of chromosomes
|
136
|
+
my @vcfs;
|
137
|
+
my (@regions,%has_chrom,$mappings);
|
138
|
+
if ( exists($$opts{regions_list}) ) { @regions = read_list($$opts{regions_list}); }
|
139
|
+
if ( exists($$opts{mappings_list}) ) { $mappings = read_mappings_list($$opts{mappings_list},$$opts{files}); }
|
140
|
+
|
141
|
+
print "# This file was generated by vcf-compare.\n#\n";
|
142
|
+
if ( $$opts{debug} )
|
143
|
+
{
|
144
|
+
print
|
145
|
+
"#SD Site discordance. Use `grep ^SD | cut -f 2-` to extract this part.\n",
|
146
|
+
"#SD The columns are: \n",
|
147
|
+
"#SD 1 .. chromosome\n",
|
148
|
+
"#SD 2 .. position\n",
|
149
|
+
"#SD 3 .. indicates matching (+) or mismatching (-) site\n",
|
150
|
+
"#SD 4 .. number of Hom_RR mismatches\n",
|
151
|
+
"#SD 5 .. number of Het_RA mismatches\n",
|
152
|
+
"#SD 6 .. number of Hom_AA mismatches\n";
|
153
|
+
}
|
154
|
+
|
155
|
+
my $ifile = 0;
|
156
|
+
for my $file (@{$$opts{files}})
|
157
|
+
{
|
158
|
+
my $vcf = Vcf->new(file=>$file);
|
159
|
+
$$vcf{vcf_compare_ID} = $ifile++;
|
160
|
+
$vcf->parse_header();
|
161
|
+
$vcf->close();
|
162
|
+
$$vcf{nread} = 0;
|
163
|
+
push @vcfs, $vcf;
|
164
|
+
|
165
|
+
# Update the list of known chromosomes
|
166
|
+
if ( !exists($$opts{regions_list}) )
|
167
|
+
{
|
168
|
+
my $chrms = $vcf->get_chromosomes();
|
169
|
+
for my $chr (@$chrms)
|
170
|
+
{
|
171
|
+
if ( exists($has_chrom{$chr}) ) { next; }
|
172
|
+
$has_chrom{$chr} = 1;
|
173
|
+
push @regions, $chr;
|
174
|
+
}
|
175
|
+
}
|
176
|
+
|
177
|
+
# Check if column names need to be renamed
|
178
|
+
if ( defined $mappings && exists($$mappings{$$vcf{file}}) )
|
179
|
+
{
|
180
|
+
$$vcf{_col_mapping} = $$mappings{$$vcf{file}};
|
181
|
+
for my $name (keys %{$$vcf{_col_mapping}})
|
182
|
+
{
|
183
|
+
if ( !exists($$vcf{has_column}{$name}) ) { error("No such column [$name] in the file $$vcf{file}\n"); }
|
184
|
+
my $new_name = $$vcf{_col_mapping}{$name};
|
185
|
+
$$vcf{_col_mapping_rev}{$new_name} = $name;
|
186
|
+
}
|
187
|
+
}
|
188
|
+
}
|
189
|
+
|
190
|
+
# Include only matching samples in haplotype comparison
|
191
|
+
if ( $$opts{cmp_genotypes} )
|
192
|
+
{
|
193
|
+
my %all_samples;
|
194
|
+
for my $vcf (@vcfs)
|
195
|
+
{
|
196
|
+
if ( exists $$opts{samples} )
|
197
|
+
{
|
198
|
+
for my $sample (@{$$opts{samples}})
|
199
|
+
{
|
200
|
+
if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; }
|
201
|
+
if ( exists($$vcf{has_column}{$sample}) ) { $all_samples{$sample}++ }
|
202
|
+
}
|
203
|
+
}
|
204
|
+
else
|
205
|
+
{
|
206
|
+
my @samples = $vcf->get_samples();
|
207
|
+
for my $sample (@samples)
|
208
|
+
{
|
209
|
+
if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; }
|
210
|
+
$all_samples{$sample}++
|
211
|
+
}
|
212
|
+
}
|
213
|
+
}
|
214
|
+
my @include_samples;
|
215
|
+
while (my ($sample,$count)=each %all_samples)
|
216
|
+
{
|
217
|
+
if ( $count != scalar @vcfs ) { next; }
|
218
|
+
push @include_samples, $sample;
|
219
|
+
}
|
220
|
+
if ( !@include_samples )
|
221
|
+
{
|
222
|
+
error("Error: There is no overlap between any of the samples, yet haplotype comparison was requested.\n");
|
223
|
+
}
|
224
|
+
$$opts{gt_samples_compared} = scalar @include_samples;
|
225
|
+
for my $vcf (@vcfs)
|
226
|
+
{
|
227
|
+
my @include;
|
228
|
+
if ( !exists($$vcf{_col_mapping}) ) { @include=@include_samples; }
|
229
|
+
else
|
230
|
+
{
|
231
|
+
for my $sample (@include_samples)
|
232
|
+
{
|
233
|
+
push @include, exists($$vcf{_col_mapping_rev}{$sample}) ? $$vcf{_col_mapping_rev}{$sample} : $sample
|
234
|
+
}
|
235
|
+
}
|
236
|
+
$vcf->set_samples(include=>\@include);
|
237
|
+
}
|
238
|
+
}
|
239
|
+
|
240
|
+
# Go through all the files simultaneously and get the stats.
|
241
|
+
for my $region (@regions)
|
242
|
+
{
|
243
|
+
# Open files
|
244
|
+
for my $vcf (@vcfs)
|
245
|
+
{
|
246
|
+
delete($$vcf{last_line});
|
247
|
+
$vcf->open(region=>$region,parse_header=>1);
|
248
|
+
delete($$vcf{eof});
|
249
|
+
}
|
250
|
+
do_region_stats($opts,\@vcfs);
|
251
|
+
}
|
252
|
+
|
253
|
+
report_stats($opts,\@vcfs);
|
254
|
+
|
255
|
+
for my $vcf (@vcfs)
|
256
|
+
{
|
257
|
+
if ( !$$vcf{nread} ) { warn("Warning: Read 0 lines from $$vcf{file}, the tabix index may be broken.\n"); }
|
258
|
+
}
|
259
|
+
}
|
260
|
+
|
261
|
+
sub report_stats
|
262
|
+
{
|
263
|
+
my ($opts,$vcfs) = @_;
|
264
|
+
|
265
|
+
# if ( $$opts{debug} )
|
266
|
+
# {
|
267
|
+
# use Data::Dumper; print Dumper($opts);
|
268
|
+
# }
|
269
|
+
|
270
|
+
my (@counts,%totals);
|
271
|
+
while (my ($key,$num) = each %{$$opts{match}})
|
272
|
+
{
|
273
|
+
my @files = split(/\s+/,$key);
|
274
|
+
for my $file (@files)
|
275
|
+
{
|
276
|
+
$totals{$file} += $num;
|
277
|
+
}
|
278
|
+
push @counts, {count=>$num, files=>[@files]};
|
279
|
+
}
|
280
|
+
|
281
|
+
print
|
282
|
+
"#VN 'Venn-Diagram Numbers'. Use `grep ^VN | cut -f 2-` to extract this part.\n",
|
283
|
+
"#VN The columns are: \n",
|
284
|
+
"#VN 1 .. number of sites unique to this particular combination of files\n",
|
285
|
+
"#VN 2- .. combination of files and space-separated number, a fraction of sites in the file\n";
|
286
|
+
for my $rec (sort {$$a{count}<=>$$b{count}} @counts)
|
287
|
+
{
|
288
|
+
my $num = $$rec{count};
|
289
|
+
my $files = $$rec{files};
|
290
|
+
|
291
|
+
print "VN\t$num";
|
292
|
+
for my $file (@$files)
|
293
|
+
{
|
294
|
+
printf "\t$file (%.1f%%)", $num*100./$totals{$file};
|
295
|
+
}
|
296
|
+
print "\n";
|
297
|
+
}
|
298
|
+
|
299
|
+
if ( $$opts{refseq} && $$opts{indels} )
|
300
|
+
{
|
301
|
+
print
|
302
|
+
"#IN Indel Numbers. Use `grep ^IN | cut -f 2-` to extract this part.\n",
|
303
|
+
"#IN .. todo\n",
|
304
|
+
"#IN Number of matching indel haplotypes shared across:\n";
|
305
|
+
while (my ($file,$stat) = each %{$$opts{indels}})
|
306
|
+
{
|
307
|
+
print "IN\t$file\n";
|
308
|
+
my $match = $$stat{match} ? $$stat{match} : 0;
|
309
|
+
my $mismatch = $$stat{mismatch} ? $$stat{mismatch} : 0;
|
310
|
+
printf "\t\tNumber of matches: %d\n", $match;
|
311
|
+
printf "\t\t mismatches: %d\n", $mismatch;
|
312
|
+
printf "\t\t error rate: %.1f%%\n", 100*$mismatch/($match+$mismatch);
|
313
|
+
}
|
314
|
+
}
|
315
|
+
|
316
|
+
print "#SN Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n";
|
317
|
+
printf "SN\tNumber of REF matches:\t%d\n", exists($$opts{ref_match}) ? $$opts{ref_match} : 0;
|
318
|
+
printf "SN\tNumber of ALT matches:\t%d\n", exists($$opts{alt_match}) ? $$opts{alt_match} : 0;
|
319
|
+
printf "SN\tNumber of REF mismatches:\t%d\n", exists($$opts{ref_mismatch}) ? $$opts{ref_mismatch} : 0;
|
320
|
+
printf "SN\tNumber of ALT mismatches:\t%d\n", exists($$opts{alt_mismatch}) ? $$opts{alt_mismatch} : 0;
|
321
|
+
printf "SN\tNumber of samples in GT comparison:\t%d\n", $$opts{gt_samples_compared} ? $$opts{gt_samples_compared} : 0;
|
322
|
+
|
323
|
+
my $out;
|
324
|
+
for my $vcf (@$vcfs)
|
325
|
+
{
|
326
|
+
if ( !exists($totals{$$vcf{file}}) ) { $totals{$$vcf{file}}=0; }
|
327
|
+
if ( $totals{$$vcf{file}} == $$vcf{nread} ) { next; }
|
328
|
+
|
329
|
+
my $diff = $$vcf{nread}-$totals{$$vcf{file}};
|
330
|
+
my $reported = $totals{$$vcf{file}};
|
331
|
+
my $total = $$vcf{nread};
|
332
|
+
$out .= sprintf "SN\t%d (%.1f%%) .. read %d, reported %d\t%s\n", $diff,$diff*100./$total,$total,$reported,$$vcf{file};
|
333
|
+
}
|
334
|
+
if ( $out )
|
335
|
+
{
|
336
|
+
print "SN\tNumber of sites lost due to grouping (e.g. duplicate sites)\n";
|
337
|
+
print $out;
|
338
|
+
}
|
339
|
+
|
340
|
+
|
341
|
+
if ( !$$opts{cmp_genotypes} ) { return; }
|
342
|
+
|
343
|
+
my %summary;
|
344
|
+
for my $id (keys %{$$opts{hapls}})
|
345
|
+
{
|
346
|
+
for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
|
347
|
+
{
|
348
|
+
if ( !exists($$opts{hapls}{$id}{$key.'gtype_mismatch'}) ) { $$opts{hapls}{$id}{$key.'gtype_mismatch'}=0; }
|
349
|
+
$$opts{hapls}{$id}{total_gtype_mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'};
|
350
|
+
|
351
|
+
if ( !exists($$opts{hapls}{$id}{$key.'gtype_match'}) ) { $$opts{hapls}{$id}{$key.'gtype_match'}=0; }
|
352
|
+
$$opts{hapls}{$id}{total_gtype_match} += $$opts{hapls}{$id}{$key.'gtype_match'};
|
353
|
+
|
354
|
+
if ( !exists($$opts{hapls}{$id}{$key.'gtype_lost'}) ) { $$opts{hapls}{$id}{$key.'gtype_lost'}=0; }
|
355
|
+
$$opts{hapls}{$id}{total_gtype_lost} += $$opts{hapls}{$id}{$key.'gtype_lost'};
|
356
|
+
|
357
|
+
if ( !exists($$opts{hapls}{$id}{$key.'gtype_gained'}) ) { $$opts{hapls}{$id}{$key.'gtype_gained'}=0; }
|
358
|
+
$$opts{hapls}{$id}{total_gtype_gained} += $$opts{hapls}{$id}{$key.'gtype_gained'};
|
359
|
+
|
360
|
+
$summary{$key}{match} += $$opts{hapls}{$id}{$key.'gtype_match'};
|
361
|
+
$summary{$key}{mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'};
|
362
|
+
}
|
363
|
+
for my $key qw(het_RA_ het_AA_)
|
364
|
+
{
|
365
|
+
if ( !exists($$opts{hapls}{$id}{$key.'phase_match'}) ) { $$opts{hapls}{$id}{$key.'phase_match'}=0; }
|
366
|
+
$$opts{hapls}{$id}{total_phase_match} += $$opts{hapls}{$id}{$key.'phase_match'};
|
367
|
+
|
368
|
+
if ( !exists($$opts{hapls}{$id}{$key.'phase_mismatch'}) ) { $$opts{hapls}{$id}{$key.'phase_mismatch'}=0; }
|
369
|
+
$$opts{hapls}{$id}{total_phase_mismatch} += $$opts{hapls}{$id}{$key.'phase_mismatch'};
|
370
|
+
|
371
|
+
if ( !exists($$opts{hapls}{$id}{$key.'phase_lost'}) ) { $$opts{hapls}{$id}{$key.'phase_lost'}=0; }
|
372
|
+
$$opts{hapls}{$id}{total_phase_lost} += $$opts{hapls}{$id}{$key.'phase_lost'};
|
373
|
+
}
|
374
|
+
}
|
375
|
+
print
|
376
|
+
"#GS Genotype Comparison Summary. Use `grep ^GS | cut -f 2-` to extract this part.\n",
|
377
|
+
"#GS The columns are:\n",
|
378
|
+
"#GS 1 .. variant type\n",
|
379
|
+
"#GS 2 .. number of mismatches\n",
|
380
|
+
"#GS 3 .. number of matches\n",
|
381
|
+
"#GS 4 .. discordance\n";
|
382
|
+
print_gs($opts,\%summary);
|
383
|
+
|
384
|
+
print
|
385
|
+
"\n",
|
386
|
+
"#GC Genotype Comparison. Use `grep ^GC | cut -f 2-` to extract this part.\n",
|
387
|
+
"#GC The columns are:\n",
|
388
|
+
"#GC 1 .. Sample\n",
|
389
|
+
"#GC 2-6 .. Gtype mismatches: total hom_RR hom_AA het_RA het_AA \n",
|
390
|
+
"#GC 7-9 .. Gtype lost: total het_RA het_AA \n",
|
391
|
+
"#GC 10-14 .. Gtype gained: total hom_RR hom_AA het_RA het_AA \n",
|
392
|
+
"#GC 15-17 .. Phase lost: total het_RA het_AA \n",
|
393
|
+
"#GC 18 .. Phase gained\n",
|
394
|
+
"#GC 19-23 .. Matching sites: total hom_RR hom_AA het_RA het_AA \n",
|
395
|
+
"#GC 24-26 .. Phased matches: total het_RA het_AA \n",
|
396
|
+
"#GC 27-29 .. Misphased matches: total het_RA het_AA \n";
|
397
|
+
|
398
|
+
for my $id (keys %{$$opts{hapls}})
|
399
|
+
{
|
400
|
+
print "GC\t$id";
|
401
|
+
for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_mismatch'}; }
|
402
|
+
for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_lost'}; }
|
403
|
+
for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_gained'}; }
|
404
|
+
for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_lost'}; }
|
405
|
+
if ( !exists($$opts{hapls}{$id}{phase_gained}) ) { $$opts{hapls}{$id}{phase_gained}=0; }
|
406
|
+
print "\t",$$opts{hapls}{$id}{phase_gained};
|
407
|
+
for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_match'}; }
|
408
|
+
for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_match'}; }
|
409
|
+
for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_mismatch'}; }
|
410
|
+
print "\n";
|
411
|
+
}
|
412
|
+
|
413
|
+
print
|
414
|
+
"#AF Number of matching and mismatching genotypes vs non-ref allele frequency. Use `^AF | cut -f 2-` to extract this part.\n",
|
415
|
+
"#AF The columns are:\n",
|
416
|
+
"#AF 1 .. Non-ref allele count\n",
|
417
|
+
"#AF 2 .. Hom(RR) matches\n",
|
418
|
+
"#AF 3 .. Het(RA) matches\n",
|
419
|
+
"#AF 4 .. Hom(AA) matches\n",
|
420
|
+
"#AF 5 .. Het(AA) matches\n",
|
421
|
+
"#AF 6 .. Hom(RR) mismatches\n",
|
422
|
+
"#AF 7 .. Het(RA) mismatches\n",
|
423
|
+
"#AF 8 .. Hom(AA) mismatches\n",
|
424
|
+
"#AF 9 .. Het(AA) mismatches\n";
|
425
|
+
for my $ac (sort {$a<=>$b} keys %{$$opts{counts_by_af}})
|
426
|
+
{
|
427
|
+
print "AF\t$ac";
|
428
|
+
for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
|
429
|
+
{
|
430
|
+
print "\t", $$opts{counts_by_af}{$ac}{$key}{matches} ? $$opts{counts_by_af}{$ac}{$key}{matches} : 0;
|
431
|
+
}
|
432
|
+
for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
|
433
|
+
{
|
434
|
+
print "\t", $$opts{counts_by_af}{$ac}{$key}{mismatches} ? $$opts{counts_by_af}{$ac}{$key}{mismatches} : 0;
|
435
|
+
}
|
436
|
+
print "\n";
|
437
|
+
}
|
438
|
+
|
439
|
+
print "#DP Counts by depth. Use `grep ^DP | cut -f 2-` to extract this part.\n";
|
440
|
+
print "#DP The columns are:\n";
|
441
|
+
print "#DP 1 .. depth\n";
|
442
|
+
print "#DP 2 .. RR matches\n";
|
443
|
+
print "#DP 3 .. RA matches\n";
|
444
|
+
print "#DP 4 .. AA matches\n";
|
445
|
+
print "#DP 5 .. RR -> RA mismatches\n";
|
446
|
+
print "#DP 6 .. RR -> AA mismatches\n";
|
447
|
+
print "#DP 7 .. RA -> RR mismatches\n";
|
448
|
+
print "#DP 8 .. RA -> AA mismatches\n";
|
449
|
+
print "#DP 9 .. AA -> RR mismatches\n";
|
450
|
+
print "#DP 10 .. AA -> RA mismatches\n";
|
451
|
+
for my $dp (sort {$a<=>$b} keys %{$$opts{counts_by_dp}})
|
452
|
+
{
|
453
|
+
print "DP\t$dp";
|
454
|
+
for my $type qw(hom_RR_-hom_RR_ het_RA_-het_RA_ hom_AA_-hom_AA_ hom_RR_-het_RA_ hom_RR_-hom_AA_ het_RA_-hom_RR_ het_RA_-hom_AA_ hom_AA_-hom_RR_ hom_AA_-het_RA_)
|
455
|
+
{
|
456
|
+
printf "\t%d", exists($$opts{counts_by_dp}{$dp}{$type}) ? $$opts{counts_by_dp}{$dp}{$type} : 0;
|
457
|
+
}
|
458
|
+
print "\n";
|
459
|
+
}
|
460
|
+
|
461
|
+
if ( $$opts{debug} )
|
462
|
+
{
|
463
|
+
print "#MT Mismatch Types\n";
|
464
|
+
for my $t1 (keys %{$$opts{mismatch_types}})
|
465
|
+
{
|
466
|
+
for my $t2 (keys %{$$opts{mismatch_types}{$t1}})
|
467
|
+
{
|
468
|
+
print "MT\t$t1\t$t2\t$$opts{mismatch_types}{$t1}{$t2}\n";
|
469
|
+
}
|
470
|
+
}
|
471
|
+
}
|
472
|
+
}
|
473
|
+
|
474
|
+
sub print_gs
|
475
|
+
{
|
476
|
+
my ($opts,$stats) = @_;
|
477
|
+
my ($ndr_ms,$ndr_m);
|
478
|
+
for my $key qw(hom_RR het_RA hom_AA het_AA)
|
479
|
+
{
|
480
|
+
my $m = $$stats{"${key}_"}{match};
|
481
|
+
my $ms = $$stats{"${key}_"}{mismatch};
|
482
|
+
if ( !$m ) { $m=0; }
|
483
|
+
if ( !$ms ) { $ms=0; }
|
484
|
+
printf "GS\t$key\t%d\t%d\t%.2f%%\n", $ms,$m,$m?$ms*100./($m+$ms):0;
|
485
|
+
$ndr_ms += $ms;
|
486
|
+
$ndr_m += $key eq 'hom_RR' ? 0 : $m;
|
487
|
+
}
|
488
|
+
printf
|
489
|
+
"SN\tNon-reference Discordance Rate (NDR):\t%.2f\n", $ndr_m+$ndr_ms ? $ndr_ms*100./($ndr_m+$ndr_ms) : 0;
|
490
|
+
}
|
491
|
+
|
492
|
+
sub read_stats
|
493
|
+
{
|
494
|
+
my ($stats,$file) = @_;
|
495
|
+
open(my $fh,'<',$file) or error("$file: $!");
|
496
|
+
while (my $line=<$fh>)
|
497
|
+
{
|
498
|
+
if ( $line=~/^#/ ) { next; }
|
499
|
+
my @items = split(/\t/,$line);
|
500
|
+
chomp($items[-1]);
|
501
|
+
if ( $items[0] eq 'DP' )
|
502
|
+
{
|
503
|
+
my $dp = $items[1];
|
504
|
+
$$stats{dp}{ndist}{$dp} += $items[2] + $items[3] + $items[4] + $items[5] + $items[6] + $items[7] + $items[8] + $items[9] + $items[10];
|
505
|
+
$$stats{dp}{RR}{RA}{$dp} += $items[5];
|
506
|
+
$$stats{dp}{n}{RR}{RA} += $items[5];
|
507
|
+
$$stats{dp}{RR}{AA}{$dp} += $items[6];
|
508
|
+
$$stats{dp}{n}{RR}{AA} += $items[6];
|
509
|
+
$$stats{dp}{RA}{RR}{$dp} += $items[7];
|
510
|
+
$$stats{dp}{n}{RA}{RR} += $items[7];
|
511
|
+
$$stats{dp}{RA}{AA}{$dp} += $items[8];
|
512
|
+
$$stats{dp}{n}{RA}{AA} += $items[8];
|
513
|
+
$$stats{dp}{AA}{RR}{$dp} += $items[9];
|
514
|
+
$$stats{dp}{n}{AA}{RR} += $items[9];
|
515
|
+
$$stats{dp}{AA}{RA}{$dp} += $items[10];
|
516
|
+
$$stats{dp}{n}{AA}{RA} += $items[10];
|
517
|
+
}
|
518
|
+
if ( $items[0] eq 'AF' )
|
519
|
+
{
|
520
|
+
my $af = $items[1];
|
521
|
+
$$stats{af}{RR}{$af}{matches} += $items[2];
|
522
|
+
$$stats{af}{RA}{$af}{matches} += $items[3];
|
523
|
+
$$stats{af}{AA}{$af}{matches} += $items[4];
|
524
|
+
$$stats{af}{RR}{$af}{mismatches} += $items[6];
|
525
|
+
$$stats{af}{RA}{$af}{mismatches} += $items[7];
|
526
|
+
$$stats{af}{AA}{$af}{mismatches} += $items[8];
|
527
|
+
}
|
528
|
+
if ( $items[0] eq 'GS' )
|
529
|
+
{
|
530
|
+
my $type = $items[1];
|
531
|
+
$$stats{gs}{$type.'_'}{mismatch} += $items[2];
|
532
|
+
$$stats{gs}{$type.'_'}{match} += $items[3];
|
533
|
+
}
|
534
|
+
}
|
535
|
+
close($fh);
|
536
|
+
}
|
537
|
+
|
538
|
+
sub plot_stats
|
539
|
+
{
|
540
|
+
my ($opts) = @_;
|
541
|
+
my $stats = {};
|
542
|
+
for my $file (@{$$opts{files}})
|
543
|
+
{
|
544
|
+
read_stats($stats,$file);
|
545
|
+
}
|
546
|
+
plot_dp($opts,$$stats{dp});
|
547
|
+
plot_af($opts,$$stats{af});
|
548
|
+
print_gs($opts,$$stats{gs});
|
549
|
+
}
|
550
|
+
|
551
|
+
sub plot
|
552
|
+
{
|
553
|
+
my ($file) = @_;
|
554
|
+
system("GDFONTPATH=/usr/share/fonts/truetype/ttf-dejavu/ gnuplot $file");
|
555
|
+
}
|
556
|
+
|
557
|
+
sub plot_dp
|
558
|
+
{
|
559
|
+
my ($opts,$stats) = @_;
|
560
|
+
|
561
|
+
my $out;
|
562
|
+
my @plots;
|
563
|
+
for my $agt (sort keys %$stats)
|
564
|
+
{
|
565
|
+
if ( $agt eq 'n' or $agt eq 'ndist' ) { next; }
|
566
|
+
for my $bgt (sort keys %{$$stats{$agt}})
|
567
|
+
{
|
568
|
+
if ( $bgt eq 'n' ) { next; }
|
569
|
+
for my $dp (sort {$a<=>$b} keys %{$$stats{$agt}{$bgt}})
|
570
|
+
{
|
571
|
+
$out .= $dp . "\t" . ($$stats{n}{$agt}{$bgt} ? $$stats{$agt}{$bgt}{$dp}*100./$$stats{n}{$agt}{$bgt} : 0) . "\n";
|
572
|
+
}
|
573
|
+
$out .= "end\n";
|
574
|
+
push @plots, qq["-" using 1:2 with linespoints pt 12 title "$agt -> $bgt"];
|
575
|
+
}
|
576
|
+
}
|
577
|
+
|
578
|
+
open(my $fh,'>',"$$opts{plot}-dp.gp") or error("$$opts{plot}-dp.gp: $!");
|
579
|
+
print $fh q[
|
580
|
+
set terminal png size 600,400 truecolor font "DejaVuSansMono,9"
|
581
|
+
set output "] . "$$opts{plot}-dp.png" . q["
|
582
|
+
set ylabel 'Fraction of GTs [%]'
|
583
|
+
set y2label 'Number of GTs total'
|
584
|
+
set y2tics
|
585
|
+
set ytics nomirror
|
586
|
+
set xlabel 'Depth'
|
587
|
+
set xrange [:20]
|
588
|
+
];
|
589
|
+
if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; }
|
590
|
+
print $fh "plot ", join(',',@plots), qq[, '-' using 1:2 axes x1y2 with lines lt 0 title "GTs total"\n];
|
591
|
+
print $fh $out;
|
592
|
+
for my $dp (sort {$a<=>$b} keys %{$$stats{ndist}})
|
593
|
+
{
|
594
|
+
print $fh "$dp\t$$stats{ndist}{$dp}\n";
|
595
|
+
}
|
596
|
+
print $fh "end\n";
|
597
|
+
close($fh);
|
598
|
+
|
599
|
+
plot("$$opts{plot}-dp.gp");
|
600
|
+
}
|
601
|
+
|
602
|
+
sub plot_af
|
603
|
+
{
|
604
|
+
my ($opts,$stats) = @_;
|
605
|
+
|
606
|
+
open(my $fh,'>',"$$opts{plot}-af.gp") or error("$$opts{plot}-af.gp: $!");
|
607
|
+
if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; }
|
608
|
+
print $fh q[
|
609
|
+
set terminal png size 550,400 truecolor font "DejaVuSansMono,9"
|
610
|
+
set output "] . "$$opts{plot}-af.png" . q["
|
611
|
+
set grid back lc rgb "#dddddd"
|
612
|
+
set xlabel "Non-reference allele frequency"
|
613
|
+
set ylabel "Concordance"
|
614
|
+
set y2label "Number of genotypes"
|
615
|
+
set yrange [0.0:1.0]
|
616
|
+
set y2tics
|
617
|
+
set key center
|
618
|
+
|
619
|
+
plot '-' axes x1y2 with lines lw 1 lc rgb "red" notitle, \
|
620
|
+
'-' axes x1y2 with lines lw 1 lc rgb "green" notitle, \
|
621
|
+
'-' axes x1y2 with lines lw 1 lc rgb "blue" notitle, \
|
622
|
+
'-' with points pt 20 lc rgb "red" title "HomRef", \
|
623
|
+
'-' with points pt 20 lc rgb "green" title "Het", \
|
624
|
+
'-' with points pt 20 lc rgb "blue" title "HomAlt"
|
625
|
+
];
|
626
|
+
|
627
|
+
for my $type qw(RR RA AA)
|
628
|
+
{
|
629
|
+
for my $af (sort {$a<=>$b} keys %{$$stats{$type}})
|
630
|
+
{
|
631
|
+
print $fh "$af\t" . ($$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches}) . "\n";
|
632
|
+
}
|
633
|
+
print $fh "end\n";
|
634
|
+
}
|
635
|
+
for my $type qw(RR RA AA)
|
636
|
+
{
|
637
|
+
for my $af (sort {$a<=>$b} keys %{$$stats{$type}})
|
638
|
+
{
|
639
|
+
my $n = $$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches};
|
640
|
+
print $fh "$af\t" . ($n ? 1-$$stats{$type}{$af}{mismatches}/$n : -1) . "\n";
|
641
|
+
}
|
642
|
+
print $fh "end\n";
|
643
|
+
}
|
644
|
+
close($fh);
|
645
|
+
|
646
|
+
plot("$$opts{plot}-af.gp");
|
647
|
+
}
|
648
|
+
|
649
|
+
sub do_region_stats
|
650
|
+
{
|
651
|
+
my ($opts,$vcfs) = @_;
|
652
|
+
|
653
|
+
my $refseq;
|
654
|
+
if ( $$opts{refseq} ) { $refseq = FaSlice->new(file=>$$opts{refseq}, size=>1_000_000); }
|
655
|
+
|
656
|
+
my $nvcfs = scalar @$vcfs;
|
657
|
+
my $debug = $$opts{debug} ? $$opts{debug} : 0;
|
658
|
+
my $match = $$opts{match};
|
659
|
+
my $win = $$opts{win} ? $$opts{win} : 0;
|
660
|
+
|
661
|
+
while (1)
|
662
|
+
{
|
663
|
+
my $grp = read_next_group($vcfs,$win);
|
664
|
+
if ( !$grp || !scalar @$grp ) { last }
|
665
|
+
|
666
|
+
if ( $debug>1 )
|
667
|
+
{
|
668
|
+
print "Group:\n";
|
669
|
+
for my $rec (@$grp) { print "$$rec{chr}\t$$rec{pos}\t$$rec{vcf}{file}\n"; }
|
670
|
+
print "\n";
|
671
|
+
}
|
672
|
+
|
673
|
+
my %files;
|
674
|
+
for my $rec (@$grp)
|
675
|
+
{
|
676
|
+
$files{$$rec{vcf}{file}} = 1;
|
677
|
+
}
|
678
|
+
my $key = join(' ',sort(keys %files));
|
679
|
+
$$match{$key}++;
|
680
|
+
|
681
|
+
my $npresent = scalar keys %files;
|
682
|
+
if ( $npresent == $nvcfs )
|
683
|
+
{
|
684
|
+
ref_alt_stats($opts,$grp);
|
685
|
+
}
|
686
|
+
|
687
|
+
if ( $npresent>1 && defined $refseq )
|
688
|
+
{
|
689
|
+
cmp_sequence($opts,$grp,$refseq);
|
690
|
+
}
|
691
|
+
|
692
|
+
if ( $$opts{cmp_genotypes} )
|
693
|
+
{
|
694
|
+
# Check that in the group there is one record for each file
|
695
|
+
if ( $npresent==$nvcfs && scalar @$grp==$nvcfs )
|
696
|
+
{
|
697
|
+
cmp_genotypes($opts,$grp);
|
698
|
+
}
|
699
|
+
}
|
700
|
+
}
|
701
|
+
}
|
702
|
+
|
703
|
+
sub cmp_sequence
|
704
|
+
{
|
705
|
+
my ($opts,$grp,$fa_refseq) = @_;
|
706
|
+
|
707
|
+
# Detailed comparison will be performed only if there are indels or complex
|
708
|
+
# substitutions, SNPs are interesting only in their presence. There can be
|
709
|
+
# more events from the same file present simultaneously and at multiple
|
710
|
+
# positions. They all are treated as separate variants and if any of them
|
711
|
+
# yields a haplotype present in all files, match is reported.
|
712
|
+
# Note that the original version of the code expected all alternate
|
713
|
+
# variants to be present on a single VCF line and was able to compare
|
714
|
+
# consecutive non-overlapping events as one sequence. However, because the
|
715
|
+
# the major producer of indel calls (Dindel) does report one variant per
|
716
|
+
# line, this idea was abandoned.
|
717
|
+
|
718
|
+
# Check if there are any interesting events.
|
719
|
+
my %has_indels;
|
720
|
+
my %events_per_file;
|
721
|
+
my $vcf = $$grp[0]{vcf};
|
722
|
+
for (my $igrp=0; $igrp<@$grp; $igrp++)
|
723
|
+
{
|
724
|
+
my $rec = $$grp[$igrp];
|
725
|
+
my $ifile = $$rec{vcf}{vcf_compare_ID};
|
726
|
+
|
727
|
+
my $ref_len = length($$rec{ref});
|
728
|
+
my @alts = split(/,/,$$rec{alt});
|
729
|
+
for my $alt (@alts)
|
730
|
+
{
|
731
|
+
if ( $alt eq '.' ) { next; }
|
732
|
+
if ( $alt=~/^</ ) { next; }
|
733
|
+
my $alt_len = length($alt);
|
734
|
+
push @{$events_per_file{$ifile}}, { pos=>$$rec{pos}, alt=>$alt, ref_len=>$ref_len };
|
735
|
+
|
736
|
+
# Do complex checking of event type only if it is still not certain if this is waste of time or not
|
737
|
+
if ( exists($has_indels{$ifile}) ) { next; }
|
738
|
+
|
739
|
+
if ( $ref_len!=$alt_len ) { $has_indels{$ifile} = $$rec{vcf}{file}; }
|
740
|
+
elsif ( $ref_len>1 )
|
741
|
+
{
|
742
|
+
my ($type,$len,$ht) = $vcf->event_type($$rec{ref},$alt);
|
743
|
+
if ( $type eq 'o' ) { $has_indels{$ifile} = $$rec{vcf}{file}; }
|
744
|
+
}
|
745
|
+
}
|
746
|
+
}
|
747
|
+
|
748
|
+
# Return if there is nothing interesting
|
749
|
+
if ( scalar keys %has_indels < 2 ) { return; }
|
750
|
+
|
751
|
+
for my $ifile (keys %events_per_file)
|
752
|
+
{
|
753
|
+
if ( !exists($has_indels{$ifile}) ) { delete($events_per_file{$ifile}); }
|
754
|
+
}
|
755
|
+
|
756
|
+
# Cache the reference sequence chunk
|
757
|
+
my $ref_from = $$grp[0]{pos} - $$opts{win};
|
758
|
+
my $ref_to = $$grp[-1]{pos} + $$opts{win};
|
759
|
+
my $refseq = $fa_refseq->get_slice($$grp[0]{chr},$ref_from,$ref_to);
|
760
|
+
|
761
|
+
# For each file get all possible sequences
|
762
|
+
for my $events (values %events_per_file)
|
763
|
+
{
|
764
|
+
for my $variant (@$events)
|
765
|
+
{
|
766
|
+
my $pos = $$variant{pos};
|
767
|
+
my $len = $pos - $ref_from;
|
768
|
+
my $seq = $len>0 ? substr($refseq,0,$len) : '';
|
769
|
+
$seq .= $$variant{alt};
|
770
|
+
|
771
|
+
$pos += $$variant{ref_len};
|
772
|
+
if ( $pos<=$ref_to )
|
773
|
+
{
|
774
|
+
$seq .= substr($refseq,$pos-$ref_from);
|
775
|
+
}
|
776
|
+
|
777
|
+
$$variant{seq} = $seq;
|
778
|
+
$$variant{length} = length($seq);
|
779
|
+
}
|
780
|
+
}
|
781
|
+
|
782
|
+
# Now compare the variants: is there a sequence shared across all files?
|
783
|
+
my $match = 1;
|
784
|
+
my @keys = keys %events_per_file;
|
785
|
+
for (my $ikey=0; $ikey<@keys; $ikey++)
|
786
|
+
{
|
787
|
+
my $ivars = $events_per_file{$ikey};
|
788
|
+
for (my $jkey=0; $jkey<$ikey; $jkey++)
|
789
|
+
{
|
790
|
+
my $jvars = $events_per_file{$jkey};
|
791
|
+
my $found = 0;
|
792
|
+
for my $ivar (@$ivars)
|
793
|
+
{
|
794
|
+
for my $jvar (@$jvars)
|
795
|
+
{
|
796
|
+
if ( $$ivar{length} != $$jvar{length} ) { next; }
|
797
|
+
if ( $$ivar{seq} ne $$jvar{seq} ) { next; }
|
798
|
+
$found=1;
|
799
|
+
last;
|
800
|
+
}
|
801
|
+
}
|
802
|
+
if ( !$found ) { $match=0; last; }
|
803
|
+
}
|
804
|
+
if ( !$match ) { last; }
|
805
|
+
}
|
806
|
+
|
807
|
+
my $key = join(' ',sort(values %has_indels));
|
808
|
+
if ( $match )
|
809
|
+
{
|
810
|
+
$$opts{indels}{$key}{match}++;
|
811
|
+
}
|
812
|
+
else
|
813
|
+
{
|
814
|
+
$$opts{indels}{$key}{mismatch}++;
|
815
|
+
}
|
816
|
+
}
|
817
|
+
|
818
|
+
sub ref_alt_stats
|
819
|
+
{
|
820
|
+
my ($opts,$grp) = @_;
|
821
|
+
|
822
|
+
my $ref = $$grp[0]{ref};
|
823
|
+
my $alt = join(',',sort split(/,/,$$grp[0]{alt}));
|
824
|
+
|
825
|
+
my $alt_mismatch = 0;
|
826
|
+
for (my $i=1; $i<@$grp; $i++)
|
827
|
+
{
|
828
|
+
my $rec = $$grp[$i];
|
829
|
+
|
830
|
+
if ( $ref ne $$rec{ref} )
|
831
|
+
{
|
832
|
+
$$opts{ref_mismatch}++;
|
833
|
+
return;
|
834
|
+
}
|
835
|
+
|
836
|
+
my $tmp = join(',',sort split(/,/,$$rec{alt}));
|
837
|
+
if ( $alt ne $tmp )
|
838
|
+
{
|
839
|
+
$alt_mismatch = 1;
|
840
|
+
}
|
841
|
+
}
|
842
|
+
if ( $alt ne '.' )
|
843
|
+
{
|
844
|
+
if ( $alt_mismatch ) { $$opts{alt_mismatch}++; }
|
845
|
+
else { $$opts{alt_match}++; }
|
846
|
+
}
|
847
|
+
$$opts{ref_match}++;
|
848
|
+
}
|
849
|
+
|
850
|
+
|
851
|
+
sub snp_type
|
852
|
+
{
|
853
|
+
my ($als,$ref) = @_;
|
854
|
+
|
855
|
+
# Determine SNP type: hom(RR),het(RA),hom(AA) or het(AA)
|
856
|
+
if ( $$als[0] eq $$als[1] )
|
857
|
+
{
|
858
|
+
if ( $$als[0] eq $ref ) { return 'hom_RR_'; }
|
859
|
+
else { return 'hom_AA_'; }
|
860
|
+
}
|
861
|
+
else
|
862
|
+
{
|
863
|
+
if ( $$als[0] eq $ref or $$als[1] eq $ref ) { return 'het_RA_'; }
|
864
|
+
else { return 'het_AA_'; }
|
865
|
+
}
|
866
|
+
}
|
867
|
+
|
868
|
+
sub cmp_genotypes
|
869
|
+
{
|
870
|
+
my ($opts,$grp) = @_;
|
871
|
+
my $nrecs = @$grp;
|
872
|
+
my $hapls = $$opts{hapls};
|
873
|
+
|
874
|
+
# Break the VCF lines into hashes (required by parse_haplotype)
|
875
|
+
for my $grp_rec (@$grp)
|
876
|
+
{
|
877
|
+
$$grp_rec{rec} = $$grp_rec{vcf}->next_data_hash($$grp_rec{line});
|
878
|
+
if ( $$opts{ignore_indels} && exists($$grp_rec{rec}{INFO}{INDEL}) ) { return; }
|
879
|
+
if ( exists($$grp_rec{vcf}{_col_mapping}) )
|
880
|
+
{
|
881
|
+
my %new_cols;
|
882
|
+
while (my ($name_ori,$name_new) = each %{$$grp_rec{vcf}{_col_mapping}})
|
883
|
+
{
|
884
|
+
$new_cols{$name_new} = $$grp_rec{rec}{gtypes}{$name_ori};
|
885
|
+
delete($$grp_rec{rec}{gtypes}{$name_ori});
|
886
|
+
}
|
887
|
+
while (my ($name,$hash) = each %new_cols)
|
888
|
+
{
|
889
|
+
$$grp_rec{rec}{gtypes}{$name} = $hash;
|
890
|
+
}
|
891
|
+
}
|
892
|
+
}
|
893
|
+
if ( $$grp[0]{vcf}{vcf_compare_ID} != 0 ) { error("FIXME: different order than expected: $$grp[0]{vcf}{vcf_compare_ID}\n"); }
|
894
|
+
my $ref = $$grp[0]{rec}{REF};
|
895
|
+
|
896
|
+
my %gtype_matches = ();
|
897
|
+
my %gtype_mismatches = ();
|
898
|
+
|
899
|
+
my $min_dp;
|
900
|
+
my $ndp3 = 0;
|
901
|
+
for my $id (keys %{$$grp[0]{rec}{gtypes}})
|
902
|
+
{
|
903
|
+
my (@sorted_als1,$nploid,$type);
|
904
|
+
|
905
|
+
my ($als1,$seps1,$is_phased1,$is_empty1) = $$grp[0]{vcf}->parse_haplotype($$grp[0]{rec},$id);
|
906
|
+
if ( !$is_empty1 )
|
907
|
+
{
|
908
|
+
@sorted_als1 = sort @$als1;
|
909
|
+
$nploid = scalar @sorted_als1;
|
910
|
+
$type = snp_type($als1,$ref);
|
911
|
+
}
|
912
|
+
|
913
|
+
if ( exists($$opts{high_confidence_gls}) )
|
914
|
+
{
|
915
|
+
my @gls = split(/,/,$$grp[1]{rec}{gtypes}{$id}{GL});
|
916
|
+
if ( @gls!=3 or $gls[0] eq '.' ) { next; }
|
917
|
+
@gls = sort {$b<=>$a} @gls;
|
918
|
+
if ( abs($gls[0]-$gls[1])<$$opts{high_confidence_gls} ) { next; }
|
919
|
+
}
|
920
|
+
|
921
|
+
# There may be multiple files entering the comparison. Report match only if all are present and all match.
|
922
|
+
# Report mismatch if all are present and they do not match. Otherwise report lost/gained event.
|
923
|
+
my $phase_match = 1;
|
924
|
+
my $gtype_match = 1;
|
925
|
+
my $gtype_lost = 0;
|
926
|
+
my $gtype_gained = 0;
|
927
|
+
my $phase_lost = 0;
|
928
|
+
my $phase_gained = 0;
|
929
|
+
my $type2;
|
930
|
+
for (my $i=1; $i<$nrecs; $i++)
|
931
|
+
{
|
932
|
+
my ($als2,$seps2,$is_phased2,$is_empty2) = $$grp[$i]{vcf}->parse_haplotype($$grp[$i]{rec},$id);
|
933
|
+
if ( $is_empty1 )
|
934
|
+
{
|
935
|
+
$gtype_match = 0;
|
936
|
+
if ( !$is_empty2 )
|
937
|
+
{
|
938
|
+
$gtype_gained = 1;
|
939
|
+
$type = snp_type($als2,$ref);
|
940
|
+
}
|
941
|
+
if ( !$is_phased1 && $is_phased2 ) { $phase_gained = 1; }
|
942
|
+
last;
|
943
|
+
}
|
944
|
+
elsif ( $is_empty2 )
|
945
|
+
{
|
946
|
+
$gtype_match = 0;
|
947
|
+
$gtype_lost = 1;
|
948
|
+
last;
|
949
|
+
}
|
950
|
+
if ( $is_phased1 )
|
951
|
+
{
|
952
|
+
if ( !$is_phased2 )
|
953
|
+
{
|
954
|
+
$phase_lost = 1;
|
955
|
+
$phase_match = 0;
|
956
|
+
}
|
957
|
+
}
|
958
|
+
elsif ( $is_phased2 )
|
959
|
+
{
|
960
|
+
$phase_gained = 1;
|
961
|
+
$phase_match = 0;
|
962
|
+
}
|
963
|
+
else { $phase_match = 0; }
|
964
|
+
|
965
|
+
# Consider different number of alleles as mismatch (C vs C/C)
|
966
|
+
if ( scalar @$als1 != scalar @$als2 )
|
967
|
+
{
|
968
|
+
$gtype_match = 0;
|
969
|
+
if ( $$opts{debug} ) { $$opts{mismatch_types}{$type}{'Allele_Count'}++ }
|
970
|
+
last;
|
971
|
+
}
|
972
|
+
|
973
|
+
my @sorted_als2 = sort @$als2;
|
974
|
+
for (my $ial=0; $ial<$nploid; $ial++)
|
975
|
+
{
|
976
|
+
if ( $sorted_als1[$ial] ne $sorted_als2[$ial] )
|
977
|
+
{
|
978
|
+
$gtype_match = 0;
|
979
|
+
if ( $$opts{debug} )
|
980
|
+
{
|
981
|
+
my $type2 = snp_type($als2,$ref);
|
982
|
+
$$opts{mismatch_types}{$type}{$type2}++;
|
983
|
+
}
|
984
|
+
last;
|
985
|
+
}
|
986
|
+
}
|
987
|
+
|
988
|
+
if ( !$gtype_match )
|
989
|
+
{
|
990
|
+
if ( !defined $type2 && !$is_empty2 )
|
991
|
+
{
|
992
|
+
$type2 = snp_type($als2,$ref);
|
993
|
+
}
|
994
|
+
last;
|
995
|
+
}
|
996
|
+
|
997
|
+
# They match, check also if their phase agrees
|
998
|
+
if ( $phase_match && $is_phased1 && $is_phased2 )
|
999
|
+
{
|
1000
|
+
for (my $ial=0; $ial<$nploid; $ial++)
|
1001
|
+
{
|
1002
|
+
if ( $$als1[$ial] ne $$als2[$ial] ) { $phase_match=0; last; }
|
1003
|
+
}
|
1004
|
+
}
|
1005
|
+
}
|
1006
|
+
if ( $gtype_gained )
|
1007
|
+
{
|
1008
|
+
$$hapls{$id}{$type.'gtype_gained'}++;
|
1009
|
+
if ( $phase_gained ) { $$hapls{$id}{phased_gtype_gained}++ }
|
1010
|
+
next;
|
1011
|
+
}
|
1012
|
+
if ( $gtype_lost ) { $$hapls{$id}{$type.'gtype_lost'}++; next; }
|
1013
|
+
|
1014
|
+
if ( $phase_gained ) { $$hapls{$id}{phase_gained}++ }
|
1015
|
+
elsif ( $phase_lost ) { $$hapls{$id}{$type.'phase_lost'}++ }
|
1016
|
+
|
1017
|
+
my $dp = exists($$grp[1]{rec}{gtypes}{$id}{DP}) ? $$grp[1]{rec}{gtypes}{$id}{DP} : -1;
|
1018
|
+
if ( $gtype_match )
|
1019
|
+
{
|
1020
|
+
$$hapls{$id}{$type.'gtype_match'}++;
|
1021
|
+
if ( $phase_match ) { $$hapls{$id}{$type.'phase_match'}++ }
|
1022
|
+
$gtype_matches{$type}++;
|
1023
|
+
$$opts{counts_by_dp}{$dp}{$type.'-'.$type}++;
|
1024
|
+
}
|
1025
|
+
elsif ( defined $type )
|
1026
|
+
{
|
1027
|
+
$$hapls{$id}{$type.'gtype_mismatch'}++;
|
1028
|
+
$gtype_mismatches{$type}++;
|
1029
|
+
$$opts{counts_by_dp}{$dp}{$type.'-'.$type2}++;
|
1030
|
+
}
|
1031
|
+
}
|
1032
|
+
$$opts{hapls_ncmp}++;
|
1033
|
+
|
1034
|
+
# Store the number of matching types by AC
|
1035
|
+
my $af;
|
1036
|
+
if ( $$opts{INFO_AF1_af} && exists($$grp[1]{rec}{INFO}{AF1}) )
|
1037
|
+
{
|
1038
|
+
$af = sprintf "%.2f", $$grp[1]{rec}{INFO}{AF1};
|
1039
|
+
}
|
1040
|
+
elsif ( !$$opts{all_samples_af} )
|
1041
|
+
{
|
1042
|
+
my $ac = 0;
|
1043
|
+
my $an = 0;
|
1044
|
+
if ( exists($gtype_matches{hom_AA_}) )
|
1045
|
+
{
|
1046
|
+
$ac += 2*$gtype_matches{hom_AA_};
|
1047
|
+
$an += 2*$gtype_matches{hom_AA_};
|
1048
|
+
}
|
1049
|
+
if ( exists($gtype_mismatches{hom_AA_}) )
|
1050
|
+
{
|
1051
|
+
$ac += 2*$gtype_mismatches{hom_AA_};
|
1052
|
+
$an += 2*$gtype_mismatches{hom_AA_};
|
1053
|
+
}
|
1054
|
+
if ( exists($gtype_matches{het_RA_}) )
|
1055
|
+
{
|
1056
|
+
$ac += $gtype_matches{het_RA_};
|
1057
|
+
$an += 2*$gtype_matches{het_RA_};
|
1058
|
+
}
|
1059
|
+
if ( exists($gtype_mismatches{het_RA_}) )
|
1060
|
+
{
|
1061
|
+
$ac += $gtype_mismatches{het_RA_};
|
1062
|
+
$an += 2*$gtype_mismatches{het_RA_};
|
1063
|
+
}
|
1064
|
+
if ( exists($gtype_matches{hom_RR_}) ) { $an += 2*$gtype_matches{hom_RR_}; }
|
1065
|
+
if ( exists($gtype_mismatches{hom_RR_}) ) { $an += 2*$gtype_mismatches{hom_RR_}; }
|
1066
|
+
$af = sprintf "%.2f", $an>0 ? $ac/$an : 0;
|
1067
|
+
}
|
1068
|
+
else
|
1069
|
+
{
|
1070
|
+
my ($an,$ac) = $$grp[0]{vcf}->calc_an_ac($$grp[0]{rec}{gtypes});
|
1071
|
+
$af = sprintf "%.2f", $an>0 ? $ac/$an : 0;
|
1072
|
+
}
|
1073
|
+
|
1074
|
+
for my $type (keys %gtype_matches)
|
1075
|
+
{
|
1076
|
+
$$opts{counts_by_af}{$af}{$type}{matches} += $gtype_matches{$type};
|
1077
|
+
$$opts{gtypes_cmp_total} += $gtype_matches{$type};
|
1078
|
+
}
|
1079
|
+
for my $type (keys %gtype_mismatches)
|
1080
|
+
{
|
1081
|
+
$$opts{counts_by_af}{$af}{$type}{mismatches} += $gtype_mismatches{$type};
|
1082
|
+
$$opts{gtypes_cmp_total} += $gtype_mismatches{$type};
|
1083
|
+
}
|
1084
|
+
|
1085
|
+
if ( $$opts{debug} )
|
1086
|
+
{
|
1087
|
+
my $match = '?';
|
1088
|
+
if ( scalar keys %gtype_mismatches ) { $match = '-' }
|
1089
|
+
elsif ( scalar keys %gtype_matches ) { $match = '+' }
|
1090
|
+
my $hom_rr_mm = $gtype_mismatches{hom_RR_} ? $gtype_mismatches{hom_RR_} : 0;
|
1091
|
+
my $het_ra_mm = $gtype_mismatches{het_RA_} ? $gtype_mismatches{het_RA_} : 0;
|
1092
|
+
my $hom_aa_mm = $gtype_mismatches{hom_AA_} ? $gtype_mismatches{hom_AA_} : 0;
|
1093
|
+
my $hom_rr_m = $gtype_matches{hom_RR_} ? $gtype_matches{hom_RR_} : 0;
|
1094
|
+
my $het_ra_m = $gtype_matches{het_RA_} ? $gtype_matches{het_RA_} : 0;
|
1095
|
+
my $hom_aa_m = $gtype_matches{hom_AA_} ? $gtype_matches{hom_AA_} : 0;
|
1096
|
+
my $hom_rr_c = sprintf "%.2f", ($hom_rr_mm or $hom_rr_m) ? $hom_rr_m*1./($hom_rr_m + $hom_rr_mm) : -1;
|
1097
|
+
my $het_ra_c = sprintf "%.2f", ($het_ra_mm or $het_ra_m) ? $het_ra_m*1./($het_ra_m + $het_ra_mm) : -1;
|
1098
|
+
my $hom_aa_c = sprintf "%.2f", ($hom_aa_mm or $hom_aa_m) ? $hom_aa_m*1./($hom_aa_m + $hom_aa_mm) : -1;
|
1099
|
+
print "SD\t$$grp[0]{rec}{CHROM}\t$$grp[0]{rec}{POS}\t$match\t$hom_rr_mm\t$het_ra_mm\t$hom_aa_mm\t$hom_rr_c\t$het_ra_c\t$hom_aa_c\n";
|
1100
|
+
}
|
1101
|
+
}
|
1102
|
+
|
1103
|
+
|
1104
|
+
sub read_next_group
|
1105
|
+
{
|
1106
|
+
my ($vcfs,$win) = @_;
|
1107
|
+
|
1108
|
+
my @grp;
|
1109
|
+
my $prev_vcf;
|
1110
|
+
my $start;
|
1111
|
+
|
1112
|
+
while (1)
|
1113
|
+
{
|
1114
|
+
my $min_vcf = get_min_position($vcfs);
|
1115
|
+
if ( !$min_vcf ) { last; }
|
1116
|
+
if ( $prev_vcf && $prev_vcf eq $$min_vcf{buf}[0] ) { last; }
|
1117
|
+
$prev_vcf = $$min_vcf{buf}[0];
|
1118
|
+
|
1119
|
+
if ( !$start or $start+$win >= $$min_vcf{buf}[0]{pos} )
|
1120
|
+
{
|
1121
|
+
my $rec = shift(@{$$min_vcf{buf}});
|
1122
|
+
push @grp,$rec;
|
1123
|
+
|
1124
|
+
$start = $$rec{pos};
|
1125
|
+
next;
|
1126
|
+
}
|
1127
|
+
}
|
1128
|
+
return \@grp;
|
1129
|
+
}
|
1130
|
+
|
1131
|
+
sub get_min_position
|
1132
|
+
{
|
1133
|
+
my ($vcfs) = @_;
|
1134
|
+
|
1135
|
+
my ($min_pos,$min_vcf);
|
1136
|
+
for my $vcf (@$vcfs)
|
1137
|
+
{
|
1138
|
+
# Check if there is a line in the buffer, if not, read. If still empty, the file reached eof
|
1139
|
+
if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { read_line($vcf); }
|
1140
|
+
if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { next; }
|
1141
|
+
|
1142
|
+
my $line = $$vcf{buf}[0];
|
1143
|
+
|
1144
|
+
# Designate this position as the minimum of all the files if:
|
1145
|
+
# .. is this the first file?
|
1146
|
+
if ( !$min_pos )
|
1147
|
+
{
|
1148
|
+
$min_pos = $$line{pos};
|
1149
|
+
$min_vcf = $vcf;
|
1150
|
+
next;
|
1151
|
+
}
|
1152
|
+
|
1153
|
+
# .. has this file lower position?
|
1154
|
+
if ( $min_pos>$$line{pos} )
|
1155
|
+
{
|
1156
|
+
$min_pos = $$line{pos};
|
1157
|
+
$min_vcf = $vcf;
|
1158
|
+
next;
|
1159
|
+
}
|
1160
|
+
}
|
1161
|
+
return $min_vcf;
|
1162
|
+
}
|
1163
|
+
|
1164
|
+
sub read_line
|
1165
|
+
{
|
1166
|
+
my ($vcf) = @_;
|
1167
|
+
|
1168
|
+
if ( $$vcf{eof} ) { return; }
|
1169
|
+
|
1170
|
+
my $line = $vcf->next_line();
|
1171
|
+
if ( !$line )
|
1172
|
+
{
|
1173
|
+
$$vcf{eof} = 1;
|
1174
|
+
return;
|
1175
|
+
}
|
1176
|
+
|
1177
|
+
$$vcf{nread}++;
|
1178
|
+
|
1179
|
+
if ( !($line=~/^(\S+)\t(\S+)\t\S+\t(\S+)\t(\S+)/) ) { error("Could not parse the line: [$line]\n"); }
|
1180
|
+
my $chr = $1;
|
1181
|
+
my $pos = $2;
|
1182
|
+
my $ref = uc($3);
|
1183
|
+
my $alt = uc($4);
|
1184
|
+
if ( $$vcf{buf} && @{$$vcf{buf}} )
|
1185
|
+
{
|
1186
|
+
my $prev = $$vcf{buf}[-1];
|
1187
|
+
if ( $$prev{pos} == $pos ) { warn("Position $chr:$pos appeared twice in $$vcf{file}\n"); }
|
1188
|
+
}
|
1189
|
+
|
1190
|
+
push @{$$vcf{buf}}, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line, vcf=>$vcf };
|
1191
|
+
return;
|
1192
|
+
}
|
1193
|
+
|