ngs_server 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/ngs_server +72 -50
- data/ext/bamtools/extconf.rb +3 -3
- data/ext/vcftools/Makefile +28 -0
- data/ext/vcftools/README.txt +36 -0
- data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
- data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
- data/ext/vcftools/cpp/.svn/entries +708 -0
- data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
- data/ext/vcftools/cpp/Makefile +46 -0
- data/ext/vcftools/cpp/dgeev.cpp +146 -0
- data/ext/vcftools/cpp/dgeev.h +43 -0
- data/ext/vcftools/cpp/output_log.cpp +79 -0
- data/ext/vcftools/cpp/output_log.h +34 -0
- data/ext/vcftools/cpp/parameters.cpp +535 -0
- data/ext/vcftools/cpp/parameters.h +154 -0
- data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
- data/ext/vcftools/cpp/vcf_entry.h +190 -0
- data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
- data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
- data/ext/vcftools/cpp/vcf_file.cpp +495 -0
- data/ext/vcftools/cpp/vcf_file.h +184 -0
- data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
- data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
- data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
- data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
- data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
- data/ext/vcftools/cpp/vcftools.cpp +107 -0
- data/ext/vcftools/cpp/vcftools.h +25 -0
- data/ext/vcftools/examples/.svn/all-wcprops +185 -0
- data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
- data/ext/vcftools/examples/.svn/entries +1048 -0
- data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
- data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
- data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
- data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
- data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
- data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
- data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
- data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
- data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
- data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
- data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
- data/ext/vcftools/examples/annotate-test.vcf +37 -0
- data/ext/vcftools/examples/annotate.out +23 -0
- data/ext/vcftools/examples/annotate.txt +7 -0
- data/ext/vcftools/examples/annotate2.out +52 -0
- data/ext/vcftools/examples/annotate3.out +23 -0
- data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
- data/ext/vcftools/examples/cmp-test.out +53 -0
- data/ext/vcftools/examples/concat-a.vcf +21 -0
- data/ext/vcftools/examples/concat-b.vcf +13 -0
- data/ext/vcftools/examples/concat-c.vcf +19 -0
- data/ext/vcftools/examples/concat.out +39 -0
- data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
- data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
- data/ext/vcftools/examples/merge-test-a.vcf +17 -0
- data/ext/vcftools/examples/merge-test-b.vcf +17 -0
- data/ext/vcftools/examples/merge-test-c.vcf +15 -0
- data/ext/vcftools/examples/merge-test.vcf.out +31 -0
- data/ext/vcftools/examples/perl-api-1.pl +46 -0
- data/ext/vcftools/examples/query-test.out +6 -0
- data/ext/vcftools/examples/shuffle-test.vcf +12 -0
- data/ext/vcftools/examples/subset.SNPs.out +10 -0
- data/ext/vcftools/examples/subset.indels.out +18 -0
- data/ext/vcftools/examples/subset.vcf +21 -0
- data/ext/vcftools/examples/valid-3.3.vcf +30 -0
- data/ext/vcftools/examples/valid-4.0.vcf +34 -0
- data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
- data/ext/vcftools/examples/valid-4.1.vcf +37 -0
- data/ext/vcftools/extconf.rb +2 -0
- data/ext/vcftools/perl/.svn/all-wcprops +149 -0
- data/ext/vcftools/perl/.svn/entries +844 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
- data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
- data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
- data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
- data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
- data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
- data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
- data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
- data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
- data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
- data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
- data/ext/vcftools/perl/ChangeLog +84 -0
- data/ext/vcftools/perl/FaSlice.pm +214 -0
- data/ext/vcftools/perl/Makefile +12 -0
- data/ext/vcftools/perl/Vcf.pm +2853 -0
- data/ext/vcftools/perl/VcfStats.pm +681 -0
- data/ext/vcftools/perl/fill-aa +103 -0
- data/ext/vcftools/perl/fill-an-ac +56 -0
- data/ext/vcftools/perl/fill-ref-md5 +204 -0
- data/ext/vcftools/perl/tab-to-vcf +92 -0
- data/ext/vcftools/perl/test.t +376 -0
- data/ext/vcftools/perl/vcf-annotate +1099 -0
- data/ext/vcftools/perl/vcf-compare +1193 -0
- data/ext/vcftools/perl/vcf-concat +310 -0
- data/ext/vcftools/perl/vcf-convert +180 -0
- data/ext/vcftools/perl/vcf-fix-newlines +97 -0
- data/ext/vcftools/perl/vcf-isec +660 -0
- data/ext/vcftools/perl/vcf-merge +577 -0
- data/ext/vcftools/perl/vcf-query +286 -0
- data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
- data/ext/vcftools/perl/vcf-sort +79 -0
- data/ext/vcftools/perl/vcf-stats +160 -0
- data/ext/vcftools/perl/vcf-subset +206 -0
- data/ext/vcftools/perl/vcf-to-tab +112 -0
- data/ext/vcftools/perl/vcf-validator +145 -0
- data/ext/vcftools/website/.svn/all-wcprops +41 -0
- data/ext/vcftools/website/.svn/entries +238 -0
- data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
- data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
- data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
- data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
- data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
- data/ext/vcftools/website/Makefile +6 -0
- data/ext/vcftools/website/README +2 -0
- data/ext/vcftools/website/VCF-poster.pdf +0 -0
- data/ext/vcftools/website/default.css +250 -0
- data/ext/vcftools/website/favicon.ico +0 -0
- data/ext/vcftools/website/favicon.png +0 -0
- data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/img/.svn/entries +300 -0
- data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
- data/ext/vcftools/website/img/bg.gif +0 -0
- data/ext/vcftools/website/img/bgcode.gif +0 -0
- data/ext/vcftools/website/img/bgcontainer.gif +0 -0
- data/ext/vcftools/website/img/bgul.gif +0 -0
- data/ext/vcftools/website/img/header.gif +0 -0
- data/ext/vcftools/website/img/li.gif +0 -0
- data/ext/vcftools/website/img/quote.gif +0 -0
- data/ext/vcftools/website/img/search.gif +0 -0
- data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/src/.svn/entries +300 -0
- data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
- data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
- data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
- data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
- data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
- data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
- data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
- data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
- data/ext/vcftools/website/src/docs.inc +202 -0
- data/ext/vcftools/website/src/index.inc +52 -0
- data/ext/vcftools/website/src/index.php +80 -0
- data/ext/vcftools/website/src/license.inc +27 -0
- data/ext/vcftools/website/src/links.inc +13 -0
- data/ext/vcftools/website/src/options.inc +654 -0
- data/ext/vcftools/website/src/perl_module.inc +249 -0
- data/ext/vcftools/website/src/specs.inc +18 -0
- data/lib/config.ru +9 -0
- data/lib/ngs_server/add.rb +9 -0
- data/lib/ngs_server/version.rb +1 -1
- data/lib/ngs_server.rb +55 -3
- data/ngs_server.gemspec +5 -2
- metadata +296 -6
|
@@ -0,0 +1,3012 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* vcf_file_output.cpp
|
|
3
|
+
*
|
|
4
|
+
* Created on: Aug 28, 2009
|
|
5
|
+
* Author: Adam Auton
|
|
6
|
+
* ($Revision: 249 $)
|
|
7
|
+
*/
|
|
8
|
+
#include "vcf_file.h"
|
|
9
|
+
|
|
10
|
+
void vcf_file::output_frequency(const string &output_file_prefix, bool output_counts, bool suppress_allele_output)
|
|
11
|
+
{
|
|
12
|
+
// Output statistics of frequency at each site
|
|
13
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
14
|
+
error("Require Genotypes in VCF file in order to output Frequency Statistics.");
|
|
15
|
+
|
|
16
|
+
printLOG("Outputting Frequency Statistics...\n");
|
|
17
|
+
string output_file = output_file_prefix + ".frq";
|
|
18
|
+
if (output_counts)
|
|
19
|
+
output_file += ".count";
|
|
20
|
+
|
|
21
|
+
ofstream out(output_file.c_str());
|
|
22
|
+
if (!out.is_open()) error("Could not open output file: " + output_file, 12);
|
|
23
|
+
if (suppress_allele_output == false)
|
|
24
|
+
{
|
|
25
|
+
out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{ALLELE:";
|
|
26
|
+
if (output_counts)
|
|
27
|
+
out << "COUNT}" << endl;
|
|
28
|
+
else
|
|
29
|
+
out << "FREQ}" << endl;
|
|
30
|
+
}
|
|
31
|
+
else
|
|
32
|
+
{
|
|
33
|
+
if (output_counts)
|
|
34
|
+
out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{COUNT}" << endl;
|
|
35
|
+
else
|
|
36
|
+
out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{FREQ}" << endl;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
vector<int> allele_counts;
|
|
40
|
+
unsigned int N_non_missing_chr;
|
|
41
|
+
unsigned int N_alleles;
|
|
42
|
+
string vcf_line;
|
|
43
|
+
vcf_entry e(N_indv);
|
|
44
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
45
|
+
{
|
|
46
|
+
if (include_entry[s] == false)
|
|
47
|
+
continue;
|
|
48
|
+
|
|
49
|
+
get_vcf_entry(s, vcf_line);
|
|
50
|
+
e.reset(vcf_line);
|
|
51
|
+
e.parse_basic_entry(true);
|
|
52
|
+
e.parse_genotype_entries(true);
|
|
53
|
+
N_alleles = e.get_N_alleles();
|
|
54
|
+
|
|
55
|
+
e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
|
|
56
|
+
|
|
57
|
+
out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << N_alleles << "\t" << N_non_missing_chr;
|
|
58
|
+
if (output_counts)
|
|
59
|
+
{
|
|
60
|
+
if (suppress_allele_output == false)
|
|
61
|
+
{
|
|
62
|
+
out << "\t" << e.get_REF() << ":" << allele_counts[0];
|
|
63
|
+
for (unsigned int ui=1; ui<N_alleles; ui++)
|
|
64
|
+
{
|
|
65
|
+
out << "\t" << e.get_ALT_allele(ui-1) << ":" << allele_counts[ui];
|
|
66
|
+
}
|
|
67
|
+
out << endl;
|
|
68
|
+
}
|
|
69
|
+
else
|
|
70
|
+
{
|
|
71
|
+
for (unsigned ui=0; ui<N_alleles; ui++)
|
|
72
|
+
{
|
|
73
|
+
out << "\t" << allele_counts[ui];
|
|
74
|
+
}
|
|
75
|
+
out << endl;
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
else
|
|
79
|
+
{
|
|
80
|
+
double freq;
|
|
81
|
+
if (suppress_allele_output == false)
|
|
82
|
+
{
|
|
83
|
+
freq = allele_counts[0] / (double)N_non_missing_chr;
|
|
84
|
+
out << "\t" << e.get_REF() << ":" << freq;
|
|
85
|
+
for (unsigned int ui=1; ui<N_alleles; ui++)
|
|
86
|
+
{
|
|
87
|
+
freq = allele_counts[ui] / (double)N_non_missing_chr;
|
|
88
|
+
out << "\t" << e.get_ALT_allele(ui-1) << ":" << freq;
|
|
89
|
+
}
|
|
90
|
+
out << endl;
|
|
91
|
+
}
|
|
92
|
+
else
|
|
93
|
+
{
|
|
94
|
+
for (unsigned int ui=0; ui<N_alleles; ui++)
|
|
95
|
+
{
|
|
96
|
+
freq = allele_counts[ui] / (double)N_non_missing_chr;
|
|
97
|
+
out << "\t" << freq;
|
|
98
|
+
}
|
|
99
|
+
out << endl;
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
out.close();
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
void vcf_file::output_het(const string &output_file_prefix)
|
|
107
|
+
{
|
|
108
|
+
// Output statistics on Heterozygosity for each individual
|
|
109
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
110
|
+
error("Require Genotypes in VCF file in order to output Heterozygosity Statistics.");
|
|
111
|
+
// Following the calculations in PLINK....
|
|
112
|
+
// Note this assumes Biallelic SNPs.
|
|
113
|
+
|
|
114
|
+
printLOG("Outputting Individual Heterozygosity\n");
|
|
115
|
+
|
|
116
|
+
string output_file = output_file_prefix + ".het";
|
|
117
|
+
ofstream out(output_file.c_str());
|
|
118
|
+
if (!out.is_open()) error("Could not open output file: " + output_file, 12);
|
|
119
|
+
out << "INDV\tO(HOM)\tE(HOM)\tN_SITES\tF" << endl;
|
|
120
|
+
|
|
121
|
+
// P(Homo) = F + (1-F)P(Homo by chance)
|
|
122
|
+
// P(Homo by chance) = p^2+q^2 for a biallelic locus.
|
|
123
|
+
// For an individual with N genotyped loci, we
|
|
124
|
+
// 1. count the total observed number of loci which are homozygous (O),
|
|
125
|
+
// 2. calculate the total expected number of loci homozygous by chance (E)
|
|
126
|
+
// Then, using the method of moments, we have
|
|
127
|
+
// O = NF + (1-F)E
|
|
128
|
+
// Which rearranges to give
|
|
129
|
+
// F = (O-E)/(N-E)
|
|
130
|
+
|
|
131
|
+
// First, calc frequency of each site (should really move this to a subroutine)
|
|
132
|
+
vector<double> freq(N_entries, 0.0);
|
|
133
|
+
vector<int> allele_counts;
|
|
134
|
+
vector<unsigned int> N_non_missing_chr(N_entries,0);
|
|
135
|
+
string vcf_line;
|
|
136
|
+
vcf_entry e(N_indv);
|
|
137
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
138
|
+
{
|
|
139
|
+
if (include_entry[s] == false)
|
|
140
|
+
continue;
|
|
141
|
+
|
|
142
|
+
get_vcf_entry(s, vcf_line);
|
|
143
|
+
e.reset(vcf_line);
|
|
144
|
+
e.parse_basic_entry(true);
|
|
145
|
+
|
|
146
|
+
if (e.get_N_alleles() != 2)
|
|
147
|
+
{
|
|
148
|
+
one_off_warning("\tIndividual Heterozygosity: Only using biallelic SNPs.");
|
|
149
|
+
continue;
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
e.parse_genotype_entries(true);
|
|
153
|
+
|
|
154
|
+
if (e.is_diploid(include_indv, include_genotype[s]) == false)
|
|
155
|
+
{
|
|
156
|
+
one_off_warning("\tIndividual Heterozygosity: Only using fully diploid SNPs.");
|
|
157
|
+
continue;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
// Frequency of non-reference allele
|
|
161
|
+
e.get_allele_counts(allele_counts, N_non_missing_chr[s], include_indv, include_genotype[s]);
|
|
162
|
+
|
|
163
|
+
if (N_non_missing_chr[s] > 0)
|
|
164
|
+
freq[s] = allele_counts[1] / double(N_non_missing_chr[s]);
|
|
165
|
+
else
|
|
166
|
+
freq[s] = -1;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
vector<int> N_sites_included(N_indv, 0);
|
|
170
|
+
vector<int> N_obs_hom(N_indv, 0);
|
|
171
|
+
vector<double> N_expected_hom(N_indv, 0.0);
|
|
172
|
+
pair<int, int> alleles;
|
|
173
|
+
|
|
174
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
175
|
+
{
|
|
176
|
+
if (include_entry[s] == false)
|
|
177
|
+
continue;
|
|
178
|
+
|
|
179
|
+
get_vcf_entry(s, vcf_line);
|
|
180
|
+
e.reset(vcf_line);
|
|
181
|
+
e.parse_basic_entry(true);
|
|
182
|
+
|
|
183
|
+
if (e.get_N_alleles() != 2)
|
|
184
|
+
continue;
|
|
185
|
+
|
|
186
|
+
e.parse_genotype_entries(true);
|
|
187
|
+
if (e.is_diploid(include_indv, include_genotype[s]) == false)
|
|
188
|
+
continue;
|
|
189
|
+
|
|
190
|
+
if ((freq[s] <= numeric_limits<double>::epsilon()) || (1.0 - freq[s] <= numeric_limits<double>::epsilon()))
|
|
191
|
+
continue;
|
|
192
|
+
|
|
193
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
194
|
+
{
|
|
195
|
+
if (include_indv[ui] == false)
|
|
196
|
+
continue;
|
|
197
|
+
|
|
198
|
+
if (include_genotype[s][ui] == true)
|
|
199
|
+
{
|
|
200
|
+
e.get_indv_GENOTYPE_ids(ui, alleles);
|
|
201
|
+
if ((alleles.first != -1) && (alleles.second != -1))
|
|
202
|
+
{
|
|
203
|
+
N_sites_included[ui]++;
|
|
204
|
+
if (alleles.first == alleles.second)
|
|
205
|
+
N_obs_hom[ui]++;
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
/////////////////////////
|
|
209
|
+
// Expected homozygosity
|
|
210
|
+
// E = 1 - (2pq . 2N/(2N-1))
|
|
211
|
+
// (Using Nei's unbiased estimator)
|
|
212
|
+
N_expected_hom[ui] += 1.0 - (2.0 * freq[s] * (1.0 - freq[s]) * (N_non_missing_chr[s] / (N_non_missing_chr[s] - 1.0)));
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
out.setf(ios::fixed,ios::floatfield);
|
|
218
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
219
|
+
{
|
|
220
|
+
if (include_indv[ui] == false)
|
|
221
|
+
continue;
|
|
222
|
+
if (N_sites_included[ui] > 0)
|
|
223
|
+
{
|
|
224
|
+
double F = (N_obs_hom[ui] - N_expected_hom[ui]) / double(N_sites_included[ui] - N_expected_hom[ui]);
|
|
225
|
+
out << indv[ui] << "\t" << N_obs_hom[ui] << "\t";
|
|
226
|
+
out.precision(1);
|
|
227
|
+
out << N_expected_hom[ui] << "\t";
|
|
228
|
+
out.precision(5);
|
|
229
|
+
out << N_sites_included[ui] << "\t" << F << endl;
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
out.close();
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
void vcf_file::output_hwe(const string &output_file_prefix)
|
|
237
|
+
{
|
|
238
|
+
// Output HWE statistics for each site as described in Wigginton, Cutler, and Abecasis (2005)
|
|
239
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
240
|
+
error("Require Genotypes in VCF file in order to output HWE Statistics.");
|
|
241
|
+
// Note this assumes Biallelic SNPs.
|
|
242
|
+
printLOG("Outputting HWE statistics (but only for biallelic loci)\n");
|
|
243
|
+
|
|
244
|
+
string output_file = output_file_prefix + ".hwe";
|
|
245
|
+
ofstream out(output_file.c_str());
|
|
246
|
+
if (!out.is_open()) error("Could not open output file: " + output_file, 12);
|
|
247
|
+
out << "CHR\tPOS\tOBS(HOM1/HET/HOM2)\tE(HOM1/HET/HOM2)\tChiSq\tP" << endl;
|
|
248
|
+
|
|
249
|
+
/* PLINK code:
|
|
250
|
+
// b11 = Nhom1, b12 = Nhet, b22 = Nhom2
|
|
251
|
+
double tot = b11 + b12 + b22;
|
|
252
|
+
double exp_11 = freq * freq * tot;
|
|
253
|
+
double exp_12 = 2 * freq * (1-freq) * tot;
|
|
254
|
+
double exp_22 = (1-freq) * (1-freq) * tot;
|
|
255
|
+
|
|
256
|
+
double chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11
|
|
257
|
+
+ ( (b12-exp_12)*(b12-exp_12) ) / exp_12
|
|
258
|
+
+ ( (b22-exp_22)*(b22-exp_22) ) / exp_22 ;
|
|
259
|
+
|
|
260
|
+
p = chiprobP(chisq,1);
|
|
261
|
+
*/
|
|
262
|
+
|
|
263
|
+
double freq;
|
|
264
|
+
unsigned int b11, b12, b22;
|
|
265
|
+
double exp_11, exp_12, exp_22;
|
|
266
|
+
double chisq;
|
|
267
|
+
double tot;
|
|
268
|
+
double p;
|
|
269
|
+
unsigned int precision = out.precision();
|
|
270
|
+
vector<int> allele_counts;
|
|
271
|
+
unsigned int N_non_missing_chr;
|
|
272
|
+
string vcf_line;
|
|
273
|
+
vcf_entry e(N_indv);
|
|
274
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
275
|
+
{
|
|
276
|
+
if (include_entry[s] == false)
|
|
277
|
+
continue;
|
|
278
|
+
|
|
279
|
+
get_vcf_entry(s, vcf_line);
|
|
280
|
+
e.reset(vcf_line);
|
|
281
|
+
e.parse_basic_entry(true);
|
|
282
|
+
|
|
283
|
+
if (e.get_N_alleles() != 2)
|
|
284
|
+
{
|
|
285
|
+
one_off_warning("\tHWE: Only using biallelic SNPs.");
|
|
286
|
+
continue; // Isn't biallelic
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
e.parse_genotype_entries(true);
|
|
290
|
+
|
|
291
|
+
if (e.is_diploid(include_indv, include_genotype[s]) == false)
|
|
292
|
+
{
|
|
293
|
+
one_off_warning("\tHWE: Only using fully diploid SNPs.");
|
|
294
|
+
continue; // Isn't diploid
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
|
|
298
|
+
freq = allele_counts[0] / (double)N_non_missing_chr;
|
|
299
|
+
e.get_genotype_counts(include_indv, include_genotype[s], b11, b12, b22);
|
|
300
|
+
tot = b11 + b12 + b22;
|
|
301
|
+
exp_11 = freq * freq * tot;
|
|
302
|
+
exp_12 = 2.0 * freq * (1.0-freq) * tot;
|
|
303
|
+
exp_22 = (1.0-freq) * (1.0-freq) * tot;
|
|
304
|
+
|
|
305
|
+
chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11
|
|
306
|
+
+ ( (b12-exp_12)*(b12-exp_12) ) / exp_12
|
|
307
|
+
+ ( (b22-exp_22)*(b22-exp_22) ) / exp_22;
|
|
308
|
+
|
|
309
|
+
p = vcf_entry::SNPHWE(b12, b11, b22);
|
|
310
|
+
out << e.get_CHROM() << "\t" << e.get_POS();
|
|
311
|
+
out << "\t" << b11 << "/" << b12 << "/" << b22;
|
|
312
|
+
out.precision(2);
|
|
313
|
+
out << fixed << "\t" << exp_11 << "/" << exp_12 << "/" << exp_22;
|
|
314
|
+
out.precision(precision);
|
|
315
|
+
out << "\t" << chisq << "\t" << p << endl;
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
void vcf_file::output_individuals_by_mean_depth(const string &output_file_prefix)
|
|
320
|
+
{
|
|
321
|
+
// Output information regarding the mean depth for each individual
|
|
322
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
323
|
+
error("Require Genotypes in VCF file in order to output Individuals by Mean Depth Statistics.");
|
|
324
|
+
|
|
325
|
+
printLOG("Outputting Mean Depth by Individual\n");
|
|
326
|
+
string output = output_file_prefix + ".idepth";
|
|
327
|
+
ofstream out(output.c_str());
|
|
328
|
+
if (!out.is_open())
|
|
329
|
+
error("Could not open Individual Depth Output File: " + output, 2);
|
|
330
|
+
out << "INDV\tN_SITES\tMEAN_DEPTH" << endl;
|
|
331
|
+
vector<double> depth_sum(N_indv, 0.0);
|
|
332
|
+
vector<int> count(N_indv, 0);
|
|
333
|
+
int depth;
|
|
334
|
+
string vcf_line;
|
|
335
|
+
vcf_entry e(N_indv);
|
|
336
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
337
|
+
{
|
|
338
|
+
if (include_entry[s] == false)
|
|
339
|
+
continue;
|
|
340
|
+
|
|
341
|
+
get_vcf_entry(s, vcf_line);
|
|
342
|
+
e.reset(vcf_line);
|
|
343
|
+
|
|
344
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
345
|
+
{
|
|
346
|
+
if (include_indv[ui] == false)
|
|
347
|
+
continue;
|
|
348
|
+
|
|
349
|
+
if (include_genotype[s][ui] == true)
|
|
350
|
+
{
|
|
351
|
+
e.parse_genotype_entry(ui, false, false, true);
|
|
352
|
+
depth = e.get_indv_DEPTH(ui);
|
|
353
|
+
if (depth >= 0)
|
|
354
|
+
{
|
|
355
|
+
depth_sum[ui] += depth;
|
|
356
|
+
count[ui]++;
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
363
|
+
{
|
|
364
|
+
if (include_indv[ui] == false)
|
|
365
|
+
continue;
|
|
366
|
+
|
|
367
|
+
double mean_depth = depth_sum[ui] / count[ui];
|
|
368
|
+
out << indv[ui] << "\t" << count[ui] << "\t" << mean_depth << endl;
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
out.close();
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
void vcf_file::output_SNP_density(const string &output_file_prefix, int bin_size)
|
|
375
|
+
{
|
|
376
|
+
// Output SNP density (technically variant density)
|
|
377
|
+
if (bin_size <= 0)
|
|
378
|
+
return;
|
|
379
|
+
printLOG("Outputting SNP density\n");
|
|
380
|
+
|
|
381
|
+
string output = output_file_prefix + ".snpden";
|
|
382
|
+
ofstream out(output.c_str());
|
|
383
|
+
if (!out.is_open())
|
|
384
|
+
error("Could not open SNP Density Output File: " + output, 2);
|
|
385
|
+
|
|
386
|
+
// Find maximum position
|
|
387
|
+
unsigned int s;
|
|
388
|
+
map<string, int> max_pos;
|
|
389
|
+
string vcf_line;
|
|
390
|
+
string CHROM; int POS;
|
|
391
|
+
vcf_entry e(N_indv);
|
|
392
|
+
for (s=0; s<N_entries; s++)
|
|
393
|
+
{
|
|
394
|
+
if (include_entry[s] == true)
|
|
395
|
+
{
|
|
396
|
+
//get_vcf_entry(s, vcf_line);
|
|
397
|
+
//e.reset(vcf_line);
|
|
398
|
+
//e.parse_basic_entry();
|
|
399
|
+
|
|
400
|
+
//CHROM = e.get_CHROM();
|
|
401
|
+
//POS = e.get_POS();
|
|
402
|
+
|
|
403
|
+
set_filepos(entry_file_locations[s]);
|
|
404
|
+
read_CHROM_and_POS_only(CHROM, POS);
|
|
405
|
+
if (max_pos.find(CHROM) != max_pos.end())
|
|
406
|
+
{
|
|
407
|
+
if (POS > max_pos[CHROM])
|
|
408
|
+
max_pos[CHROM] = POS;
|
|
409
|
+
}
|
|
410
|
+
else
|
|
411
|
+
max_pos[CHROM] = POS;
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
map<string, int>::iterator it;
|
|
416
|
+
|
|
417
|
+
unsigned int N_bins;
|
|
418
|
+
map<string, vector<int> > bins;
|
|
419
|
+
for (it=max_pos.begin(); it != max_pos.end(); ++it)
|
|
420
|
+
{
|
|
421
|
+
CHROM = (*it).first;
|
|
422
|
+
N_bins = (unsigned int)((max_pos[CHROM] + bin_size) / double(bin_size));
|
|
423
|
+
bins[CHROM].resize(N_bins, 0);
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
unsigned int idx;
|
|
428
|
+
double C = 1.0 / double(bin_size);
|
|
429
|
+
for (s=0; s<N_entries; s++)
|
|
430
|
+
{
|
|
431
|
+
if (include_entry[s] == true)
|
|
432
|
+
{
|
|
433
|
+
//get_vcf_entry(s, vcf_line);
|
|
434
|
+
//e.reset(vcf_line);
|
|
435
|
+
//e.parse_basic_entry();
|
|
436
|
+
|
|
437
|
+
//CHROM = e.get_CHROM();
|
|
438
|
+
//POS = e.get_POS();
|
|
439
|
+
set_filepos(entry_file_locations[s]);
|
|
440
|
+
read_CHROM_and_POS_only(CHROM, POS);
|
|
441
|
+
idx = (unsigned int)(POS * C);
|
|
442
|
+
bins[CHROM][idx]++;
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
out << "CHROM\tBIN_START\tSNP_COUNT\tSNPS/KB" << endl;
|
|
447
|
+
double sum1=0.0, sum2=0.0;
|
|
448
|
+
int bin_tot;
|
|
449
|
+
C = 1000.0 / bin_size;
|
|
450
|
+
for (it=max_pos.begin(); it != max_pos.end(); ++it)
|
|
451
|
+
{
|
|
452
|
+
bool output = false;
|
|
453
|
+
CHROM = (*it).first;
|
|
454
|
+
sum2 += max_pos[CHROM];
|
|
455
|
+
for (s=0; s<bins[CHROM].size(); s++)
|
|
456
|
+
{
|
|
457
|
+
bin_tot = bins[CHROM][s];
|
|
458
|
+
sum1 += bin_tot;
|
|
459
|
+
if (bin_tot > 0)
|
|
460
|
+
output = true;
|
|
461
|
+
if (output == true)
|
|
462
|
+
out << CHROM << "\t" << s*bin_size << "\t" << bin_tot << "\t" << bin_tot * C << endl;
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
out.close();
|
|
466
|
+
|
|
467
|
+
double mean_SNP_density = sum1 / sum2 * 1000;
|
|
468
|
+
printLOG("Mean SNP density: " + dbl2str(mean_SNP_density, 5) + " SNPs / kb\n");
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
void vcf_file::output_missingness(const string &output_file_prefix)
|
|
472
|
+
{
|
|
473
|
+
// Output missingness by individual and site
|
|
474
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
475
|
+
error("Require Genotypes in VCF file in order to output Missingness Statistics.");
|
|
476
|
+
|
|
477
|
+
printLOG("Outputting Site and Individual Missingness\n");
|
|
478
|
+
string output1 = output_file_prefix + ".imiss";
|
|
479
|
+
ofstream out1(output1.c_str());
|
|
480
|
+
if (!out1.is_open())
|
|
481
|
+
error("Could not open Individual Missingness Output File: " + output1, 3);
|
|
482
|
+
|
|
483
|
+
string output2 = output_file_prefix + ".lmiss";
|
|
484
|
+
ofstream out2(output2.c_str());
|
|
485
|
+
if (!out2.is_open())
|
|
486
|
+
error("Could not open Site Missingness Output File: " + output2, 4);
|
|
487
|
+
|
|
488
|
+
out1 << "INDV\tN_DATA\tN_GENOTYPES_FILTERED\tN_MISS\tF_MISS" << endl;
|
|
489
|
+
unsigned int ui, s;
|
|
490
|
+
vector<unsigned int> indv_N_missing(N_indv, 0), indv_N_tot(N_indv, 0);
|
|
491
|
+
vector<unsigned int> indv_N_geno_filtered(N_indv, 0);
|
|
492
|
+
unsigned int site_N_missing, site_N_tot, site_N_geno_filtered;
|
|
493
|
+
pair<int, int> alleles;
|
|
494
|
+
string vcf_line;
|
|
495
|
+
vcf_entry e(N_indv);
|
|
496
|
+
|
|
497
|
+
out2 << "CHR\tPOS\tN_DATA\tN_GENOTYPE_FILTERED\tN_MISS\tF_MISS" << endl;
|
|
498
|
+
for (s=0; s<N_entries; s++)
|
|
499
|
+
{
|
|
500
|
+
if (include_entry[s] == false)
|
|
501
|
+
continue;
|
|
502
|
+
|
|
503
|
+
get_vcf_entry(s, vcf_line);
|
|
504
|
+
e.reset(vcf_line);
|
|
505
|
+
e.parse_basic_entry();
|
|
506
|
+
|
|
507
|
+
site_N_missing = 0;
|
|
508
|
+
site_N_tot = 0;
|
|
509
|
+
site_N_geno_filtered = 0;
|
|
510
|
+
for (ui=0; ui<N_indv; ui++)
|
|
511
|
+
{
|
|
512
|
+
if (include_indv[ui] == false)
|
|
513
|
+
continue;
|
|
514
|
+
if (include_genotype[s][ui] == false)
|
|
515
|
+
{
|
|
516
|
+
site_N_geno_filtered++;
|
|
517
|
+
indv_N_geno_filtered[ui]++;
|
|
518
|
+
continue;
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
e.parse_genotype_entry(ui, true);
|
|
522
|
+
e.get_indv_GENOTYPE_ids(ui, alleles);
|
|
523
|
+
if (alleles.first == -1)
|
|
524
|
+
{
|
|
525
|
+
site_N_missing++;
|
|
526
|
+
indv_N_missing[ui]++;
|
|
527
|
+
}
|
|
528
|
+
indv_N_tot[ui]++;
|
|
529
|
+
|
|
530
|
+
if (alleles.second == -1)
|
|
531
|
+
{
|
|
532
|
+
site_N_missing++;
|
|
533
|
+
}
|
|
534
|
+
site_N_tot+=2;
|
|
535
|
+
|
|
536
|
+
if ((alleles.second == -1) && (e.get_indv_PHASE(ui) == '|'))
|
|
537
|
+
{ // Phased missing genotypes indicate haploid genome
|
|
538
|
+
site_N_tot--;
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
out2 << e.get_CHROM() << "\t" << e.get_POS() << "\t" << site_N_tot << "\t" << site_N_geno_filtered << "\t";
|
|
542
|
+
out2 << site_N_missing << "\t" << double(site_N_missing) / double(site_N_tot) << endl;
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
for (ui=0; ui<N_indv; ui++)
|
|
546
|
+
{
|
|
547
|
+
if (include_indv[ui] == false)
|
|
548
|
+
continue;
|
|
549
|
+
out1 << indv[ui] << "\t" << indv_N_tot[ui] << "\t";
|
|
550
|
+
out1 << indv_N_geno_filtered[ui] << "\t" << indv_N_missing[ui] << "\t";
|
|
551
|
+
out1 << indv_N_missing[ui] / double(indv_N_tot[ui]) << endl;
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
out2.close();
|
|
555
|
+
out1.close();
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
void vcf_file::output_haplotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2)
|
|
559
|
+
{
|
|
560
|
+
// Output pairwise LD statistics, using traditional r^2. Requires phased haplotypes.
|
|
561
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
562
|
+
error("Require Genotypes in VCF file in order to output LD Statistics.");
|
|
563
|
+
|
|
564
|
+
unsigned int s, s2;
|
|
565
|
+
unsigned int ui;
|
|
566
|
+
|
|
567
|
+
printLOG("Outputting Pairwise LD (phased bi-allelic only)\n");
|
|
568
|
+
string output = output_file_prefix + ".hap.ld";
|
|
569
|
+
ofstream out(output.c_str());
|
|
570
|
+
if (!out.is_open())
|
|
571
|
+
error("Could not open LD Output File: " + output, 3);
|
|
572
|
+
|
|
573
|
+
out << "CHR\tPOS1\tPOS2\tN_CHR\tR^2\tD\tDprime" << endl;
|
|
574
|
+
|
|
575
|
+
//For D, D' computations
|
|
576
|
+
double D, Dmax, Dprime;
|
|
577
|
+
int x11, x12, x21, x22;
|
|
578
|
+
double p1, p2, q1, q2;
|
|
579
|
+
double rel_x11, rel_x12, rel_x21, rel_x22;
|
|
580
|
+
|
|
581
|
+
unsigned int chr_count;
|
|
582
|
+
double r2;
|
|
583
|
+
int sx, sy;
|
|
584
|
+
double X, X2, Y, Y2, XY;
|
|
585
|
+
double var1, var2, cov12;
|
|
586
|
+
pair<int,int> geno1, geno2;
|
|
587
|
+
string vcf_line, vcf_line2;
|
|
588
|
+
vcf_entry e(N_indv), e2(N_indv);
|
|
589
|
+
for (s=0; s<(N_entries-1); s++)
|
|
590
|
+
{
|
|
591
|
+
if (include_entry[s] == false)
|
|
592
|
+
continue;
|
|
593
|
+
|
|
594
|
+
get_vcf_entry(s, vcf_line);
|
|
595
|
+
e.reset(vcf_line);
|
|
596
|
+
e.parse_basic_entry(true);
|
|
597
|
+
|
|
598
|
+
if (e.get_N_alleles() != 2)
|
|
599
|
+
{
|
|
600
|
+
one_off_warning("\tLD: Only using biallelic SNPs.");
|
|
601
|
+
continue; // Isn't biallelic
|
|
602
|
+
}
|
|
603
|
+
|
|
604
|
+
for (s2 = s+1; s2<N_entries; s2++)
|
|
605
|
+
{
|
|
606
|
+
if (include_entry[s2] == false)
|
|
607
|
+
continue;
|
|
608
|
+
|
|
609
|
+
if (int(s2 - s) > snp_window_size)
|
|
610
|
+
{
|
|
611
|
+
s2 = N_entries; // SNPs sorted, so no need to go any further
|
|
612
|
+
continue;
|
|
613
|
+
}
|
|
614
|
+
|
|
615
|
+
get_vcf_entry(s2, vcf_line2);
|
|
616
|
+
e2.reset(vcf_line2);
|
|
617
|
+
e2.parse_basic_entry(true);
|
|
618
|
+
|
|
619
|
+
if (e.get_CHROM() != e2.get_CHROM())
|
|
620
|
+
{
|
|
621
|
+
s2 = N_entries; // No need to go any further (assuming SNPs are sorted)
|
|
622
|
+
continue;
|
|
623
|
+
}
|
|
624
|
+
|
|
625
|
+
if ((e2.get_POS() - e.get_POS()) > bp_window_size)
|
|
626
|
+
{
|
|
627
|
+
s2 = N_entries; // No need to go any further (assuming SNPs are sorted)
|
|
628
|
+
continue;
|
|
629
|
+
}
|
|
630
|
+
|
|
631
|
+
if (e2.get_N_alleles() != 2)
|
|
632
|
+
{
|
|
633
|
+
one_off_warning("\tLD: Only using biallelic SNPs.");
|
|
634
|
+
continue;
|
|
635
|
+
}
|
|
636
|
+
|
|
637
|
+
x11=0; x12=0; x21=0; x22=0;
|
|
638
|
+
|
|
639
|
+
X=0, X2=0; Y=0; Y2=0; XY=0;
|
|
640
|
+
chr_count = 0;
|
|
641
|
+
for (ui=0; ui<N_indv; ui++)
|
|
642
|
+
{
|
|
643
|
+
if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (include_genotype[s2][ui] == false))
|
|
644
|
+
continue;
|
|
645
|
+
|
|
646
|
+
e.parse_genotype_entry(ui, true);
|
|
647
|
+
e.get_indv_GENOTYPE_ids(ui, geno1);
|
|
648
|
+
|
|
649
|
+
e2.parse_genotype_entry(ui, true);
|
|
650
|
+
e2.get_indv_GENOTYPE_ids(ui, geno2);
|
|
651
|
+
|
|
652
|
+
if ((e.get_indv_ploidy(ui) != 2) || (e2.get_indv_ploidy(ui) != 2))
|
|
653
|
+
{
|
|
654
|
+
one_off_warning("\tLD: Only using diploid individuals.");
|
|
655
|
+
continue;
|
|
656
|
+
}
|
|
657
|
+
|
|
658
|
+
if ((e.get_indv_PHASE(ui) != '|') || (e2.get_indv_PHASE(ui) != '|'))
|
|
659
|
+
error("Require phased haplotypes for r^2 calculation (use --phased)\n");
|
|
660
|
+
|
|
661
|
+
for (unsigned int c=0; c<2; c++)
|
|
662
|
+
{
|
|
663
|
+
int allele1, allele2;
|
|
664
|
+
if (c==0)
|
|
665
|
+
{
|
|
666
|
+
allele1 = geno1.first;
|
|
667
|
+
allele2 = geno2.first;
|
|
668
|
+
}
|
|
669
|
+
else
|
|
670
|
+
{
|
|
671
|
+
allele1 = geno1.second;
|
|
672
|
+
allele2 = geno2.second;
|
|
673
|
+
}
|
|
674
|
+
|
|
675
|
+
if ((allele1 == -1) || (allele2 == -1))
|
|
676
|
+
continue;
|
|
677
|
+
|
|
678
|
+
if (allele1 == 0 && allele2 == 0){
|
|
679
|
+
x11++;
|
|
680
|
+
} else if (allele1 == 0 && allele2 != 0){
|
|
681
|
+
x12++;
|
|
682
|
+
} else if (allele1 != 0 && allele2 == 0){
|
|
683
|
+
x21++;
|
|
684
|
+
} else { // (allele1 !=0 && allele2 != 0)
|
|
685
|
+
x22++;
|
|
686
|
+
}
|
|
687
|
+
|
|
688
|
+
sx=0, sy=0;
|
|
689
|
+
if (allele1 == 0)
|
|
690
|
+
sx += 1;
|
|
691
|
+
|
|
692
|
+
if (allele2 == 0)
|
|
693
|
+
sy += 1;
|
|
694
|
+
|
|
695
|
+
X += sx; Y += sy;
|
|
696
|
+
XY += sx*sy;
|
|
697
|
+
sx *= sx; sy *= sy;
|
|
698
|
+
X2 += sx;
|
|
699
|
+
Y2 += sy;
|
|
700
|
+
|
|
701
|
+
chr_count++;
|
|
702
|
+
}
|
|
703
|
+
}
|
|
704
|
+
|
|
705
|
+
rel_x11 = 1.0*x11/chr_count;
|
|
706
|
+
rel_x12 = 1.0*x12/chr_count;
|
|
707
|
+
rel_x21 = 1.0*x21/chr_count;
|
|
708
|
+
rel_x22 = 1.0*x22/chr_count;
|
|
709
|
+
p1 = rel_x11 + rel_x12;
|
|
710
|
+
p2 = rel_x21 + rel_x22;
|
|
711
|
+
q1 = rel_x11 + rel_x21;
|
|
712
|
+
q2 = rel_x12 + rel_x22;
|
|
713
|
+
D = rel_x11 - p1*q1;
|
|
714
|
+
if (D < 0){
|
|
715
|
+
Dmax = min(p1*q1,p2*q2);
|
|
716
|
+
} else {
|
|
717
|
+
Dmax = min(p1*q2,p2*q1);
|
|
718
|
+
};
|
|
719
|
+
Dprime = D/Dmax;
|
|
720
|
+
|
|
721
|
+
X /= chr_count; X2 /= chr_count;
|
|
722
|
+
Y /= chr_count; Y2 /= chr_count;
|
|
723
|
+
XY /= chr_count;
|
|
724
|
+
|
|
725
|
+
var1 = X2 - X*X;
|
|
726
|
+
var2 = Y2 - Y*Y;
|
|
727
|
+
cov12 = XY - X*Y;
|
|
728
|
+
|
|
729
|
+
r2 = cov12 * cov12 / (var1 * var2);
|
|
730
|
+
|
|
731
|
+
if (min_r2 > 0)
|
|
732
|
+
if ((r2 < min_r2) | (r2 != r2))
|
|
733
|
+
continue;
|
|
734
|
+
|
|
735
|
+
out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_POS() << "\t" << chr_count << "\t" << r2 << "\t" << D << "\t" << Dprime << "\t" << endl;
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
out.close();
|
|
739
|
+
}
|
|
740
|
+
|
|
741
|
+
void vcf_file::output_genotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2)
|
|
742
|
+
{
|
|
743
|
+
// Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared
|
|
744
|
+
// correlation coefficient between genotypes numbered as 0, 1, 2.
|
|
745
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
746
|
+
error("Require Genotypes in VCF file in order to output LD Statistics.");
|
|
747
|
+
|
|
748
|
+
unsigned int s, s2;
|
|
749
|
+
unsigned int ui;
|
|
750
|
+
|
|
751
|
+
printLOG("Outputting Pairwise LD (bi-allelic only)\n");
|
|
752
|
+
string output = output_file_prefix + ".geno.ld";
|
|
753
|
+
ofstream out(output.c_str());
|
|
754
|
+
if (!out.is_open())
|
|
755
|
+
error("Could not open LD Output File: " + output, 3);
|
|
756
|
+
|
|
757
|
+
out << "CHR\tPOS1\tPOS2\tN_INDV\tR^2" << endl;
|
|
758
|
+
|
|
759
|
+
unsigned int indv_count;
|
|
760
|
+
double r2;
|
|
761
|
+
int sx, sy;
|
|
762
|
+
double X, X2, Y, Y2, XY;
|
|
763
|
+
double var1, var2, cov12;
|
|
764
|
+
pair<int,int> geno1, geno2;
|
|
765
|
+
string vcf_line, vcf_line2;
|
|
766
|
+
vcf_entry e(N_indv), e2(N_indv);
|
|
767
|
+
for (s=0; s<(N_entries-1); s++)
|
|
768
|
+
{
|
|
769
|
+
if (include_entry[s] == false)
|
|
770
|
+
continue;
|
|
771
|
+
|
|
772
|
+
get_vcf_entry(s, vcf_line);
|
|
773
|
+
e.reset(vcf_line);
|
|
774
|
+
e.parse_basic_entry(true);
|
|
775
|
+
|
|
776
|
+
if (e.get_N_alleles() != 2)
|
|
777
|
+
{
|
|
778
|
+
one_off_warning("\tgenoLD: Only using biallelic SNPs.");
|
|
779
|
+
continue; // Isn't biallelic
|
|
780
|
+
}
|
|
781
|
+
|
|
782
|
+
for (s2 = s+1; s2<N_entries; s2++)
|
|
783
|
+
{
|
|
784
|
+
if (include_entry[s2] == false)
|
|
785
|
+
continue;
|
|
786
|
+
|
|
787
|
+
if (int(s2 - s) > snp_window_size)
|
|
788
|
+
{
|
|
789
|
+
s2 = N_entries; // SNPs sorted, so no need to go any further
|
|
790
|
+
continue;
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
get_vcf_entry(s2, vcf_line2);
|
|
794
|
+
e2.reset(vcf_line2);
|
|
795
|
+
e2.parse_basic_entry(true);
|
|
796
|
+
|
|
797
|
+
if (e2.get_N_alleles() != 2)
|
|
798
|
+
{
|
|
799
|
+
one_off_warning("\tgenoLD: Only using biallelic SNPs.");
|
|
800
|
+
continue; // Isn't biallelic
|
|
801
|
+
}
|
|
802
|
+
|
|
803
|
+
if (e.get_CHROM() != e2.get_CHROM())
|
|
804
|
+
{
|
|
805
|
+
s2 = N_entries; // SNPs sorted, so no need to go any further
|
|
806
|
+
continue;
|
|
807
|
+
}
|
|
808
|
+
|
|
809
|
+
if ((e2.get_POS() - e.get_POS()) > bp_window_size)
|
|
810
|
+
{
|
|
811
|
+
s2 = N_entries; // SNPs sorted, so no need to go any further
|
|
812
|
+
continue;
|
|
813
|
+
}
|
|
814
|
+
|
|
815
|
+
X=0, X2=0; Y=0; Y2=0; XY=0;
|
|
816
|
+
indv_count = 0;
|
|
817
|
+
for (ui=0; ui<N_indv; ui++)
|
|
818
|
+
{
|
|
819
|
+
if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (include_genotype[s2][ui] == false))
|
|
820
|
+
continue;
|
|
821
|
+
|
|
822
|
+
e.parse_genotype_entry(ui, true);
|
|
823
|
+
e.get_indv_GENOTYPE_ids(ui, geno1);
|
|
824
|
+
|
|
825
|
+
e2.parse_genotype_entry(ui, true);
|
|
826
|
+
e2.get_indv_GENOTYPE_ids(ui, geno2);
|
|
827
|
+
|
|
828
|
+
if ((e.get_indv_ploidy(ui) != 2) || (e2.get_indv_ploidy(ui) != 2))
|
|
829
|
+
{
|
|
830
|
+
one_off_warning("\tgenoLD: Only using diploid individuals.");
|
|
831
|
+
continue;
|
|
832
|
+
}
|
|
833
|
+
|
|
834
|
+
if ((geno1.first == -1) || (geno1.second == -1))
|
|
835
|
+
continue;
|
|
836
|
+
|
|
837
|
+
if ((geno2.first == -1) || (geno2.second == -1))
|
|
838
|
+
continue;
|
|
839
|
+
|
|
840
|
+
sx=0, sy=0;
|
|
841
|
+
if (geno1.first == geno1.second)
|
|
842
|
+
{
|
|
843
|
+
if (geno1.first == 0)
|
|
844
|
+
{
|
|
845
|
+
sx = 2;
|
|
846
|
+
}
|
|
847
|
+
}
|
|
848
|
+
else
|
|
849
|
+
sx = 1;
|
|
850
|
+
|
|
851
|
+
if (geno2.first == geno2.second)
|
|
852
|
+
{
|
|
853
|
+
if (geno2.first == 0)
|
|
854
|
+
{
|
|
855
|
+
sy = 2;
|
|
856
|
+
}
|
|
857
|
+
}
|
|
858
|
+
else
|
|
859
|
+
sy = 1;
|
|
860
|
+
|
|
861
|
+
X += sx; Y += sy;
|
|
862
|
+
XY += sx*sy;
|
|
863
|
+
sx *= sx; sy *= sy;
|
|
864
|
+
X2 += sx; Y2 += sy;
|
|
865
|
+
|
|
866
|
+
indv_count++;
|
|
867
|
+
}
|
|
868
|
+
|
|
869
|
+
X /= indv_count; X2 /= indv_count;
|
|
870
|
+
Y /= indv_count; Y2 /= indv_count;
|
|
871
|
+
XY /= indv_count;
|
|
872
|
+
|
|
873
|
+
var1 = X2 - X*X;
|
|
874
|
+
var2 = Y2 - Y*Y;
|
|
875
|
+
cov12 = XY - X*Y;
|
|
876
|
+
|
|
877
|
+
r2 = cov12 * cov12 / (var1 * var2);
|
|
878
|
+
|
|
879
|
+
if (min_r2 > 0)
|
|
880
|
+
if ((r2 < min_r2) | (r2 != r2))
|
|
881
|
+
continue;
|
|
882
|
+
|
|
883
|
+
out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_POS() << "\t" << indv_count << "\t" << r2 << endl;
|
|
884
|
+
}
|
|
885
|
+
}
|
|
886
|
+
out.close();
|
|
887
|
+
}
|
|
888
|
+
|
|
889
|
+
// TODO - provide similar function for haplotype r2.
|
|
890
|
+
void vcf_file::output_interchromosomal_genotype_r2(const string &output_file_prefix, double min_r2)
|
|
891
|
+
{
|
|
892
|
+
// Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared
|
|
893
|
+
// correlation coefficient between genotypes numbered as 0, 1, 2.
|
|
894
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
895
|
+
error("Require Genotypes in VCF file in order to output LD Statistics.");
|
|
896
|
+
|
|
897
|
+
unsigned int s, s2;
|
|
898
|
+
unsigned int ui;
|
|
899
|
+
|
|
900
|
+
printLOG("Outputting Interchromosomal Pairwise LD (bi-allelic only)\n");
|
|
901
|
+
string output = output_file_prefix + ".interchrom.geno.ld";
|
|
902
|
+
ofstream out(output.c_str());
|
|
903
|
+
if (!out.is_open())
|
|
904
|
+
error("Could not open LD Output File: " + output, 3);
|
|
905
|
+
|
|
906
|
+
out << "CHR1\tPOS1\tCHR2\tPOS2\tN_INDV\tR^2" << endl;
|
|
907
|
+
|
|
908
|
+
unsigned int indv_count;
|
|
909
|
+
double r2;
|
|
910
|
+
int sx, sy;
|
|
911
|
+
double X, X2, Y, Y2, XY;
|
|
912
|
+
double var1, var2, cov12;
|
|
913
|
+
pair<int,int> geno1, geno2;
|
|
914
|
+
string vcf_line, vcf_line2;
|
|
915
|
+
vcf_entry e(N_indv), e2(N_indv);
|
|
916
|
+
for (s=0; s<(N_entries-1); s++)
|
|
917
|
+
{
|
|
918
|
+
if (include_entry[s] == false)
|
|
919
|
+
continue;
|
|
920
|
+
|
|
921
|
+
get_vcf_entry(s, vcf_line);
|
|
922
|
+
e.reset(vcf_line);
|
|
923
|
+
e.parse_basic_entry(true);
|
|
924
|
+
|
|
925
|
+
if (e.get_N_alleles() != 2)
|
|
926
|
+
{
|
|
927
|
+
one_off_warning("\tinterchromLD: Only using biallelic SNPs.");
|
|
928
|
+
continue; // Isn't biallelic
|
|
929
|
+
}
|
|
930
|
+
|
|
931
|
+
for (s2 = s+1; s2<N_entries; s2++)
|
|
932
|
+
{
|
|
933
|
+
if (include_entry[s2] == false)
|
|
934
|
+
continue;
|
|
935
|
+
|
|
936
|
+
get_vcf_entry(s2, vcf_line2);
|
|
937
|
+
e2.reset(vcf_line2);
|
|
938
|
+
e2.parse_basic_entry(true);
|
|
939
|
+
|
|
940
|
+
if (e2.get_N_alleles() != 2)
|
|
941
|
+
{
|
|
942
|
+
one_off_warning("\tinterchromLD: Only using biallelic SNPs.");
|
|
943
|
+
continue; // Isn't biallelic
|
|
944
|
+
}
|
|
945
|
+
|
|
946
|
+
if (e.get_CHROM() == e2.get_CHROM())
|
|
947
|
+
{
|
|
948
|
+
continue;
|
|
949
|
+
}
|
|
950
|
+
|
|
951
|
+
X=0, X2=0; Y=0; Y2=0; XY=0;
|
|
952
|
+
indv_count = 0;
|
|
953
|
+
for (ui=0; ui<N_indv; ui++)
|
|
954
|
+
{
|
|
955
|
+
if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (include_genotype[s2][ui] == false))
|
|
956
|
+
continue;
|
|
957
|
+
|
|
958
|
+
e.parse_genotype_entry(ui, true);
|
|
959
|
+
e.get_indv_GENOTYPE_ids(ui, geno1);
|
|
960
|
+
|
|
961
|
+
e2.parse_genotype_entry(ui, true);
|
|
962
|
+
e2.get_indv_GENOTYPE_ids(ui, geno2);
|
|
963
|
+
|
|
964
|
+
if ((e.get_indv_ploidy(ui) != 2) || (e2.get_indv_ploidy(ui) != 2))
|
|
965
|
+
{
|
|
966
|
+
one_off_warning("\tinterchromLD: Only using diploid individuals.");
|
|
967
|
+
continue;
|
|
968
|
+
}
|
|
969
|
+
|
|
970
|
+
if ((geno1.first == -1) || (geno1.second == -1))
|
|
971
|
+
continue;
|
|
972
|
+
|
|
973
|
+
if ((geno2.first == -1) || (geno2.second == -1))
|
|
974
|
+
continue;
|
|
975
|
+
|
|
976
|
+
sx=0, sy=0;
|
|
977
|
+
if (geno1.first == geno1.second)
|
|
978
|
+
{
|
|
979
|
+
if (geno1.first == 0)
|
|
980
|
+
{
|
|
981
|
+
sx = 2;
|
|
982
|
+
}
|
|
983
|
+
}
|
|
984
|
+
else
|
|
985
|
+
sx = 1;
|
|
986
|
+
|
|
987
|
+
if (geno2.first == geno2.second)
|
|
988
|
+
{
|
|
989
|
+
if (geno2.first == 0)
|
|
990
|
+
{
|
|
991
|
+
sy = 2;
|
|
992
|
+
}
|
|
993
|
+
}
|
|
994
|
+
else
|
|
995
|
+
sy = 1;
|
|
996
|
+
|
|
997
|
+
X += sx; Y += sy;
|
|
998
|
+
XY += sx*sy;
|
|
999
|
+
sx *= sx; sy *= sy;
|
|
1000
|
+
X2 += sx; Y2 += sy;
|
|
1001
|
+
|
|
1002
|
+
indv_count++;
|
|
1003
|
+
}
|
|
1004
|
+
|
|
1005
|
+
X /= indv_count; X2 /= indv_count;
|
|
1006
|
+
Y /= indv_count; Y2 /= indv_count;
|
|
1007
|
+
XY /= indv_count;
|
|
1008
|
+
|
|
1009
|
+
var1 = X2 - X*X;
|
|
1010
|
+
var2 = Y2 - Y*Y;
|
|
1011
|
+
cov12 = XY - X*Y;
|
|
1012
|
+
|
|
1013
|
+
r2 = cov12 * cov12 / (var1 * var2);
|
|
1014
|
+
|
|
1015
|
+
if (min_r2 > 0)
|
|
1016
|
+
if ((r2 < min_r2) | (r2 != r2))
|
|
1017
|
+
continue;
|
|
1018
|
+
|
|
1019
|
+
out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_CHROM() << "\t" << e2.get_POS() << "\t" << indv_count << "\t" << r2 << endl;
|
|
1020
|
+
}
|
|
1021
|
+
}
|
|
1022
|
+
out.close();
|
|
1023
|
+
}
|
|
1024
|
+
|
|
1025
|
+
void vcf_file::output_singletons(const string &output_file_prefix)
|
|
1026
|
+
{
|
|
1027
|
+
// Locate and output singletons (and private doubletons)
|
|
1028
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
1029
|
+
error("Require Genotypes in VCF file in order to output Singletons.");
|
|
1030
|
+
|
|
1031
|
+
printLOG("Outputting Singleton Locations\n");
|
|
1032
|
+
string output = output_file_prefix + ".singletons";
|
|
1033
|
+
ofstream out(output.c_str());
|
|
1034
|
+
if (!out.is_open())
|
|
1035
|
+
error("Could not open Singleton Output File: " + output, 3);
|
|
1036
|
+
|
|
1037
|
+
out << "CHROM\tPOS\tSINGLETON/DOUBLETON\tALLELE\tINDV" << endl;
|
|
1038
|
+
|
|
1039
|
+
unsigned int ui;
|
|
1040
|
+
int a;
|
|
1041
|
+
vector<int> allele_counts;
|
|
1042
|
+
unsigned int N_non_missing_chr;
|
|
1043
|
+
unsigned int N_alleles;
|
|
1044
|
+
pair<int, int> geno;
|
|
1045
|
+
string allele;
|
|
1046
|
+
string vcf_line;
|
|
1047
|
+
vcf_entry e(N_indv);
|
|
1048
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
1049
|
+
{
|
|
1050
|
+
if (include_entry[s] == false)
|
|
1051
|
+
continue;
|
|
1052
|
+
|
|
1053
|
+
get_vcf_entry(s, vcf_line);
|
|
1054
|
+
e.reset(vcf_line);
|
|
1055
|
+
e.parse_basic_entry(true);
|
|
1056
|
+
e.parse_genotype_entries(true);
|
|
1057
|
+
|
|
1058
|
+
e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
|
|
1059
|
+
N_alleles = e.get_N_alleles();
|
|
1060
|
+
|
|
1061
|
+
for (a=0; a<(signed)N_alleles; a++)
|
|
1062
|
+
{
|
|
1063
|
+
if (allele_counts[a] == 1)
|
|
1064
|
+
{ // Singleton
|
|
1065
|
+
for (ui=0; ui<N_indv; ui++)
|
|
1066
|
+
{
|
|
1067
|
+
if (include_indv[ui] == false)
|
|
1068
|
+
continue;
|
|
1069
|
+
e.get_indv_GENOTYPE_ids(ui, geno);
|
|
1070
|
+
if ((geno.first == a) || (geno.second == a))
|
|
1071
|
+
{
|
|
1072
|
+
e.get_allele(a, allele);
|
|
1073
|
+
out << e.get_CHROM() << "\t" << e.get_POS() << "\tS\t" << allele << "\t" << indv[ui] << endl;
|
|
1074
|
+
ui=N_indv;
|
|
1075
|
+
break;
|
|
1076
|
+
}
|
|
1077
|
+
}
|
|
1078
|
+
}
|
|
1079
|
+
else if (allele_counts[a] == 2)
|
|
1080
|
+
{ // Possible doubleton
|
|
1081
|
+
for (ui=0; ui<N_indv; ui++)
|
|
1082
|
+
{
|
|
1083
|
+
if (include_indv[ui] == false)
|
|
1084
|
+
continue;
|
|
1085
|
+
e.get_indv_GENOTYPE_ids(ui, geno);
|
|
1086
|
+
if ((geno.first == a) && (geno.second == a))
|
|
1087
|
+
{
|
|
1088
|
+
e.get_allele(a, allele);
|
|
1089
|
+
out << e.get_CHROM() << "\t" << e.get_POS() << "\tD\t" << allele << "\t" << indv[ui] << endl;
|
|
1090
|
+
ui=N_indv;
|
|
1091
|
+
break;
|
|
1092
|
+
}
|
|
1093
|
+
}
|
|
1094
|
+
}
|
|
1095
|
+
}
|
|
1096
|
+
}
|
|
1097
|
+
|
|
1098
|
+
out.close();
|
|
1099
|
+
}
|
|
1100
|
+
|
|
1101
|
+
void vcf_file::output_genotype_depth(const string &output_file_prefix)
|
|
1102
|
+
{
|
|
1103
|
+
// Output genotype depth in tab-delimited format.
|
|
1104
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
1105
|
+
error("Require Genotypes in VCF file in order to output Genotype Depth Statistics.");
|
|
1106
|
+
|
|
1107
|
+
printLOG("Outputting Depth for Each Genotype\n");
|
|
1108
|
+
string output = output_file_prefix + ".gdepth";
|
|
1109
|
+
ofstream out(output.c_str());
|
|
1110
|
+
if (!out.is_open())
|
|
1111
|
+
error("Could not open Genotype Depth Output File: " + output, 7);
|
|
1112
|
+
|
|
1113
|
+
out << "CHROM\tPOS";
|
|
1114
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
1115
|
+
{
|
|
1116
|
+
if (include_indv[ui] == false)
|
|
1117
|
+
continue;
|
|
1118
|
+
|
|
1119
|
+
out << "\t" << indv[ui];
|
|
1120
|
+
}
|
|
1121
|
+
out << endl;
|
|
1122
|
+
|
|
1123
|
+
string vcf_line;
|
|
1124
|
+
vcf_entry e(N_indv);
|
|
1125
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
1126
|
+
{
|
|
1127
|
+
if (include_entry[s] == false)
|
|
1128
|
+
continue;
|
|
1129
|
+
|
|
1130
|
+
get_vcf_entry(s, vcf_line);
|
|
1131
|
+
e.reset(vcf_line);
|
|
1132
|
+
e.parse_basic_entry();
|
|
1133
|
+
|
|
1134
|
+
out << e.get_CHROM() << "\t" << e.get_POS();
|
|
1135
|
+
|
|
1136
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
1137
|
+
{
|
|
1138
|
+
if (include_indv[ui] == false)
|
|
1139
|
+
continue;
|
|
1140
|
+
|
|
1141
|
+
if (include_genotype[s][ui] == true)
|
|
1142
|
+
{
|
|
1143
|
+
e.parse_genotype_entry(ui, false, false, true);
|
|
1144
|
+
out << "\t" << e.get_indv_DEPTH(ui);
|
|
1145
|
+
}
|
|
1146
|
+
else
|
|
1147
|
+
out << "\t-1";
|
|
1148
|
+
}
|
|
1149
|
+
out << endl;
|
|
1150
|
+
}
|
|
1151
|
+
out.close();
|
|
1152
|
+
}
|
|
1153
|
+
|
|
1154
|
+
void vcf_file::output_FILTER_summary(const string &output_file_prefix)
|
|
1155
|
+
{
|
|
1156
|
+
// Output a summary of sites in various FILTER categories.
|
|
1157
|
+
printLOG("Outputting Filter Summary (for bi-allelic loci only)\n");
|
|
1158
|
+
|
|
1159
|
+
map<string, unsigned int> model_to_idx;
|
|
1160
|
+
model_to_idx["AC"] = 0;
|
|
1161
|
+
model_to_idx["AG"] = 1;
|
|
1162
|
+
model_to_idx["AT"] = 2;
|
|
1163
|
+
model_to_idx["CG"] = 3;
|
|
1164
|
+
model_to_idx["CT"] = 4;
|
|
1165
|
+
model_to_idx["GT"] = 5;
|
|
1166
|
+
string FILTER;
|
|
1167
|
+
string vcf_line;
|
|
1168
|
+
vcf_entry e(N_indv);
|
|
1169
|
+
|
|
1170
|
+
map<string, pair<int, int> > FILTER_to_TsTv;
|
|
1171
|
+
map<string, int > FILTER_to_Nsites;
|
|
1172
|
+
map<string, int >::iterator FILTER_to_Nsites_it;
|
|
1173
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
1174
|
+
{
|
|
1175
|
+
if (include_entry[s] == false)
|
|
1176
|
+
continue;
|
|
1177
|
+
|
|
1178
|
+
get_vcf_entry(s, vcf_line);
|
|
1179
|
+
e.reset(vcf_line);
|
|
1180
|
+
e.parse_basic_entry(true, true);
|
|
1181
|
+
|
|
1182
|
+
string model = e.get_REF() + e.get_ALT_allele(0);
|
|
1183
|
+
sort(model.begin(), model.end());
|
|
1184
|
+
|
|
1185
|
+
FILTER = e.get_FILTER();
|
|
1186
|
+
FILTER_to_Nsites[FILTER]++;
|
|
1187
|
+
if (model_to_idx.find(model) != model_to_idx.end())
|
|
1188
|
+
{
|
|
1189
|
+
switch (model_to_idx[model])
|
|
1190
|
+
{
|
|
1191
|
+
case 1:
|
|
1192
|
+
case 4:
|
|
1193
|
+
FILTER_to_TsTv[FILTER].first++;
|
|
1194
|
+
break;
|
|
1195
|
+
case 0:
|
|
1196
|
+
case 2:
|
|
1197
|
+
case 3:
|
|
1198
|
+
case 5:
|
|
1199
|
+
FILTER_to_TsTv[FILTER].second++;
|
|
1200
|
+
break;
|
|
1201
|
+
default:
|
|
1202
|
+
// Don't count this snp towards Ts/Tv
|
|
1203
|
+
break;
|
|
1204
|
+
}
|
|
1205
|
+
}
|
|
1206
|
+
}
|
|
1207
|
+
|
|
1208
|
+
vector<pair<int, string > > count_to_FILTER;
|
|
1209
|
+
for ( FILTER_to_Nsites_it=FILTER_to_Nsites.begin() ; FILTER_to_Nsites_it != FILTER_to_Nsites.end(); ++FILTER_to_Nsites_it )
|
|
1210
|
+
{
|
|
1211
|
+
FILTER = (*FILTER_to_Nsites_it).first;
|
|
1212
|
+
int Nsites = (*FILTER_to_Nsites_it).second;
|
|
1213
|
+
|
|
1214
|
+
count_to_FILTER.push_back(make_pair(Nsites, FILTER));
|
|
1215
|
+
}
|
|
1216
|
+
|
|
1217
|
+
sort(count_to_FILTER.begin(), count_to_FILTER.end());
|
|
1218
|
+
|
|
1219
|
+
string output = output_file_prefix + ".FILTER.summary";
|
|
1220
|
+
ofstream out(output.c_str());
|
|
1221
|
+
if (!out.is_open())
|
|
1222
|
+
error("Could not open Filter Summary Output File: " + output, 7);
|
|
1223
|
+
|
|
1224
|
+
out << "FILTER\tN_SNPs\tN_Ts\tN_Tv\tTs/Tv" << endl;
|
|
1225
|
+
|
|
1226
|
+
for (int i=count_to_FILTER.size()-1; i > -1; i--)
|
|
1227
|
+
{
|
|
1228
|
+
FILTER = count_to_FILTER[i].second;
|
|
1229
|
+
int Ts = FILTER_to_TsTv[FILTER].first;
|
|
1230
|
+
int Tv = FILTER_to_TsTv[FILTER].second;
|
|
1231
|
+
int Nsites = FILTER_to_Nsites[FILTER];
|
|
1232
|
+
out << FILTER << "\t" << Nsites << "\t";
|
|
1233
|
+
out << Ts << "\t" << Tv << "\t" << double(Ts)/Tv << endl;
|
|
1234
|
+
}
|
|
1235
|
+
|
|
1236
|
+
out.close();
|
|
1237
|
+
}
|
|
1238
|
+
|
|
1239
|
+
void vcf_file::output_TsTv(const string &output_file_prefix, int bin_size)
|
|
1240
|
+
{
|
|
1241
|
+
// Output Ts/Tv ratios in bins of a given size.
|
|
1242
|
+
printLOG("Outputting Ts/Tv in bins of " + int2str(bin_size) + "bp\n");
|
|
1243
|
+
|
|
1244
|
+
map<string, unsigned int> model_to_idx;
|
|
1245
|
+
model_to_idx["AC"] = 0;
|
|
1246
|
+
model_to_idx["AG"] = 1;
|
|
1247
|
+
model_to_idx["AT"] = 2;
|
|
1248
|
+
model_to_idx["CG"] = 3;
|
|
1249
|
+
model_to_idx["CT"] = 4;
|
|
1250
|
+
model_to_idx["GT"] = 5;
|
|
1251
|
+
|
|
1252
|
+
map<string, int> max_pos;
|
|
1253
|
+
string vcf_line, CHROM;
|
|
1254
|
+
vcf_entry e(N_indv);
|
|
1255
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
1256
|
+
{
|
|
1257
|
+
if (include_entry[s] == true)
|
|
1258
|
+
{
|
|
1259
|
+
get_vcf_entry(s, vcf_line);
|
|
1260
|
+
e.reset(vcf_line);
|
|
1261
|
+
e.parse_basic_entry();
|
|
1262
|
+
|
|
1263
|
+
CHROM = e.get_CHROM();
|
|
1264
|
+
|
|
1265
|
+
if (max_pos.find(CHROM) != max_pos.end())
|
|
1266
|
+
{
|
|
1267
|
+
if (e.get_POS() > max_pos[CHROM])
|
|
1268
|
+
max_pos[CHROM] = e.get_POS();
|
|
1269
|
+
}
|
|
1270
|
+
else
|
|
1271
|
+
max_pos[CHROM] = e.get_POS();
|
|
1272
|
+
}
|
|
1273
|
+
}
|
|
1274
|
+
|
|
1275
|
+
map<string, int>::iterator it;
|
|
1276
|
+
|
|
1277
|
+
unsigned int N_bins;
|
|
1278
|
+
map<string, vector<int> > Ts_counts;
|
|
1279
|
+
map<string, vector<int> > Tv_counts;
|
|
1280
|
+
for (it=max_pos.begin(); it != max_pos.end(); ++it)
|
|
1281
|
+
{
|
|
1282
|
+
CHROM = (*it).first;
|
|
1283
|
+
N_bins = (unsigned int)((max_pos[CHROM] + bin_size) / double(bin_size));
|
|
1284
|
+
Ts_counts[CHROM].resize(N_bins, 0);
|
|
1285
|
+
Tv_counts[CHROM].resize(N_bins, 0);
|
|
1286
|
+
}
|
|
1287
|
+
|
|
1288
|
+
vector<unsigned int> model_counts(6,0);
|
|
1289
|
+
double C = 1.0 / double(bin_size);
|
|
1290
|
+
unsigned int idx;
|
|
1291
|
+
|
|
1292
|
+
string model;
|
|
1293
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
1294
|
+
{
|
|
1295
|
+
if (include_entry[s] == false)
|
|
1296
|
+
continue;
|
|
1297
|
+
|
|
1298
|
+
get_vcf_entry(s, vcf_line);
|
|
1299
|
+
e.reset(vcf_line);
|
|
1300
|
+
e.parse_basic_entry(true);
|
|
1301
|
+
|
|
1302
|
+
if (!e.is_biallelic_SNP())
|
|
1303
|
+
continue;
|
|
1304
|
+
|
|
1305
|
+
model = e.get_REF() + e.get_ALT_allele(0);
|
|
1306
|
+
sort(model.begin(), model.end());
|
|
1307
|
+
|
|
1308
|
+
CHROM = e.get_CHROM();
|
|
1309
|
+
idx = (unsigned int)(e.get_POS() * C);
|
|
1310
|
+
|
|
1311
|
+
if (model_to_idx.find(model) != model_to_idx.end())
|
|
1312
|
+
{
|
|
1313
|
+
model_counts[model_to_idx[model]]++;
|
|
1314
|
+
switch (model_to_idx[model])
|
|
1315
|
+
{
|
|
1316
|
+
case 1:
|
|
1317
|
+
case 4:
|
|
1318
|
+
Ts_counts[CHROM][idx]++;
|
|
1319
|
+
break;
|
|
1320
|
+
case 0:
|
|
1321
|
+
case 2:
|
|
1322
|
+
case 3:
|
|
1323
|
+
case 5:
|
|
1324
|
+
Tv_counts[CHROM][idx]++;
|
|
1325
|
+
break;
|
|
1326
|
+
default:
|
|
1327
|
+
error("Unknown idx\n");
|
|
1328
|
+
}
|
|
1329
|
+
}
|
|
1330
|
+
else
|
|
1331
|
+
warning("Unknown model type. Not a SNP? " + CHROM + ":" + int2str(e.get_POS()) +"\n");
|
|
1332
|
+
}
|
|
1333
|
+
|
|
1334
|
+
string output = output_file_prefix + ".TsTv";
|
|
1335
|
+
ofstream out(output.c_str());
|
|
1336
|
+
if (!out.is_open())
|
|
1337
|
+
error("Could not open TsTv Output File: " + output, 7);
|
|
1338
|
+
|
|
1339
|
+
out << "CHROM\tBinStart\tSNP_count\tTs/Tv" << endl;
|
|
1340
|
+
double ratio;
|
|
1341
|
+
for (it=max_pos.begin(); it != max_pos.end(); ++it)
|
|
1342
|
+
{
|
|
1343
|
+
CHROM = (*it).first;
|
|
1344
|
+
for (unsigned int s=0; s<Ts_counts[CHROM].size(); s++)
|
|
1345
|
+
{
|
|
1346
|
+
ratio = 0.0;
|
|
1347
|
+
if (Tv_counts[CHROM][s] != 0)
|
|
1348
|
+
ratio = double(Ts_counts[CHROM][s]) / Tv_counts[CHROM][s];
|
|
1349
|
+
out << CHROM << "\t" << s*bin_size << "\t" << Ts_counts[CHROM][s]+Tv_counts[CHROM][s] << "\t" << ratio << endl;
|
|
1350
|
+
}
|
|
1351
|
+
}
|
|
1352
|
+
out.close();
|
|
1353
|
+
|
|
1354
|
+
output = output_file_prefix + ".TsTv.summary";
|
|
1355
|
+
out.open(output.c_str());
|
|
1356
|
+
if (!out.is_open())
|
|
1357
|
+
error("Could not open TsTv Summary Output File: " + output, 7);
|
|
1358
|
+
|
|
1359
|
+
out << "MODEL\tCOUNT" << endl;
|
|
1360
|
+
out << "AC\t" << model_counts[0] << endl;
|
|
1361
|
+
out << "AG\t" << model_counts[1] << endl;
|
|
1362
|
+
out << "AT\t" << model_counts[2] << endl;
|
|
1363
|
+
out << "CG\t" << model_counts[3] << endl;
|
|
1364
|
+
out << "CT\t" << model_counts[4] << endl;
|
|
1365
|
+
out << "GT\t" << model_counts[5] << endl;
|
|
1366
|
+
unsigned int Ts = model_counts[1] + model_counts[4];
|
|
1367
|
+
unsigned int Tv = model_counts[0] + model_counts[2] + model_counts[3] + model_counts[5];
|
|
1368
|
+
out << "Ts\t" << Ts << endl;
|
|
1369
|
+
out << "Tv\t" << Tv << endl;
|
|
1370
|
+
|
|
1371
|
+
printLOG("Ts/Tv ratio: " + dbl2str(double(Ts)/Tv, 4) + "\n");
|
|
1372
|
+
|
|
1373
|
+
out.close();
|
|
1374
|
+
}
|
|
1375
|
+
|
|
1376
|
+
void vcf_file::output_TsTv_by_count(const string &output_file_prefix)
|
|
1377
|
+
{
|
|
1378
|
+
// Output Ts/Tv ratios in bins of a given size.
|
|
1379
|
+
printLOG("Outputting Ts/Tv by Alternative Allele Count\n");
|
|
1380
|
+
vector<unsigned int> Ts_counts, Tv_counts;
|
|
1381
|
+
unsigned int N_kept_indv = N_kept_individuals();
|
|
1382
|
+
Ts_counts.resize(2*N_kept_indv);
|
|
1383
|
+
Tv_counts.resize(2*N_kept_indv);
|
|
1384
|
+
|
|
1385
|
+
string vcf_line, model;
|
|
1386
|
+
vcf_entry e(N_indv);
|
|
1387
|
+
map<string, unsigned int> model_to_Ts_or_Tv;
|
|
1388
|
+
model_to_Ts_or_Tv["AC"] = 1;
|
|
1389
|
+
model_to_Ts_or_Tv["CA"] = 1;
|
|
1390
|
+
model_to_Ts_or_Tv["AG"] = 0; // Ts
|
|
1391
|
+
model_to_Ts_or_Tv["GA"] = 0; // Ts
|
|
1392
|
+
model_to_Ts_or_Tv["AT"] = 1;
|
|
1393
|
+
model_to_Ts_or_Tv["TA"] = 1;
|
|
1394
|
+
model_to_Ts_or_Tv["CG"] = 1;
|
|
1395
|
+
model_to_Ts_or_Tv["GC"] = 1;
|
|
1396
|
+
model_to_Ts_or_Tv["CT"] = 0; // Ts
|
|
1397
|
+
model_to_Ts_or_Tv["TC"] = 0; // Ts
|
|
1398
|
+
model_to_Ts_or_Tv["GT"] = 1;
|
|
1399
|
+
model_to_Ts_or_Tv["TG"] = 1;
|
|
1400
|
+
unsigned int idx;
|
|
1401
|
+
vector<int> allele_counts;
|
|
1402
|
+
unsigned int allele_count;
|
|
1403
|
+
unsigned int N_included_indv;
|
|
1404
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
1405
|
+
{
|
|
1406
|
+
if (include_entry[s] == true)
|
|
1407
|
+
{
|
|
1408
|
+
get_vcf_entry(s, vcf_line);
|
|
1409
|
+
e.reset(vcf_line);
|
|
1410
|
+
e.parse_basic_entry(true);
|
|
1411
|
+
|
|
1412
|
+
if (!e.is_biallelic_SNP())
|
|
1413
|
+
continue;
|
|
1414
|
+
|
|
1415
|
+
e.parse_genotype_entries(true);
|
|
1416
|
+
e.get_allele_counts(allele_counts, N_included_indv, include_indv, include_genotype[s]);
|
|
1417
|
+
allele_count = allele_counts[1];
|
|
1418
|
+
|
|
1419
|
+
model = e.get_REF() + e.get_ALT_allele(0);
|
|
1420
|
+
if (model_to_Ts_or_Tv.find(model) != model_to_Ts_or_Tv.end())
|
|
1421
|
+
{
|
|
1422
|
+
idx = model_to_Ts_or_Tv[model];
|
|
1423
|
+
if (idx == 0) // Ts
|
|
1424
|
+
Ts_counts[allele_count]++;
|
|
1425
|
+
else if (idx == 1) // Tv;
|
|
1426
|
+
Tv_counts[allele_count]++;
|
|
1427
|
+
else
|
|
1428
|
+
error("Unknown model type\n");
|
|
1429
|
+
}
|
|
1430
|
+
else
|
|
1431
|
+
warning("Unknown model type. Not a SNP? " + e.get_CHROM() + ":" + int2str(e.get_POS()) +"\n");
|
|
1432
|
+
}
|
|
1433
|
+
}
|
|
1434
|
+
|
|
1435
|
+
string output = output_file_prefix + ".TsTv.count";
|
|
1436
|
+
ofstream out(output.c_str());
|
|
1437
|
+
if (!out.is_open())
|
|
1438
|
+
error("Could not open TsTv by Count Output File: " + output, 7);
|
|
1439
|
+
|
|
1440
|
+
double ratio;
|
|
1441
|
+
out << "ALT_ALLELE_COUNT\tN_Ts\tN_Tv\tTs/Tv" << endl;
|
|
1442
|
+
for (unsigned int ui=0; ui<2*N_kept_indv; ui++)
|
|
1443
|
+
{
|
|
1444
|
+
ratio = double(Ts_counts[ui]) / Tv_counts[ui];
|
|
1445
|
+
out << ui << "\t" << Ts_counts[ui] << "\t" << Tv_counts[ui] << "\t" << ratio << endl;
|
|
1446
|
+
}
|
|
1447
|
+
out.close();
|
|
1448
|
+
}
|
|
1449
|
+
|
|
1450
|
+
void vcf_file::output_TsTv_by_quality(const string &output_file_prefix)
|
|
1451
|
+
{
|
|
1452
|
+
// Output Ts/Tv ratios in bins of a given size.
|
|
1453
|
+
printLOG("Outputting Ts/Tv By Quality\n");
|
|
1454
|
+
map<double, pair<unsigned int, unsigned int> > TsTv_counts;
|
|
1455
|
+
double max_qual = -numeric_limits<double>::max(), min_qual=numeric_limits<double>::max();
|
|
1456
|
+
|
|
1457
|
+
string vcf_line, model;
|
|
1458
|
+
vcf_entry e(N_indv);
|
|
1459
|
+
map<string, unsigned int> model_to_Ts_or_Tv;
|
|
1460
|
+
model_to_Ts_or_Tv["AC"] = 1;
|
|
1461
|
+
model_to_Ts_or_Tv["CA"] = 1;
|
|
1462
|
+
model_to_Ts_or_Tv["AG"] = 0; // Ts
|
|
1463
|
+
model_to_Ts_or_Tv["GA"] = 0; // Ts
|
|
1464
|
+
model_to_Ts_or_Tv["AT"] = 1;
|
|
1465
|
+
model_to_Ts_or_Tv["TA"] = 1;
|
|
1466
|
+
model_to_Ts_or_Tv["CG"] = 1;
|
|
1467
|
+
model_to_Ts_or_Tv["GC"] = 1;
|
|
1468
|
+
model_to_Ts_or_Tv["CT"] = 0; // Ts
|
|
1469
|
+
model_to_Ts_or_Tv["TC"] = 0; // Ts
|
|
1470
|
+
model_to_Ts_or_Tv["GT"] = 1;
|
|
1471
|
+
model_to_Ts_or_Tv["TG"] = 1;
|
|
1472
|
+
unsigned int idx;
|
|
1473
|
+
double QUAL;
|
|
1474
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
1475
|
+
{
|
|
1476
|
+
if (include_entry[s] == true)
|
|
1477
|
+
{
|
|
1478
|
+
get_vcf_entry(s, vcf_line);
|
|
1479
|
+
e.reset(vcf_line);
|
|
1480
|
+
e.parse_basic_entry(true);
|
|
1481
|
+
|
|
1482
|
+
if (!e.is_biallelic_SNP())
|
|
1483
|
+
continue;
|
|
1484
|
+
|
|
1485
|
+
QUAL = e.get_QUAL();
|
|
1486
|
+
if (QUAL > max_qual)
|
|
1487
|
+
max_qual = QUAL;
|
|
1488
|
+
if (QUAL < min_qual)
|
|
1489
|
+
min_qual = QUAL;
|
|
1490
|
+
|
|
1491
|
+
model = e.get_REF() + e.get_ALT_allele(0);
|
|
1492
|
+
if (model_to_Ts_or_Tv.find(model) != model_to_Ts_or_Tv.end())
|
|
1493
|
+
{
|
|
1494
|
+
idx = model_to_Ts_or_Tv[model];
|
|
1495
|
+
if (idx == 0) // Ts
|
|
1496
|
+
{
|
|
1497
|
+
TsTv_counts[QUAL].first++;
|
|
1498
|
+
}
|
|
1499
|
+
else if (idx == 1) // Tv;
|
|
1500
|
+
TsTv_counts[QUAL].second++;
|
|
1501
|
+
else
|
|
1502
|
+
error("Unknown model type\n");
|
|
1503
|
+
}
|
|
1504
|
+
else
|
|
1505
|
+
warning("Unknown model type. Not a SNP? " + e.get_CHROM() + ":" + int2str(e.get_POS()) +"\n");
|
|
1506
|
+
}
|
|
1507
|
+
}
|
|
1508
|
+
|
|
1509
|
+
string output = output_file_prefix + ".TsTv.qual";
|
|
1510
|
+
ofstream out(output.c_str());
|
|
1511
|
+
if (!out.is_open())
|
|
1512
|
+
error("Could not open TsTv by Count Output File: " + output, 7);
|
|
1513
|
+
|
|
1514
|
+
out << "QUAL_THRESHOLD";
|
|
1515
|
+
out << "\tN_Ts_LT_QUAL_THRESHOLD\tN_Tv_LT_QUAL_THRESHOLD\tTs/Tv_LT_QUAL_THRESHOLD";
|
|
1516
|
+
out << "\tN_Ts_GT_QUAL_THRESHOLD\tN_Tv_GT_QUAL_THRESHOLD\tTs/Tv_GT_QUAL_THRESHOLD" << endl;
|
|
1517
|
+
|
|
1518
|
+
unsigned int N_TsTv = TsTv_counts.size();
|
|
1519
|
+
|
|
1520
|
+
vector<double> Ts_sum_below(N_TsTv+1, 0.0), Tv_sum_below(N_TsTv+1, 0.0);
|
|
1521
|
+
vector<double> QUAL_vector(N_TsTv+1, 0.0);
|
|
1522
|
+
QUAL_vector[0] = min_qual;
|
|
1523
|
+
QUAL_vector[N_TsTv] = max_qual;
|
|
1524
|
+
idx = 1;
|
|
1525
|
+
for (map<double, pair<unsigned int, unsigned int> >::iterator it=TsTv_counts.begin(); it != TsTv_counts.end(); ++it)
|
|
1526
|
+
{
|
|
1527
|
+
QUAL = (it->first);
|
|
1528
|
+
double Ts = (it->second).first;
|
|
1529
|
+
double Tv = (it->second).second;
|
|
1530
|
+
Ts_sum_below[idx] = Ts_sum_below[idx-1]+Ts;
|
|
1531
|
+
Tv_sum_below[idx] = Tv_sum_below[idx-1]+Tv;
|
|
1532
|
+
QUAL_vector[idx-1] = QUAL;
|
|
1533
|
+
idx++;
|
|
1534
|
+
}
|
|
1535
|
+
QUAL_vector[N_TsTv] = max_qual;
|
|
1536
|
+
|
|
1537
|
+
vector<double> Ts_sum_above(N_TsTv+1, 0.0), Tv_sum_above(N_TsTv+1, 0.0);
|
|
1538
|
+
idx = N_TsTv;
|
|
1539
|
+
for (map<double, pair<unsigned int, unsigned int> >::reverse_iterator it=TsTv_counts.rbegin(); it != TsTv_counts.rend(); ++it)
|
|
1540
|
+
{
|
|
1541
|
+
QUAL = (it->first);
|
|
1542
|
+
double Ts = (it->second).first;
|
|
1543
|
+
double Tv = (it->second).second;
|
|
1544
|
+
Ts_sum_above[idx] = Ts_sum_above[idx+1]+Ts;
|
|
1545
|
+
Tv_sum_above[idx] = Tv_sum_above[idx+1]+Tv;
|
|
1546
|
+
idx--;
|
|
1547
|
+
}
|
|
1548
|
+
|
|
1549
|
+
double Ts_sum, Tv_sum, ratio;
|
|
1550
|
+
for (unsigned int ui=1; ui<(N_TsTv+1); ui++)
|
|
1551
|
+
{
|
|
1552
|
+
QUAL = QUAL_vector[ui-1];
|
|
1553
|
+
out << QUAL;
|
|
1554
|
+
Ts_sum = Ts_sum_below[ui-1]; Tv_sum = Tv_sum_below[ui-1];
|
|
1555
|
+
ratio = Ts_sum / Tv_sum;
|
|
1556
|
+
out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio;
|
|
1557
|
+
Ts_sum = Ts_sum_above[ui+1]; Tv_sum = Tv_sum_above[ui+1];
|
|
1558
|
+
ratio = Ts_sum / Tv_sum;
|
|
1559
|
+
out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio;
|
|
1560
|
+
out << endl;
|
|
1561
|
+
}
|
|
1562
|
+
out.close();
|
|
1563
|
+
}
|
|
1564
|
+
|
|
1565
|
+
void vcf_file::output_site_quality(const string &output_file_prefix)
|
|
1566
|
+
{
|
|
1567
|
+
// Output per-site quality information.
|
|
1568
|
+
printLOG("Outputting Quality for Each Site\n");
|
|
1569
|
+
string output = output_file_prefix + ".lqual";
|
|
1570
|
+
|
|
1571
|
+
ofstream out(output.c_str());
|
|
1572
|
+
if (!out.is_open())
|
|
1573
|
+
error("Could not open Site Depth Output File: " + output, 7);
|
|
1574
|
+
|
|
1575
|
+
out << "CHROM\tPOS\tQUAL" << endl;
|
|
1576
|
+
|
|
1577
|
+
string vcf_line;
|
|
1578
|
+
vcf_entry e(N_indv);
|
|
1579
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
1580
|
+
{
|
|
1581
|
+
if (include_entry[s] == false)
|
|
1582
|
+
continue;
|
|
1583
|
+
|
|
1584
|
+
get_vcf_entry(s, vcf_line);
|
|
1585
|
+
e.reset(vcf_line);
|
|
1586
|
+
e.parse_basic_entry();
|
|
1587
|
+
|
|
1588
|
+
out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e.get_QUAL() << endl;
|
|
1589
|
+
}
|
|
1590
|
+
out.close();
|
|
1591
|
+
}
|
|
1592
|
+
|
|
1593
|
+
void vcf_file::output_site_depth(const string &output_file_prefix, bool output_mean)
|
|
1594
|
+
{
|
|
1595
|
+
// Output per-site depth information
|
|
1596
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
1597
|
+
error("Require Genotypes in VCF file in order to output Site Depth Statistics.");
|
|
1598
|
+
|
|
1599
|
+
printLOG("Outputting Depth for Each Site\n");
|
|
1600
|
+
string output = output_file_prefix + ".ldepth";
|
|
1601
|
+
if (output_mean)
|
|
1602
|
+
output += ".mean";
|
|
1603
|
+
ofstream out(output.c_str());
|
|
1604
|
+
if (!out.is_open())
|
|
1605
|
+
error("Could not open Site Depth Output File: " + output, 7);
|
|
1606
|
+
|
|
1607
|
+
out << "CHROM\tPOS\t";
|
|
1608
|
+
if (output_mean)
|
|
1609
|
+
out << "MEAN_DEPTH\tVAR_DEPTH" << endl;
|
|
1610
|
+
else
|
|
1611
|
+
out << "SUM_DEPTH\tSUMSQ_DEPTH" << endl;
|
|
1612
|
+
|
|
1613
|
+
int depth;
|
|
1614
|
+
string vcf_line;
|
|
1615
|
+
vcf_entry e(N_indv);
|
|
1616
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
1617
|
+
{
|
|
1618
|
+
if (include_entry[s] == false)
|
|
1619
|
+
continue;
|
|
1620
|
+
|
|
1621
|
+
get_vcf_entry(s, vcf_line);
|
|
1622
|
+
e.reset(vcf_line);
|
|
1623
|
+
e.parse_basic_entry();
|
|
1624
|
+
|
|
1625
|
+
out << e.get_CHROM() << "\t" << e.get_POS() << "\t";
|
|
1626
|
+
|
|
1627
|
+
unsigned int sum=0;
|
|
1628
|
+
unsigned int sumsq=0;
|
|
1629
|
+
unsigned int n=0;
|
|
1630
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
1631
|
+
{
|
|
1632
|
+
if (include_indv[ui] == false)
|
|
1633
|
+
continue;
|
|
1634
|
+
if (include_genotype[s][ui] == false)
|
|
1635
|
+
continue;
|
|
1636
|
+
|
|
1637
|
+
e.parse_genotype_entry(ui, false, false, true);
|
|
1638
|
+
depth = e.get_indv_DEPTH(ui);
|
|
1639
|
+
if (depth >= 0)
|
|
1640
|
+
{
|
|
1641
|
+
sum += depth;
|
|
1642
|
+
sumsq += (depth*depth);
|
|
1643
|
+
n++;
|
|
1644
|
+
}
|
|
1645
|
+
}
|
|
1646
|
+
|
|
1647
|
+
if (output_mean)
|
|
1648
|
+
{
|
|
1649
|
+
double mean = double(sum) / n;
|
|
1650
|
+
double var = ((double(sumsq) / n) - (mean*mean)) * double(n) / double(n-1);
|
|
1651
|
+
out << mean << "\t" << var << endl;
|
|
1652
|
+
}
|
|
1653
|
+
else
|
|
1654
|
+
out << sum << "\t" << sumsq << endl;
|
|
1655
|
+
}
|
|
1656
|
+
out.close();
|
|
1657
|
+
}
|
|
1658
|
+
|
|
1659
|
+
void vcf_file::output_fst(const string &output_file_prefix, vcf_file &vcf_fst)
|
|
1660
|
+
{
|
|
1661
|
+
// Calculate, and output, Fst using the formula outlined in HapMap I
|
|
1662
|
+
// Namely:
|
|
1663
|
+
// Fst = 1 - (Pi_within / Pi_combined)
|
|
1664
|
+
// where
|
|
1665
|
+
// Pi_within = sum_j(nchoosek(n_j,2) * sum_i(2*n_ij * x_ij * (1-x_ij) / (n_ij -1))) / sum_j(nchoosek(n_j,2))
|
|
1666
|
+
// and
|
|
1667
|
+
// Pi_between = sum_i(2*n_i*x_i*(1-x_i) / (n_i - 1))
|
|
1668
|
+
// where j is the population index, and i is the SNP index
|
|
1669
|
+
printLOG("Outputting Fst estimates (for bi-allelic only)\n");
|
|
1670
|
+
|
|
1671
|
+
string output = output_file_prefix + ".fst";
|
|
1672
|
+
ofstream out(output.c_str());
|
|
1673
|
+
if (!out.is_open())
|
|
1674
|
+
error("Could not open Fst Output File: " + output, 7);
|
|
1675
|
+
|
|
1676
|
+
out << "CHROM\tPOS\tFST" << endl;
|
|
1677
|
+
|
|
1678
|
+
map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
|
|
1679
|
+
map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
|
|
1680
|
+
|
|
1681
|
+
return_site_union(vcf_fst, CHROMPOS_to_filepos_pair);
|
|
1682
|
+
|
|
1683
|
+
string vcf_line;
|
|
1684
|
+
|
|
1685
|
+
int n_1, n_2, n_1_choose_2 = 0, n_2_choose_2=0;
|
|
1686
|
+
int last_n_1=-1, last_n_2=-1;
|
|
1687
|
+
|
|
1688
|
+
unsigned int n_i1, n_i2, n_iT;
|
|
1689
|
+
int N_alleles1, N_alleles2;
|
|
1690
|
+
vector<int> allele_counts1, allele_counts2;
|
|
1691
|
+
double x_i1, x_i2, x_iT;
|
|
1692
|
+
int POS;
|
|
1693
|
+
int s1, s2;
|
|
1694
|
+
|
|
1695
|
+
double tmp1, tmp2, tmpT;
|
|
1696
|
+
double sum1=0.0, sum2=0.0, sumT=0.0;
|
|
1697
|
+
double Fst;
|
|
1698
|
+
string CHROM;
|
|
1699
|
+
|
|
1700
|
+
unsigned int N_intersecting_sites = 0;
|
|
1701
|
+
for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
|
|
1702
|
+
{
|
|
1703
|
+
s1 = CHROMPOS_to_filepos_pair_it->second.first;
|
|
1704
|
+
s2 = CHROMPOS_to_filepos_pair_it->second.second;
|
|
1705
|
+
|
|
1706
|
+
if ((s1 == -1) || (s2 == -1))
|
|
1707
|
+
continue;
|
|
1708
|
+
|
|
1709
|
+
CHROM = CHROMPOS_to_filepos_pair_it->first.first;
|
|
1710
|
+
POS = CHROMPOS_to_filepos_pair_it->first.second;
|
|
1711
|
+
|
|
1712
|
+
get_vcf_entry(s1, vcf_line);
|
|
1713
|
+
vcf_entry e1(N_indv, vcf_line);
|
|
1714
|
+
vcf_fst.get_vcf_entry(s2, vcf_line);
|
|
1715
|
+
vcf_entry e2(vcf_fst.N_indv, vcf_line);
|
|
1716
|
+
|
|
1717
|
+
e1.parse_basic_entry(true);
|
|
1718
|
+
e2.parse_basic_entry(true);
|
|
1719
|
+
|
|
1720
|
+
// Check sites have same alternative alleles
|
|
1721
|
+
N_alleles1 = e1.get_N_alleles();
|
|
1722
|
+
N_alleles2 = e2.get_N_alleles();
|
|
1723
|
+
|
|
1724
|
+
if ((N_alleles1 != 2) || (N_alleles2 != 2))
|
|
1725
|
+
{
|
|
1726
|
+
one_off_warning("\tFst: Only using biallelic SNPs.");
|
|
1727
|
+
continue;
|
|
1728
|
+
}
|
|
1729
|
+
|
|
1730
|
+
if ((N_alleles1 == 2) && (N_alleles2 == 2))
|
|
1731
|
+
if (e1.get_ALT_allele(0) != e2.get_ALT_allele(0))
|
|
1732
|
+
{
|
|
1733
|
+
one_off_warning("\tFst: Only using sites with matching reference alleles.");
|
|
1734
|
+
continue;
|
|
1735
|
+
}
|
|
1736
|
+
|
|
1737
|
+
e1.parse_genotype_entries(true);
|
|
1738
|
+
e2.parse_genotype_entries(true);
|
|
1739
|
+
|
|
1740
|
+
// Calculate allele frequencies
|
|
1741
|
+
e1.get_allele_counts(allele_counts1, n_i1, include_indv, include_genotype[s1]);
|
|
1742
|
+
e2.get_allele_counts(allele_counts2, n_i2, vcf_fst.include_indv, vcf_fst.include_genotype[s2]);
|
|
1743
|
+
|
|
1744
|
+
if ((n_i1 == 0) || (n_i2 == 0))
|
|
1745
|
+
continue;
|
|
1746
|
+
|
|
1747
|
+
n_1 = e1.get_N_chr(include_indv, include_genotype[s1]);
|
|
1748
|
+
n_2 = e2.get_N_chr(vcf_fst.include_indv, vcf_fst.include_genotype[s2]);
|
|
1749
|
+
|
|
1750
|
+
if (last_n_1 != -1)
|
|
1751
|
+
{
|
|
1752
|
+
if ((n_1 != last_n_1) || (n_2 != last_n_2))
|
|
1753
|
+
{
|
|
1754
|
+
error("Cannot mix sites with different ploidy. Are you including sex-chromosomes?\n"+CHROM+":"+int2str(POS)+"\n");
|
|
1755
|
+
}
|
|
1756
|
+
}
|
|
1757
|
+
else
|
|
1758
|
+
{
|
|
1759
|
+
last_n_1 = n_1;
|
|
1760
|
+
last_n_2 = n_2;
|
|
1761
|
+
}
|
|
1762
|
+
|
|
1763
|
+
n_1_choose_2 = n_1 * (n_1 - 1) / 2;
|
|
1764
|
+
n_2_choose_2 = n_2 * (n_2 - 1) / 2;
|
|
1765
|
+
|
|
1766
|
+
N_intersecting_sites++;
|
|
1767
|
+
|
|
1768
|
+
x_i1 = allele_counts1[0] / double(n_i1);
|
|
1769
|
+
x_i2 = allele_counts2[0] / double(n_i2);
|
|
1770
|
+
n_iT = (n_i1 + n_i2);
|
|
1771
|
+
x_iT = (allele_counts1[0] + allele_counts2[0]) / double(n_iT);
|
|
1772
|
+
|
|
1773
|
+
tmp1 = 2 * (n_i1 / (n_i1 - 1.0)) * x_i1 * (1-x_i1);
|
|
1774
|
+
tmp2 = 2 * (n_i2 / (n_i2 - 1.0)) * x_i2 * (1-x_i2);
|
|
1775
|
+
tmpT = 2 * (n_iT / (n_iT - 1.0)) * x_iT * (1-x_iT);
|
|
1776
|
+
|
|
1777
|
+
Fst = 1.0 - (((n_1_choose_2 * tmp1) + (n_2_choose_2 * tmp2)) / (n_1_choose_2 + n_2_choose_2) / tmpT);
|
|
1778
|
+
|
|
1779
|
+
out << CHROM << "\t" << POS << "\t" << Fst << endl;
|
|
1780
|
+
|
|
1781
|
+
sum1 += tmp1;
|
|
1782
|
+
sum2 += tmp2;
|
|
1783
|
+
sumT += tmpT;
|
|
1784
|
+
|
|
1785
|
+
last_n_1 = n_1; last_n_2 = n_2;
|
|
1786
|
+
}
|
|
1787
|
+
|
|
1788
|
+
Fst = 1.0 - (((n_1_choose_2 * sum1) + (n_2_choose_2 * sum2)) / (n_1_choose_2 + n_2_choose_2) / sumT);
|
|
1789
|
+
|
|
1790
|
+
printLOG("Found " + int2str(N_intersecting_sites) + " intersecting sites\n");
|
|
1791
|
+
printLOG("Fst = " + dbl2str(Fst, 6) + "\n");
|
|
1792
|
+
|
|
1793
|
+
out.close();
|
|
1794
|
+
}
|
|
1795
|
+
|
|
1796
|
+
|
|
1797
|
+
void vcf_file::output_fst_version_2(const string &output_file_prefix, const vector<string> &indv_files)
|
|
1798
|
+
{
|
|
1799
|
+
// Calculate Fst using individuals in one (rather than two VCF files)
|
|
1800
|
+
// Calculate, and output, Fst using the formula outlined in HapMap I
|
|
1801
|
+
// Namely:
|
|
1802
|
+
// Fst = 1 - (Pi_within / Pi_combined)
|
|
1803
|
+
// where
|
|
1804
|
+
// Pi_within = sum_j(nchoosek(n_j,2) * sum_i(2*n_ij * x_ij * (1-x_ij) / (n_ij -1))) / sum_j(nchoosek(n_j,2))
|
|
1805
|
+
// and
|
|
1806
|
+
// Pi_between = sum_i(2*n_i*x_i*(1-x_i) / (n_i - 1))
|
|
1807
|
+
// where j is the population index, and i is the SNP index
|
|
1808
|
+
|
|
1809
|
+
if (indv_files.size() == 1)
|
|
1810
|
+
{
|
|
1811
|
+
printLOG("Require at least two populations to estimate Fst. Skipping\n");
|
|
1812
|
+
return;
|
|
1813
|
+
}
|
|
1814
|
+
|
|
1815
|
+
printLOG("Outputting Fst estimates.\n");
|
|
1816
|
+
|
|
1817
|
+
// First, read in the relevant files.
|
|
1818
|
+
vector< vector<bool> > indvs_in_pops;
|
|
1819
|
+
unsigned int N_pops = indv_files.size();
|
|
1820
|
+
indvs_in_pops.resize(N_pops, vector<bool>(N_indv, false));
|
|
1821
|
+
vector<bool> all_indv(N_indv,false);
|
|
1822
|
+
map<string, int> indv_to_idx;
|
|
1823
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
1824
|
+
if (include_indv[ui] == true)
|
|
1825
|
+
indv_to_idx[indv[ui]] = ui;
|
|
1826
|
+
for (unsigned int ui=0; ui<N_pops; ui++)
|
|
1827
|
+
{
|
|
1828
|
+
ifstream indv_file(indv_files[ui].c_str());
|
|
1829
|
+
if (!indv_file.is_open())
|
|
1830
|
+
error("Could not open Individual file: " + indv_files[ui]);
|
|
1831
|
+
string line;
|
|
1832
|
+
string tmp_indv;
|
|
1833
|
+
stringstream ss;
|
|
1834
|
+
while (!indv_file.eof())
|
|
1835
|
+
{
|
|
1836
|
+
getline(indv_file, line);
|
|
1837
|
+
ss.str(line);
|
|
1838
|
+
ss >> tmp_indv;
|
|
1839
|
+
if (indv_to_idx.find(tmp_indv) != indv_to_idx.end())
|
|
1840
|
+
{
|
|
1841
|
+
indvs_in_pops[ui][indv_to_idx[tmp_indv]]=true;
|
|
1842
|
+
all_indv[indv_to_idx[tmp_indv]]=true;
|
|
1843
|
+
}
|
|
1844
|
+
ss.clear();
|
|
1845
|
+
}
|
|
1846
|
+
indv_file.close();
|
|
1847
|
+
}
|
|
1848
|
+
|
|
1849
|
+
string output = output_file_prefix + ".fst";
|
|
1850
|
+
ofstream out(output.c_str());
|
|
1851
|
+
if (!out.is_open())
|
|
1852
|
+
error("Could not open Fst Output File: " + output, 7);
|
|
1853
|
+
|
|
1854
|
+
out << "CHROM\tPOS\tFST" << endl;
|
|
1855
|
+
|
|
1856
|
+
vcf_entry e(N_indv);
|
|
1857
|
+
string vcf_line;
|
|
1858
|
+
vector<int> allele_counts1;
|
|
1859
|
+
double Fst_tot_num=0.0, Fst_tot_denom=0.0;
|
|
1860
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
1861
|
+
{
|
|
1862
|
+
if (include_entry[s] == false)
|
|
1863
|
+
continue;
|
|
1864
|
+
|
|
1865
|
+
get_vcf_entry(s, vcf_line);
|
|
1866
|
+
e.reset(vcf_line);
|
|
1867
|
+
e.parse_basic_entry(true);
|
|
1868
|
+
|
|
1869
|
+
if (e.get_N_alleles() != 2)
|
|
1870
|
+
{
|
|
1871
|
+
one_off_warning("\tFst: Only using biallelic sites.");
|
|
1872
|
+
continue;
|
|
1873
|
+
}
|
|
1874
|
+
|
|
1875
|
+
e.parse_full_entry(true);
|
|
1876
|
+
e.parse_genotype_entries(true);
|
|
1877
|
+
|
|
1878
|
+
unsigned int N_chr;
|
|
1879
|
+
e.get_allele_counts(allele_counts1, N_chr, all_indv, include_genotype[s]);
|
|
1880
|
+
double count_all = allele_counts1[1];
|
|
1881
|
+
double N_chr_all = N_chr;
|
|
1882
|
+
|
|
1883
|
+
if ((count_all == 0) || (count_all == N_chr_all))
|
|
1884
|
+
continue; // No polymorphism
|
|
1885
|
+
|
|
1886
|
+
vector<double> counts(N_pops, 0);
|
|
1887
|
+
vector<double> pop_N_chr(N_pops, 0);
|
|
1888
|
+
vector<double> pop_N_choose_2(N_pops, 0);
|
|
1889
|
+
for (unsigned int p=0; p<N_pops; p++)
|
|
1890
|
+
{
|
|
1891
|
+
e.get_allele_counts(allele_counts1, N_chr, indvs_in_pops[p], include_genotype[s]);
|
|
1892
|
+
counts[p] = allele_counts1[1];
|
|
1893
|
+
pop_N_chr[p] = N_chr;
|
|
1894
|
+
pop_N_choose_2[p] = N_chr * (N_chr-1.0) / 2.0;
|
|
1895
|
+
}
|
|
1896
|
+
|
|
1897
|
+
double Fst_SNP = 0;
|
|
1898
|
+
double f;
|
|
1899
|
+
double sum1=0.0;
|
|
1900
|
+
for (unsigned int p=0; p<N_pops; p++)
|
|
1901
|
+
{
|
|
1902
|
+
f = counts[p] / pop_N_chr[p];
|
|
1903
|
+
Fst_SNP += 2.0*pop_N_choose_2[p]*(pop_N_chr[p]/(pop_N_chr[p]-1.0))*f*(1.0-f);
|
|
1904
|
+
sum1 += pop_N_choose_2[p];
|
|
1905
|
+
}
|
|
1906
|
+
Fst_SNP /= sum1;
|
|
1907
|
+
Fst_tot_num += Fst_SNP;
|
|
1908
|
+
f = count_all / N_chr_all;
|
|
1909
|
+
double tmp = (2.0*(N_chr_all / (N_chr_all-1.0))*f*(1.0-f));
|
|
1910
|
+
Fst_SNP /= tmp;
|
|
1911
|
+
Fst_tot_denom += tmp;
|
|
1912
|
+
Fst_SNP = 1.0 - Fst_SNP;
|
|
1913
|
+
out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << Fst_SNP << endl;
|
|
1914
|
+
|
|
1915
|
+
// TODO add other methods of calculating Fst (such as Weir-Cockerham)
|
|
1916
|
+
}
|
|
1917
|
+
double Fst_tot = 1.0 - (Fst_tot_num / Fst_tot_denom);
|
|
1918
|
+
printLOG("Fst = " + dbl2str(Fst_tot, 6) + "\n");
|
|
1919
|
+
|
|
1920
|
+
out.close();
|
|
1921
|
+
}
|
|
1922
|
+
|
|
1923
|
+
void vcf_file::output_per_site_nucleotide_diversity(const string &output_file_prefix)
|
|
1924
|
+
{
|
|
1925
|
+
// Output nucleotide diversity, calculated on a per-site basis.
|
|
1926
|
+
// Pi = average number of pairwise differences
|
|
1927
|
+
// Assumes a constant distance of 1 between all possible mutations
|
|
1928
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
1929
|
+
error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics.");
|
|
1930
|
+
|
|
1931
|
+
printLOG("Outputting Per-Site Nucleotide Diversity Statistics...\n");
|
|
1932
|
+
string output_file = output_file_prefix + ".sites.pi";
|
|
1933
|
+
|
|
1934
|
+
ofstream out(output_file.c_str());
|
|
1935
|
+
if (!out.is_open()) error("Could not open output file: " + output_file, 12);
|
|
1936
|
+
out << "CHROM\tPOS\tPI" << endl;
|
|
1937
|
+
|
|
1938
|
+
string vcf_line, FORMAT_out;
|
|
1939
|
+
vcf_entry e(N_indv);
|
|
1940
|
+
pair<int, int> genotype1, genotype2;
|
|
1941
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
1942
|
+
{
|
|
1943
|
+
if (include_entry[s] == false)
|
|
1944
|
+
continue;
|
|
1945
|
+
|
|
1946
|
+
get_vcf_entry(s, vcf_line);
|
|
1947
|
+
e.reset(vcf_line);
|
|
1948
|
+
e.parse_basic_entry(true);
|
|
1949
|
+
|
|
1950
|
+
if (e.get_N_alleles() != 2)
|
|
1951
|
+
{
|
|
1952
|
+
one_off_warning("\tsitePi: Only using biallelic sites.");
|
|
1953
|
+
continue;
|
|
1954
|
+
}
|
|
1955
|
+
|
|
1956
|
+
e.parse_full_entry(true);
|
|
1957
|
+
e.parse_genotype_entries(true);
|
|
1958
|
+
|
|
1959
|
+
if (e.is_diploid(include_indv, include_genotype[s]) == false)
|
|
1960
|
+
{
|
|
1961
|
+
one_off_warning("\tsitePi: Only using fully diploid sites.");
|
|
1962
|
+
continue;
|
|
1963
|
+
}
|
|
1964
|
+
|
|
1965
|
+
int total_alleles_count = 0;
|
|
1966
|
+
int first_allele_count = 0;
|
|
1967
|
+
int first_allele = -1;
|
|
1968
|
+
for (unsigned int ui=0; ui < N_indv; ui++)
|
|
1969
|
+
{
|
|
1970
|
+
if (include_indv[ui] == false)
|
|
1971
|
+
continue;
|
|
1972
|
+
if (include_genotype[s][ui] == false)
|
|
1973
|
+
continue;
|
|
1974
|
+
e.get_indv_GENOTYPE_ids(ui, genotype1);
|
|
1975
|
+
if ((genotype1.first != -1) && (genotype1.second != -1))
|
|
1976
|
+
{
|
|
1977
|
+
total_alleles_count += 2;
|
|
1978
|
+
if (first_allele == -1)
|
|
1979
|
+
first_allele = genotype1.first; //initialize to the first allele found
|
|
1980
|
+
if (genotype1.first == first_allele)
|
|
1981
|
+
first_allele_count++;
|
|
1982
|
+
if (genotype1.second == first_allele)
|
|
1983
|
+
first_allele_count++;
|
|
1984
|
+
}
|
|
1985
|
+
}
|
|
1986
|
+
int n = total_alleles_count;
|
|
1987
|
+
int k = first_allele_count;
|
|
1988
|
+
double pi= (2.0*k*(n-k))/(n*(n-1));
|
|
1989
|
+
|
|
1990
|
+
out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << pi << endl;
|
|
1991
|
+
}
|
|
1992
|
+
}
|
|
1993
|
+
|
|
1994
|
+
// Output Tajima's D
|
|
1995
|
+
// Carlson et al. Genome Res (2005)
|
|
1996
|
+
void vcf_file::output_Tajima_D(const string &output_file_prefix, int window_size)
|
|
1997
|
+
{
|
|
1998
|
+
if (window_size <= 0)
|
|
1999
|
+
return;
|
|
2000
|
+
|
|
2001
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
2002
|
+
error("Require Genotypes in VCF file in order to output Tajima's D Statistic.");
|
|
2003
|
+
|
|
2004
|
+
printLOG("Outputting Tajima's D Statistic...\n");
|
|
2005
|
+
string output_file = output_file_prefix + ".Tajima.D";
|
|
2006
|
+
|
|
2007
|
+
double a1=0.0, a2=0.0, b1, b2, c1, c2, e1, e2;
|
|
2008
|
+
unsigned int n = N_kept_individuals()*2;
|
|
2009
|
+
if (n < 2)
|
|
2010
|
+
error("Require at least two chromosomes!");
|
|
2011
|
+
|
|
2012
|
+
for (unsigned int ui=1; ui<n; ui++)
|
|
2013
|
+
{
|
|
2014
|
+
a1 += 1.0 / double(ui);
|
|
2015
|
+
a2 += 1.0 / double(ui * ui);
|
|
2016
|
+
}
|
|
2017
|
+
b1 = double(n+1) / 3.0 / double(n-1);
|
|
2018
|
+
b2 = 2.0 * double(n*n + n + 3) / 9.0 / double(n) / double(n-1);
|
|
2019
|
+
c1 = b1 - (1.0 / a1);
|
|
2020
|
+
c2 = b2 - (double(n+2)/double(a1*n)) + (a2/a1/a1);
|
|
2021
|
+
e1 = c1 / a1;
|
|
2022
|
+
e2 = c2 / ((a1*a1) + a2);
|
|
2023
|
+
|
|
2024
|
+
// Find maximum position
|
|
2025
|
+
map<string, int> max_pos;
|
|
2026
|
+
string vcf_line, CHROM;
|
|
2027
|
+
vcf_entry e(N_indv);
|
|
2028
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
2029
|
+
{
|
|
2030
|
+
if (include_entry[s] == true)
|
|
2031
|
+
{
|
|
2032
|
+
get_vcf_entry(s, vcf_line);
|
|
2033
|
+
e.reset(vcf_line);
|
|
2034
|
+
e.parse_basic_entry();
|
|
2035
|
+
|
|
2036
|
+
CHROM = e.get_CHROM();
|
|
2037
|
+
|
|
2038
|
+
if (max_pos.find(CHROM) != max_pos.end())
|
|
2039
|
+
{
|
|
2040
|
+
if (e.get_POS() > max_pos[CHROM])
|
|
2041
|
+
max_pos[CHROM] = e.get_POS();
|
|
2042
|
+
}
|
|
2043
|
+
else
|
|
2044
|
+
max_pos[CHROM] = e.get_POS();
|
|
2045
|
+
}
|
|
2046
|
+
}
|
|
2047
|
+
|
|
2048
|
+
map<string, int>::iterator it;
|
|
2049
|
+
unsigned int N_bins;
|
|
2050
|
+
map<string, vector< pair<int, double> > > bins;
|
|
2051
|
+
for (it=max_pos.begin(); it != max_pos.end(); ++it)
|
|
2052
|
+
{
|
|
2053
|
+
CHROM = (*it).first;
|
|
2054
|
+
N_bins = (unsigned int)((max_pos[CHROM] + window_size) / double(window_size));
|
|
2055
|
+
bins[CHROM].resize(N_bins, make_pair(0,0));
|
|
2056
|
+
}
|
|
2057
|
+
|
|
2058
|
+
unsigned int idx;
|
|
2059
|
+
double C = 1.0 / double(window_size);
|
|
2060
|
+
vector<int> allele_counts;
|
|
2061
|
+
unsigned int N_non_missing_chr;
|
|
2062
|
+
unsigned int N_alleles;
|
|
2063
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
2064
|
+
{
|
|
2065
|
+
if (include_entry[s] == false)
|
|
2066
|
+
continue;
|
|
2067
|
+
|
|
2068
|
+
get_vcf_entry(s, vcf_line);
|
|
2069
|
+
e.reset(vcf_line);
|
|
2070
|
+
e.parse_basic_entry(true);
|
|
2071
|
+
N_alleles = e.get_N_alleles();
|
|
2072
|
+
|
|
2073
|
+
if (N_alleles != 2)
|
|
2074
|
+
{
|
|
2075
|
+
one_off_warning("\tTajimaD: Only using bialleleic sites.");
|
|
2076
|
+
continue;
|
|
2077
|
+
}
|
|
2078
|
+
|
|
2079
|
+
CHROM = e.get_CHROM();
|
|
2080
|
+
idx = (unsigned int)(e.get_POS() * C);
|
|
2081
|
+
e.parse_genotype_entries(true);
|
|
2082
|
+
|
|
2083
|
+
if (e.is_diploid(include_indv, include_genotype[s]) == false)
|
|
2084
|
+
{
|
|
2085
|
+
one_off_warning("\tTajimaD: Only using fully diploid sites.");
|
|
2086
|
+
continue;
|
|
2087
|
+
}
|
|
2088
|
+
|
|
2089
|
+
e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
|
|
2090
|
+
|
|
2091
|
+
double p = double(allele_counts[0]) / N_non_missing_chr;
|
|
2092
|
+
if ((p > 0.0) && (p < 1.0))
|
|
2093
|
+
{
|
|
2094
|
+
bins[CHROM][idx].first++;
|
|
2095
|
+
bins[CHROM][idx].second += p * (1.0-p);
|
|
2096
|
+
}
|
|
2097
|
+
}
|
|
2098
|
+
|
|
2099
|
+
ofstream out(output_file.c_str());
|
|
2100
|
+
if (!out.is_open()) error("Could not open output file: " + output_file, 12);
|
|
2101
|
+
out << "CHROM\tBIN_START\tN_SNPS\tTajimaD" << endl;
|
|
2102
|
+
|
|
2103
|
+
for (it=max_pos.begin(); it != max_pos.end(); ++it)
|
|
2104
|
+
{
|
|
2105
|
+
CHROM = (*it).first;
|
|
2106
|
+
bool output = false;
|
|
2107
|
+
for (unsigned int s=0; s<bins[CHROM].size(); s++)
|
|
2108
|
+
{
|
|
2109
|
+
int S = bins[CHROM][s].first;
|
|
2110
|
+
double D = 0;
|
|
2111
|
+
if (S > 1)
|
|
2112
|
+
{
|
|
2113
|
+
double pi = 2.0*bins[CHROM][s].second*n/double(n-1);
|
|
2114
|
+
double tw = double(S) / a1;
|
|
2115
|
+
double var = (e1*S) + e2*S*(S-1);
|
|
2116
|
+
D = (pi - tw) / sqrt(var);
|
|
2117
|
+
output = true;
|
|
2118
|
+
}
|
|
2119
|
+
if (S > 0)
|
|
2120
|
+
output = true;
|
|
2121
|
+
if (output == true)
|
|
2122
|
+
out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << D << endl;
|
|
2123
|
+
}
|
|
2124
|
+
}
|
|
2125
|
+
|
|
2126
|
+
out.close();
|
|
2127
|
+
}
|
|
2128
|
+
|
|
2129
|
+
void vcf_file::output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size)
|
|
2130
|
+
{
|
|
2131
|
+
// Output nucleotide diversity, as calculated in windows.
|
|
2132
|
+
// Average number of pairwise differences in windows.
|
|
2133
|
+
if (window_size <= 0)
|
|
2134
|
+
return;
|
|
2135
|
+
|
|
2136
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
2137
|
+
error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics.");
|
|
2138
|
+
|
|
2139
|
+
printLOG("Outputting Windowed Nucleotide Diversity Statistics...\n");
|
|
2140
|
+
string output_file = output_file_prefix + ".windowed.pi";
|
|
2141
|
+
|
|
2142
|
+
// Find maximum position
|
|
2143
|
+
map<string, int> max_pos;
|
|
2144
|
+
map<string, int>::iterator it;
|
|
2145
|
+
string vcf_line, CHROM;
|
|
2146
|
+
vcf_entry e(N_indv);
|
|
2147
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
2148
|
+
{
|
|
2149
|
+
if (include_entry[s] == true)
|
|
2150
|
+
{
|
|
2151
|
+
get_vcf_entry(s, vcf_line);
|
|
2152
|
+
e.reset(vcf_line);
|
|
2153
|
+
e.parse_basic_entry();
|
|
2154
|
+
|
|
2155
|
+
CHROM = e.get_CHROM();
|
|
2156
|
+
|
|
2157
|
+
if (max_pos.find(CHROM) != max_pos.end())
|
|
2158
|
+
{
|
|
2159
|
+
if (e.get_POS() > max_pos[CHROM])
|
|
2160
|
+
max_pos[CHROM] = e.get_POS();
|
|
2161
|
+
}
|
|
2162
|
+
else
|
|
2163
|
+
max_pos[CHROM] = e.get_POS();
|
|
2164
|
+
}
|
|
2165
|
+
}
|
|
2166
|
+
|
|
2167
|
+
unsigned int N_bins;
|
|
2168
|
+
map<string, vector<pair<int, double> > > bins;
|
|
2169
|
+
for (it=max_pos.begin(); it != max_pos.end(); ++it)
|
|
2170
|
+
{
|
|
2171
|
+
CHROM = (*it).first;
|
|
2172
|
+
N_bins = (unsigned int)((max_pos[CHROM] + window_size) / double(window_size));
|
|
2173
|
+
bins[CHROM].resize(N_bins, make_pair(0,0));
|
|
2174
|
+
}
|
|
2175
|
+
|
|
2176
|
+
unsigned int idx;
|
|
2177
|
+
double C = 1.0 / double(window_size);
|
|
2178
|
+
vector<int> allele_counts;
|
|
2179
|
+
unsigned int N_non_missing_chr;
|
|
2180
|
+
unsigned int N_alleles;
|
|
2181
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
2182
|
+
{
|
|
2183
|
+
if (include_entry[s] == false)
|
|
2184
|
+
continue;
|
|
2185
|
+
|
|
2186
|
+
get_vcf_entry(s, vcf_line);
|
|
2187
|
+
e.reset(vcf_line);
|
|
2188
|
+
e.parse_basic_entry(true);
|
|
2189
|
+
N_alleles = e.get_N_alleles();
|
|
2190
|
+
|
|
2191
|
+
if (N_alleles != 2)
|
|
2192
|
+
{
|
|
2193
|
+
one_off_warning("\twindowPi: Only using bialleleic sites.");
|
|
2194
|
+
continue;
|
|
2195
|
+
}
|
|
2196
|
+
|
|
2197
|
+
CHROM = e.get_CHROM();
|
|
2198
|
+
idx = (unsigned int)(e.get_POS() * C);
|
|
2199
|
+
e.parse_genotype_entries(true);
|
|
2200
|
+
|
|
2201
|
+
if (e.is_diploid(include_indv, include_genotype[s]) == false)
|
|
2202
|
+
{
|
|
2203
|
+
one_off_warning("\twindowPi: Only using fully diploid sites.");
|
|
2204
|
+
continue;
|
|
2205
|
+
}
|
|
2206
|
+
|
|
2207
|
+
e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
|
|
2208
|
+
|
|
2209
|
+
double p = double(allele_counts[0]) / N_non_missing_chr;
|
|
2210
|
+
if ((p>0.0) && (p<1.0))
|
|
2211
|
+
{
|
|
2212
|
+
bins[CHROM][idx].first++;
|
|
2213
|
+
bins[CHROM][idx].second += (double(N_non_missing_chr) / (N_non_missing_chr - 1.0)) * 2.0 * p * (1.0 - p);
|
|
2214
|
+
}
|
|
2215
|
+
}
|
|
2216
|
+
|
|
2217
|
+
ofstream out(output_file.c_str());
|
|
2218
|
+
if (!out.is_open()) error("Could not open output file: " + output_file, 12);
|
|
2219
|
+
out << "CHROM\tBIN_START\tN_SNPS\tPI" << endl;
|
|
2220
|
+
|
|
2221
|
+
for (it=max_pos.begin(); it != max_pos.end(); ++it)
|
|
2222
|
+
{
|
|
2223
|
+
CHROM = (*it).first;
|
|
2224
|
+
bool output = false;
|
|
2225
|
+
for (unsigned int s=0; s<bins[CHROM].size(); s++)
|
|
2226
|
+
{
|
|
2227
|
+
if (bins[CHROM][s].first > 0)
|
|
2228
|
+
output = true;
|
|
2229
|
+
if (output == true)
|
|
2230
|
+
out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << bins[CHROM][s].second << endl;
|
|
2231
|
+
}
|
|
2232
|
+
}
|
|
2233
|
+
|
|
2234
|
+
out.close();
|
|
2235
|
+
}
|
|
2236
|
+
|
|
2237
|
+
/*
|
|
2238
|
+
void vcf_file::output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size)
|
|
2239
|
+
{
|
|
2240
|
+
// Output nucleotide diversity, as calculated in windows.
|
|
2241
|
+
// Average number of pairwise differences in windows.
|
|
2242
|
+
// Requires phased data.
|
|
2243
|
+
if (window_size <= 0)
|
|
2244
|
+
return;
|
|
2245
|
+
|
|
2246
|
+
if (has_genotypes == false)
|
|
2247
|
+
error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics.");
|
|
2248
|
+
|
|
2249
|
+
printLOG("Outputting Windowed Nucleotide Diversity Statistics...\n");
|
|
2250
|
+
string output_file = output_file_prefix + ".windowed.pi";
|
|
2251
|
+
|
|
2252
|
+
map<string, int>::iterator it;
|
|
2253
|
+
|
|
2254
|
+
// Find maximum position
|
|
2255
|
+
map<string, int> max_pos;
|
|
2256
|
+
string vcf_line, CHROM;
|
|
2257
|
+
vcf_entry e(N_indv);
|
|
2258
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
2259
|
+
{
|
|
2260
|
+
if (include_entry[s] == true)
|
|
2261
|
+
{
|
|
2262
|
+
get_vcf_entry(s, vcf_line);
|
|
2263
|
+
e.reset(vcf_line);
|
|
2264
|
+
e.parse_basic_entry();
|
|
2265
|
+
|
|
2266
|
+
CHROM = e.get_CHROM();
|
|
2267
|
+
|
|
2268
|
+
if (max_pos.find(CHROM) != max_pos.end())
|
|
2269
|
+
{
|
|
2270
|
+
if (e.get_POS() > max_pos[CHROM])
|
|
2271
|
+
max_pos[CHROM] = e.get_POS();
|
|
2272
|
+
}
|
|
2273
|
+
else
|
|
2274
|
+
max_pos[CHROM] = e.get_POS();
|
|
2275
|
+
}
|
|
2276
|
+
}
|
|
2277
|
+
|
|
2278
|
+
unsigned int N_bins;
|
|
2279
|
+
map<string, vector<pair<int, double> > > bins;
|
|
2280
|
+
for (it=max_pos.begin(); it != max_pos.end(); ++it)
|
|
2281
|
+
{
|
|
2282
|
+
CHROM = (*it).first;
|
|
2283
|
+
N_bins = (unsigned int)((max_pos[CHROM] + window_size) / double(window_size));
|
|
2284
|
+
bins[CHROM].resize(N_bins, make_pair(0,0));
|
|
2285
|
+
}
|
|
2286
|
+
|
|
2287
|
+
unsigned int last_idx = (unsigned)(-1);
|
|
2288
|
+
unsigned int idx;
|
|
2289
|
+
string last_CHROM;
|
|
2290
|
+
vector<vector<int> > haplotypes(2*N_indv);
|
|
2291
|
+
pair<int, int> genotype1;
|
|
2292
|
+
unsigned int N_SNPs=0;;
|
|
2293
|
+
double C = 1.0 / double(window_size);
|
|
2294
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
2295
|
+
{
|
|
2296
|
+
if (include_entry[s] == false)
|
|
2297
|
+
continue;
|
|
2298
|
+
|
|
2299
|
+
get_vcf_entry(s, vcf_line);
|
|
2300
|
+
e.reset(vcf_line);
|
|
2301
|
+
e.parse_basic_entry();
|
|
2302
|
+
|
|
2303
|
+
CHROM = e.get_CHROM();
|
|
2304
|
+
idx = (unsigned int)(e.get_POS() * C);
|
|
2305
|
+
|
|
2306
|
+
if (((last_idx != idx) || (CHROM != last_CHROM)) && (last_idx != (unsigned)-1))
|
|
2307
|
+
{ // Process haplotype window.
|
|
2308
|
+
double pi=0.0;
|
|
2309
|
+
double n=0.0;
|
|
2310
|
+
for (unsigned int ui=0; ui<(haplotypes.size()-1); ui++)
|
|
2311
|
+
{
|
|
2312
|
+
if (include_indv[ui/2] == false)
|
|
2313
|
+
continue;
|
|
2314
|
+
for (unsigned int uj=(ui+1); uj<haplotypes.size(); uj++)
|
|
2315
|
+
{
|
|
2316
|
+
if (include_indv[uj/2] == false)
|
|
2317
|
+
continue;
|
|
2318
|
+
for (unsigned int snp=0; snp<N_SNPs; snp++)
|
|
2319
|
+
{
|
|
2320
|
+
if ((haplotypes[ui][snp] != -1) && (haplotypes[uj][snp] != -1))
|
|
2321
|
+
{
|
|
2322
|
+
if (haplotypes[ui][snp] != haplotypes[uj][snp])
|
|
2323
|
+
{ pi++; }
|
|
2324
|
+
n++;
|
|
2325
|
+
}
|
|
2326
|
+
}
|
|
2327
|
+
}
|
|
2328
|
+
}
|
|
2329
|
+
pi /= n;
|
|
2330
|
+
bins[last_CHROM][last_idx].first = N_SNPs;
|
|
2331
|
+
bins[last_CHROM][last_idx].second = pi;
|
|
2332
|
+
|
|
2333
|
+
N_SNPs = 0;
|
|
2334
|
+
for (unsigned int ui=0; ui<haplotypes.size(); ui++)
|
|
2335
|
+
{
|
|
2336
|
+
haplotypes[ui].clear();
|
|
2337
|
+
}
|
|
2338
|
+
}
|
|
2339
|
+
|
|
2340
|
+
e.parse_genotype_entries(true);
|
|
2341
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
2342
|
+
{
|
|
2343
|
+
if (include_indv[ui] == false)
|
|
2344
|
+
continue;
|
|
2345
|
+
|
|
2346
|
+
if (include_genotype[s][ui] == true)
|
|
2347
|
+
{
|
|
2348
|
+
e.get_indv_GENOTYPE_ids(ui, genotype1);
|
|
2349
|
+
haplotypes[(2*ui)].push_back(genotype1.first);
|
|
2350
|
+
haplotypes[(2*ui)+1].push_back(genotype1.second);
|
|
2351
|
+
}
|
|
2352
|
+
else
|
|
2353
|
+
{
|
|
2354
|
+
haplotypes[(2*ui)].push_back(-1);
|
|
2355
|
+
haplotypes[(2*ui)+1].push_back(-1);
|
|
2356
|
+
}
|
|
2357
|
+
}
|
|
2358
|
+
N_SNPs++;
|
|
2359
|
+
last_CHROM = CHROM;
|
|
2360
|
+
last_idx = idx;
|
|
2361
|
+
}
|
|
2362
|
+
|
|
2363
|
+
if (N_SNPs > 0)
|
|
2364
|
+
{ // Output last window
|
|
2365
|
+
double pi=0.0;
|
|
2366
|
+
double n=0.0;
|
|
2367
|
+
for (unsigned int ui=0; ui<(haplotypes.size()-1); ui++)
|
|
2368
|
+
{
|
|
2369
|
+
if (include_indv[ui/2] == false)
|
|
2370
|
+
continue;
|
|
2371
|
+
for (unsigned int uj=ui+1; uj<haplotypes.size(); uj++)
|
|
2372
|
+
{
|
|
2373
|
+
if (include_indv[uj/2] == false)
|
|
2374
|
+
continue;
|
|
2375
|
+
for (unsigned int snp=0; snp<N_SNPs; snp++)
|
|
2376
|
+
{
|
|
2377
|
+
if ((haplotypes[ui][snp] != -1) && (haplotypes[uj][snp] != -1))
|
|
2378
|
+
{
|
|
2379
|
+
if (haplotypes[ui][snp] != haplotypes[uj][snp])
|
|
2380
|
+
pi++;
|
|
2381
|
+
n++;
|
|
2382
|
+
}
|
|
2383
|
+
}
|
|
2384
|
+
}
|
|
2385
|
+
}
|
|
2386
|
+
pi /= n;
|
|
2387
|
+
bins[last_CHROM][last_idx].first = N_SNPs;
|
|
2388
|
+
bins[last_CHROM][last_idx].second = pi;
|
|
2389
|
+
}
|
|
2390
|
+
|
|
2391
|
+
ofstream out(output_file.c_str());
|
|
2392
|
+
if (!out.is_open()) error("Could not open output file: " + output_file, 12);
|
|
2393
|
+
out << "CHROM\tBIN_START\tN_SNPS\tPI" << endl;
|
|
2394
|
+
|
|
2395
|
+
for (it=max_pos.begin(); it != max_pos.end(); ++it)
|
|
2396
|
+
{
|
|
2397
|
+
CHROM = (*it).first;
|
|
2398
|
+
for (unsigned int s=0; s<bins[CHROM].size(); s++)
|
|
2399
|
+
{
|
|
2400
|
+
out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << bins[CHROM][s].second << endl;
|
|
2401
|
+
}
|
|
2402
|
+
}
|
|
2403
|
+
|
|
2404
|
+
out.close();
|
|
2405
|
+
}
|
|
2406
|
+
*/
|
|
2407
|
+
|
|
2408
|
+
void vcf_file::output_kept_and_removed_sites(const string &output_file_prefix)
|
|
2409
|
+
{
|
|
2410
|
+
// Output lists of sites that have been filtered (or not).
|
|
2411
|
+
printLOG("Outputting Kept and Removed Sites...\n");
|
|
2412
|
+
string output_file1 = output_file_prefix + ".kept.sites";
|
|
2413
|
+
string output_file2 = output_file_prefix + ".removed.sites";
|
|
2414
|
+
|
|
2415
|
+
string vcf_line, CHROM;
|
|
2416
|
+
int POS;
|
|
2417
|
+
vcf_entry e(N_indv);
|
|
2418
|
+
|
|
2419
|
+
ofstream out1(output_file1.c_str());
|
|
2420
|
+
if (!out1.is_open()) error("Could not open output file: " + output_file1, 12);
|
|
2421
|
+
out1 << "CHROM\tPOS" << endl;
|
|
2422
|
+
|
|
2423
|
+
ofstream out2(output_file2.c_str());
|
|
2424
|
+
if (!out2.is_open()) error("Could not open output file: " + output_file2, 12);
|
|
2425
|
+
out2 << "CHROM\tPOS" << endl;
|
|
2426
|
+
|
|
2427
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
2428
|
+
{
|
|
2429
|
+
get_vcf_entry(s, vcf_line);
|
|
2430
|
+
e.reset(vcf_line);
|
|
2431
|
+
e.parse_basic_entry();
|
|
2432
|
+
POS = e.get_POS();
|
|
2433
|
+
CHROM = e.get_CHROM();
|
|
2434
|
+
if (include_entry[s] == true)
|
|
2435
|
+
{
|
|
2436
|
+
out1 << CHROM << "\t" << POS << endl;
|
|
2437
|
+
}
|
|
2438
|
+
else
|
|
2439
|
+
{
|
|
2440
|
+
out2 << CHROM << "\t" << POS << endl;
|
|
2441
|
+
}
|
|
2442
|
+
}
|
|
2443
|
+
out1.close();
|
|
2444
|
+
out2.close();
|
|
2445
|
+
}
|
|
2446
|
+
|
|
2447
|
+
|
|
2448
|
+
void vcf_file::output_LROH(const string &output_file_prefix)
|
|
2449
|
+
{
|
|
2450
|
+
// Detect and output Long Runs of Homozygosity, following the method
|
|
2451
|
+
// developed by Adam Boyko, and described in Auton et al., Genome Research, 2009
|
|
2452
|
+
// (Although using Forward-backwards algorithm in place of Viterbi).
|
|
2453
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
2454
|
+
error("Require Genotypes in VCF file in order to output LROH.");
|
|
2455
|
+
|
|
2456
|
+
printLOG("Outputting Long Runs of Homozygosity (Experimental)... \n");
|
|
2457
|
+
string output_file = output_file_prefix + ".LROH";
|
|
2458
|
+
|
|
2459
|
+
unsigned int nGen=4; // Number of generations since common ancestry
|
|
2460
|
+
double genotype_error_rate = 0.01; // Assumed genotype error rate
|
|
2461
|
+
double p_auto_prior = 0.05; // Prior probability of being in autozygous state
|
|
2462
|
+
double p_auto_threshold = 0.99; // Threshold for reporting autozygous region
|
|
2463
|
+
int min_SNPs=0; // Threshold for reporting autozygous region
|
|
2464
|
+
|
|
2465
|
+
string vcf_line, CHROM;
|
|
2466
|
+
int POS;
|
|
2467
|
+
vcf_entry e(N_indv);
|
|
2468
|
+
pair<int, int> alleles;
|
|
2469
|
+
vector<unsigned int> s_vector;
|
|
2470
|
+
vector<pair<double, double> > p_emission;
|
|
2471
|
+
vector<vector<double> > p_trans;
|
|
2472
|
+
|
|
2473
|
+
ofstream out(output_file.c_str());
|
|
2474
|
+
if (!out.is_open()) error("Could not open output file: " + output_file, 12);
|
|
2475
|
+
out << "CHROM\tAUTO_START\tAUTO_END\tN_SNPs\tINDV" << endl;
|
|
2476
|
+
|
|
2477
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
2478
|
+
{
|
|
2479
|
+
if (include_indv[ui] == false)
|
|
2480
|
+
continue;
|
|
2481
|
+
|
|
2482
|
+
printLOG("\t" + indv[ui] + "\n");
|
|
2483
|
+
|
|
2484
|
+
int last_POS = -1;
|
|
2485
|
+
s_vector.resize(0); p_emission.resize(0); p_trans.resize(0);
|
|
2486
|
+
|
|
2487
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
2488
|
+
{
|
|
2489
|
+
if ((include_entry[s] == false) || (include_genotype[s][ui] == false))
|
|
2490
|
+
continue;
|
|
2491
|
+
|
|
2492
|
+
get_vcf_entry(s, vcf_line);
|
|
2493
|
+
e.reset(vcf_line);
|
|
2494
|
+
e.parse_basic_entry(true);
|
|
2495
|
+
|
|
2496
|
+
if (e.get_N_alleles() != 2)
|
|
2497
|
+
{
|
|
2498
|
+
one_off_warning("\tLROH: Only using bialleleic sites.");
|
|
2499
|
+
continue; // TODO: Probably could do without this...
|
|
2500
|
+
}
|
|
2501
|
+
|
|
2502
|
+
POS = e.get_POS();
|
|
2503
|
+
|
|
2504
|
+
e.parse_genotype_entry(ui, true);
|
|
2505
|
+
e.get_indv_GENOTYPE_ids(ui, alleles);
|
|
2506
|
+
|
|
2507
|
+
if (e.get_indv_ploidy(ui) != 2)
|
|
2508
|
+
{
|
|
2509
|
+
one_off_warning("\tLROH: Only using diploid sites.");
|
|
2510
|
+
continue;
|
|
2511
|
+
}
|
|
2512
|
+
|
|
2513
|
+
if ((alleles.first == -1) || (alleles.second == -1))
|
|
2514
|
+
continue;
|
|
2515
|
+
|
|
2516
|
+
unsigned int X = alleles.first + alleles.second;
|
|
2517
|
+
|
|
2518
|
+
// Calculate heterozyogosity of this site.
|
|
2519
|
+
// TODO: Would be better to do this once, but for simplicity, do it for each individual.
|
|
2520
|
+
unsigned int N_genotypes = 0;
|
|
2521
|
+
unsigned int N_hets = 0;
|
|
2522
|
+
for (unsigned int uj=0; uj<N_indv; uj++)
|
|
2523
|
+
{
|
|
2524
|
+
if ((include_indv[uj] == false) || (include_genotype[s][ui] == false))
|
|
2525
|
+
continue;
|
|
2526
|
+
|
|
2527
|
+
e.parse_genotype_entry(uj, true);
|
|
2528
|
+
e.get_indv_GENOTYPE_ids(uj, alleles);
|
|
2529
|
+
if ((alleles.first != -1) && (alleles.second != -1))
|
|
2530
|
+
{
|
|
2531
|
+
N_genotypes++;
|
|
2532
|
+
if (alleles.first != alleles.second)
|
|
2533
|
+
N_hets++;
|
|
2534
|
+
}
|
|
2535
|
+
}
|
|
2536
|
+
double h = N_hets / double(N_genotypes);
|
|
2537
|
+
double p_emission_given_nonauto;
|
|
2538
|
+
double p_emission_given_auto;
|
|
2539
|
+
|
|
2540
|
+
if (X == 1)
|
|
2541
|
+
{ // Heterozygote
|
|
2542
|
+
p_emission_given_nonauto = h;
|
|
2543
|
+
p_emission_given_auto = genotype_error_rate;
|
|
2544
|
+
p_emission.push_back(make_pair(p_emission_given_auto, p_emission_given_nonauto));
|
|
2545
|
+
}
|
|
2546
|
+
else
|
|
2547
|
+
{ // Homozygote
|
|
2548
|
+
p_emission_given_nonauto = 1.0-h;
|
|
2549
|
+
p_emission_given_auto = 1.0-genotype_error_rate;
|
|
2550
|
+
p_emission.push_back(make_pair(p_emission_given_auto, p_emission_given_nonauto));
|
|
2551
|
+
}
|
|
2552
|
+
|
|
2553
|
+
double r = 0;
|
|
2554
|
+
if (last_POS > 0)
|
|
2555
|
+
{ // Assume 1cM/Mb.
|
|
2556
|
+
r = (POS - last_POS) / 1000000.0 / 100.0; // Morgans
|
|
2557
|
+
}
|
|
2558
|
+
|
|
2559
|
+
double e = (1.0 - exp(-2.0*nGen*r));
|
|
2560
|
+
double p_trans_auto_to_nonauto = (1.0 - p_auto_prior) * e; //A[1]
|
|
2561
|
+
double p_trans_nonauto_to_auto = p_auto_prior * e; //A[2]
|
|
2562
|
+
double p_trans_auto_to_auto = 1.0 - p_trans_nonauto_to_auto; //A[0]
|
|
2563
|
+
double p_trans_nonauto_to_nonauto = 1.0 - p_trans_auto_to_nonauto; // A[3]
|
|
2564
|
+
vector<double> A(4);
|
|
2565
|
+
A[0] = p_trans_auto_to_auto;
|
|
2566
|
+
A[1] = p_trans_auto_to_nonauto;
|
|
2567
|
+
A[2] = p_trans_nonauto_to_auto;
|
|
2568
|
+
A[3] = p_trans_nonauto_to_nonauto;
|
|
2569
|
+
|
|
2570
|
+
s_vector.push_back(s);
|
|
2571
|
+
|
|
2572
|
+
p_trans.push_back(A);
|
|
2573
|
+
last_POS = POS;
|
|
2574
|
+
}
|
|
2575
|
+
|
|
2576
|
+
// Forward-backward algorithm
|
|
2577
|
+
int N_obs = (int)p_emission.size();
|
|
2578
|
+
if (N_obs == 0)
|
|
2579
|
+
continue;
|
|
2580
|
+
|
|
2581
|
+
vector<vector<double> > alpha(N_obs, vector<double>(2,0));
|
|
2582
|
+
vector<vector<double> > beta(N_obs, vector<double>(2,0));
|
|
2583
|
+
|
|
2584
|
+
alpha[0][0] = p_emission[0].first;
|
|
2585
|
+
alpha[0][1] = p_emission[0].second;
|
|
2586
|
+
for (int i=1; i<N_obs; i++)
|
|
2587
|
+
{
|
|
2588
|
+
alpha[i][0] = alpha[i-1][0] * p_trans[i-1][0] * p_emission[i].first;
|
|
2589
|
+
alpha[i][0] += alpha[i-1][1] * p_trans[i-1][2] * p_emission[i].first;
|
|
2590
|
+
|
|
2591
|
+
alpha[i][1] = alpha[i-1][1] * p_trans[i-1][3] * p_emission[i].second;
|
|
2592
|
+
alpha[i][1] += alpha[i-1][0] * p_trans[i-1][1] * p_emission[i].second;
|
|
2593
|
+
|
|
2594
|
+
while (alpha[i][0] + alpha[i][1] < 1e-20)
|
|
2595
|
+
{ // Renormalise to prevent underflow
|
|
2596
|
+
alpha[i][0] *= 1e20;
|
|
2597
|
+
alpha[i][1] *= 1e20;
|
|
2598
|
+
}
|
|
2599
|
+
}
|
|
2600
|
+
|
|
2601
|
+
beta[N_obs-1][0] = 1.0;
|
|
2602
|
+
beta[N_obs-1][1] = 1.0;
|
|
2603
|
+
for (int i=N_obs-2; i>=0; i--)
|
|
2604
|
+
{
|
|
2605
|
+
beta[i][0] = beta[i+1][0] * p_trans[i][0] * p_emission[i].first;
|
|
2606
|
+
beta[i][0] += beta[i+1][1] * p_trans[i][2] * p_emission[i].first;
|
|
2607
|
+
|
|
2608
|
+
beta[i][1] = beta[i+1][1] * p_trans[i][3] * p_emission[i].second;
|
|
2609
|
+
beta[i][1] += beta[i+1][0] * p_trans[i][1] * p_emission[i].second;
|
|
2610
|
+
|
|
2611
|
+
while (beta[i][0] + beta[i][1] < 1e-20)
|
|
2612
|
+
{ // Renormalise to prevent underflow
|
|
2613
|
+
beta[i][0] *= 1e20;
|
|
2614
|
+
beta[i][1] *= 1e20;
|
|
2615
|
+
}
|
|
2616
|
+
}
|
|
2617
|
+
|
|
2618
|
+
// Calculate probability of each site being autozygous
|
|
2619
|
+
vector<double> p_auto(N_obs);
|
|
2620
|
+
for (int i=0; i<N_obs; i++)
|
|
2621
|
+
{
|
|
2622
|
+
p_auto[i] = alpha[i][0] * beta[i][0] / (alpha[i][0] * beta[i][0] + alpha[i][1] * beta[i][1]);
|
|
2623
|
+
}
|
|
2624
|
+
|
|
2625
|
+
// Generate output
|
|
2626
|
+
// TODO: Would be good to report actual limits of homozygosity
|
|
2627
|
+
// (i.e. extend regions out until first heterozygote),
|
|
2628
|
+
// as opposed to regions with p>threshold.
|
|
2629
|
+
// TODO: Also would be good to report heterozygotic SNPs found in homozygotic regions.
|
|
2630
|
+
bool in_auto=false;
|
|
2631
|
+
int start_pos=0, end_pos=0;
|
|
2632
|
+
int N_SNPs = 0;
|
|
2633
|
+
for (int i=0; i<N_obs; i++)
|
|
2634
|
+
{
|
|
2635
|
+
if (p_auto[i] > p_auto_threshold)
|
|
2636
|
+
{
|
|
2637
|
+
if (in_auto == false)
|
|
2638
|
+
{ // Start of autozygous region
|
|
2639
|
+
unsigned int s = s_vector[i];
|
|
2640
|
+
get_vcf_entry(s, vcf_line);
|
|
2641
|
+
e.reset(vcf_line);
|
|
2642
|
+
e.parse_basic_entry(true);
|
|
2643
|
+
CHROM = e.get_CHROM();
|
|
2644
|
+
start_pos = e.get_POS();
|
|
2645
|
+
}
|
|
2646
|
+
N_SNPs++;
|
|
2647
|
+
in_auto = true;
|
|
2648
|
+
}
|
|
2649
|
+
else
|
|
2650
|
+
{
|
|
2651
|
+
if (in_auto == true)
|
|
2652
|
+
{ // end of autozygous region
|
|
2653
|
+
unsigned int s = s_vector[i];
|
|
2654
|
+
get_vcf_entry(s, vcf_line);
|
|
2655
|
+
e.reset(vcf_line);
|
|
2656
|
+
e.parse_basic_entry(true);
|
|
2657
|
+
end_pos = e.get_POS();
|
|
2658
|
+
if (N_SNPs >= min_SNPs)
|
|
2659
|
+
out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << N_SNPs << "\t" << indv[ui] << endl;
|
|
2660
|
+
}
|
|
2661
|
+
in_auto = false;
|
|
2662
|
+
N_SNPs = 0;
|
|
2663
|
+
}
|
|
2664
|
+
}
|
|
2665
|
+
if (in_auto == true)
|
|
2666
|
+
{ // Report final region if needed
|
|
2667
|
+
unsigned int s = s_vector[N_obs-1];
|
|
2668
|
+
get_vcf_entry(s, vcf_line);
|
|
2669
|
+
e.reset(vcf_line);
|
|
2670
|
+
e.parse_basic_entry(true);
|
|
2671
|
+
end_pos = e.get_POS();
|
|
2672
|
+
if (N_SNPs >= min_SNPs)
|
|
2673
|
+
out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << N_SNPs << "\t" << indv[ui] << endl;
|
|
2674
|
+
}
|
|
2675
|
+
}
|
|
2676
|
+
out.close();
|
|
2677
|
+
}
|
|
2678
|
+
|
|
2679
|
+
void vcf_file::output_indv_relatedness(const string &output_file_prefix)
|
|
2680
|
+
{
|
|
2681
|
+
// Calculate and output a relatedness statistic based on the method of
|
|
2682
|
+
// Yang et al, 2010 (doi:10.1038/ng.608). Specifically, calculate the
|
|
2683
|
+
// unadjusted Ajk statistic (equation 6 of paper).
|
|
2684
|
+
// Expectation of Ajk is zero for individuals within a populations, and
|
|
2685
|
+
// one for an individual with themselves.
|
|
2686
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
2687
|
+
error("Require Genotypes in VCF file in order to output Individual Relatedness.");
|
|
2688
|
+
|
|
2689
|
+
printLOG("Outputting Individual Relatedness\n");
|
|
2690
|
+
string output = output_file_prefix + ".relatedness";
|
|
2691
|
+
ofstream out(output.c_str());
|
|
2692
|
+
if (!out.is_open())
|
|
2693
|
+
error("Could not open Individual Relatedness Output File: " + output, 2);
|
|
2694
|
+
out << "INDV1\tINDV2\tRELATEDNESS" << endl;
|
|
2695
|
+
|
|
2696
|
+
string vcf_line;
|
|
2697
|
+
vcf_entry e(N_indv);
|
|
2698
|
+
vector<int> allele_counts;
|
|
2699
|
+
unsigned int N_alleles, N_non_missing_chr;
|
|
2700
|
+
double freq;
|
|
2701
|
+
pair<int, int> geno_id;
|
|
2702
|
+
vector<vector<double> > Ajk(N_indv, vector<double>(N_indv, 0.0));
|
|
2703
|
+
vector<vector<double> > N_sites(N_indv, vector<double>(N_indv, 0.0));
|
|
2704
|
+
|
|
2705
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
2706
|
+
{
|
|
2707
|
+
if (include_entry[s] == false)
|
|
2708
|
+
continue;
|
|
2709
|
+
|
|
2710
|
+
get_vcf_entry(s, vcf_line);
|
|
2711
|
+
e.reset(vcf_line);
|
|
2712
|
+
|
|
2713
|
+
e.parse_basic_entry(true);
|
|
2714
|
+
N_alleles = e.get_N_alleles();
|
|
2715
|
+
|
|
2716
|
+
if (N_alleles != 2)
|
|
2717
|
+
{
|
|
2718
|
+
one_off_warning("\tRelatedness: Only using biallelic sites.");
|
|
2719
|
+
continue; // Only use biallelic loci
|
|
2720
|
+
}
|
|
2721
|
+
|
|
2722
|
+
e.parse_genotype_entries(true);
|
|
2723
|
+
if (e.is_diploid(include_indv, include_genotype[s]) == false)
|
|
2724
|
+
{
|
|
2725
|
+
one_off_warning("\tRelatedness: Only using fully diploid sites.");
|
|
2726
|
+
continue;
|
|
2727
|
+
}
|
|
2728
|
+
|
|
2729
|
+
e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
|
|
2730
|
+
freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency
|
|
2731
|
+
|
|
2732
|
+
if ((freq <= numeric_limits<double>::epsilon()) || (freq >= (1.0-numeric_limits<double>::epsilon())))
|
|
2733
|
+
continue;
|
|
2734
|
+
|
|
2735
|
+
vector<double> x(N_indv, -1.0);
|
|
2736
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
2737
|
+
{
|
|
2738
|
+
if (include_indv[ui] == false)
|
|
2739
|
+
continue;
|
|
2740
|
+
|
|
2741
|
+
e.get_indv_GENOTYPE_ids(ui, geno_id);
|
|
2742
|
+
x[ui] = geno_id.first + geno_id.second;
|
|
2743
|
+
}
|
|
2744
|
+
|
|
2745
|
+
double div = 1.0/(2.0*freq*(1.0-freq));
|
|
2746
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
2747
|
+
{
|
|
2748
|
+
if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (x[ui] < 0))
|
|
2749
|
+
continue;
|
|
2750
|
+
Ajk[ui][ui] += (x[ui]*x[ui] - (1 + 2.0*freq)*x[ui] + 2.0*freq*freq) * div;
|
|
2751
|
+
N_sites[ui][ui]++;
|
|
2752
|
+
for (unsigned int uj=(ui+1); uj<N_indv; uj++)
|
|
2753
|
+
{
|
|
2754
|
+
if ((include_indv[uj] == false) || (include_genotype[s][uj] == false) || (x[uj] < 0))
|
|
2755
|
+
continue;
|
|
2756
|
+
Ajk[ui][uj] += (x[ui] - 2.0*freq) * (x[uj] - 2.0*freq) * div;
|
|
2757
|
+
N_sites[ui][uj]++;
|
|
2758
|
+
}
|
|
2759
|
+
}
|
|
2760
|
+
}
|
|
2761
|
+
|
|
2762
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
2763
|
+
{
|
|
2764
|
+
if (include_indv[ui] == false)
|
|
2765
|
+
continue;
|
|
2766
|
+
Ajk[ui][ui] = 1.0 + (Ajk[ui][ui] / N_sites[ui][ui]);
|
|
2767
|
+
out << indv[ui] << "\t" << indv[ui] << "\t" << Ajk[ui][ui] << endl;
|
|
2768
|
+
for (unsigned int uj=(ui+1); uj<N_indv; uj++)
|
|
2769
|
+
{
|
|
2770
|
+
if (include_indv[uj] == false)
|
|
2771
|
+
continue;
|
|
2772
|
+
Ajk[ui][uj] /= N_sites[ui][uj];
|
|
2773
|
+
out << indv[ui] << "\t" << indv[uj] << "\t" << Ajk[ui][uj] << endl;
|
|
2774
|
+
}
|
|
2775
|
+
}
|
|
2776
|
+
|
|
2777
|
+
out.close();
|
|
2778
|
+
}
|
|
2779
|
+
|
|
2780
|
+
void vcf_file::output_PCA(const string &output_file_prefix, bool use_normalisation, int SNP_loadings_N_PCs)
|
|
2781
|
+
{
|
|
2782
|
+
#ifndef VCFTOOLS_PCA
|
|
2783
|
+
use_normalisation = true;
|
|
2784
|
+
SNP_loadings_N_PCs = -1;
|
|
2785
|
+
string out = output_file_prefix;
|
|
2786
|
+
out = "Cannot run PCA analysis. Vcftools has been compiled without PCA enabled (requires LAPACK).";
|
|
2787
|
+
error(out);
|
|
2788
|
+
#else
|
|
2789
|
+
// Output PCA, following method of Patterson, Price and Reich 2006.
|
|
2790
|
+
if ((has_genotypes == false) | (N_kept_individuals() == 0))
|
|
2791
|
+
error("Require Genotypes in VCF file in order to perform PCA.");
|
|
2792
|
+
|
|
2793
|
+
if (use_normalisation)
|
|
2794
|
+
printLOG("Outputting Principal Component Analysis (with normalisation)\n");
|
|
2795
|
+
else
|
|
2796
|
+
printLOG("Outputting Principal Component Analysis (without normalisation)\n");
|
|
2797
|
+
string output = output_file_prefix + ".pca";
|
|
2798
|
+
ofstream out(output.c_str());
|
|
2799
|
+
if (!out.is_open())
|
|
2800
|
+
error("Could not open Principal Component Analysis Output File: " + output, 2);
|
|
2801
|
+
|
|
2802
|
+
unsigned int N_indvs = N_kept_individuals();
|
|
2803
|
+
unsigned int N_sites = N_kept_sites();
|
|
2804
|
+
|
|
2805
|
+
if (N_indvs >= N_sites)
|
|
2806
|
+
error("PCA computation requires that there are more sites than individuals.");
|
|
2807
|
+
|
|
2808
|
+
string vcf_line;
|
|
2809
|
+
vcf_entry e(N_indv);
|
|
2810
|
+
pair<int, int> geno_id;
|
|
2811
|
+
double x, freq;
|
|
2812
|
+
vector<int> allele_counts;
|
|
2813
|
+
unsigned int N_alleles, N_non_missing_chr;
|
|
2814
|
+
|
|
2815
|
+
// Store list of included individuals
|
|
2816
|
+
vector<string> included_indvs(N_indvs);
|
|
2817
|
+
unsigned int ui_prime = 0;
|
|
2818
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
2819
|
+
{
|
|
2820
|
+
if (include_indv[ui] == false)
|
|
2821
|
+
continue;
|
|
2822
|
+
included_indvs[ui_prime] = indv[ui];
|
|
2823
|
+
ui_prime++;
|
|
2824
|
+
}
|
|
2825
|
+
|
|
2826
|
+
// Potentially uses a lot of memory. Should issue a warning about this.
|
|
2827
|
+
double **M = new double*[N_indvs]; // m rows = indv
|
|
2828
|
+
for (unsigned int ui=0; ui<N_indvs; ui++)
|
|
2829
|
+
M[ui] = new double[N_sites]; // n columns
|
|
2830
|
+
|
|
2831
|
+
// Populate M
|
|
2832
|
+
unsigned int s_prime = 0;
|
|
2833
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
2834
|
+
{
|
|
2835
|
+
if (include_entry[s]==false)
|
|
2836
|
+
continue;
|
|
2837
|
+
|
|
2838
|
+
get_vcf_entry(s, vcf_line);
|
|
2839
|
+
e.reset(vcf_line);
|
|
2840
|
+
|
|
2841
|
+
e.parse_basic_entry(true);
|
|
2842
|
+
N_alleles = e.get_N_alleles();
|
|
2843
|
+
if (N_alleles != 2)
|
|
2844
|
+
error("PCA only works for biallelic sites.");
|
|
2845
|
+
|
|
2846
|
+
e.parse_genotype_entries(true);
|
|
2847
|
+
if (e.is_diploid(include_indv, include_genotype[s]) == false)
|
|
2848
|
+
error("PCA only works for fully diploid sites.");
|
|
2849
|
+
|
|
2850
|
+
e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
|
|
2851
|
+
freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency
|
|
2852
|
+
|
|
2853
|
+
if ((freq <= numeric_limits<double>::epsilon()) || (freq >= (1.0-numeric_limits<double>::epsilon())))
|
|
2854
|
+
continue;
|
|
2855
|
+
|
|
2856
|
+
double mu = freq*2.0;
|
|
2857
|
+
double div = 1.0 / sqrt(freq * (1.0-freq));
|
|
2858
|
+
|
|
2859
|
+
ui_prime = 0;
|
|
2860
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
2861
|
+
{
|
|
2862
|
+
if (include_indv[ui] == false)
|
|
2863
|
+
continue;
|
|
2864
|
+
|
|
2865
|
+
e.get_indv_GENOTYPE_ids(ui, geno_id);
|
|
2866
|
+
x = geno_id.first + geno_id.second;
|
|
2867
|
+
if (x > -1)
|
|
2868
|
+
{
|
|
2869
|
+
if (use_normalisation == true)
|
|
2870
|
+
M[ui_prime][s_prime] = (x - mu) * div;
|
|
2871
|
+
else
|
|
2872
|
+
M[ui_prime][s_prime] = (x - mu);
|
|
2873
|
+
}
|
|
2874
|
+
ui_prime++;
|
|
2875
|
+
}
|
|
2876
|
+
s_prime++;
|
|
2877
|
+
}
|
|
2878
|
+
|
|
2879
|
+
// Now construct X = (1/n)MM'.
|
|
2880
|
+
double **X = new double *[N_indvs];
|
|
2881
|
+
for (unsigned int ui=0; ui<N_indvs; ui++)
|
|
2882
|
+
X[ui] = new double[N_indvs];
|
|
2883
|
+
|
|
2884
|
+
for (unsigned int ui=0; ui<N_indvs; ui++)
|
|
2885
|
+
for (unsigned int uj=0; uj<N_indvs; uj++)
|
|
2886
|
+
X[ui][uj] = 0;
|
|
2887
|
+
|
|
2888
|
+
// Only populate one half of matrix
|
|
2889
|
+
for (unsigned int ui=0; ui<N_indvs; ui++)
|
|
2890
|
+
for (unsigned int uj=ui; uj<N_indvs; uj++)
|
|
2891
|
+
for (unsigned int s=0; s<N_sites; s++)
|
|
2892
|
+
X[ui][uj] += M[ui][s] * M[uj][s];
|
|
2893
|
+
|
|
2894
|
+
delete [] M;
|
|
2895
|
+
|
|
2896
|
+
// Populate other half
|
|
2897
|
+
for (unsigned int ui=0; ui<N_indvs; ui++)
|
|
2898
|
+
for (unsigned int uj=0; uj<ui; uj++)
|
|
2899
|
+
X[ui][uj] = X[uj][ui];
|
|
2900
|
+
|
|
2901
|
+
for (unsigned int ui=0; ui<N_indvs; ui++)
|
|
2902
|
+
for (unsigned int uj=0; uj<N_indvs; uj++)
|
|
2903
|
+
X[ui][uj] /= N_sites;
|
|
2904
|
+
|
|
2905
|
+
double *Er = new double[N_indvs];
|
|
2906
|
+
double *Ei = new double[N_indvs];
|
|
2907
|
+
double **Evecs = new double*[N_indvs];
|
|
2908
|
+
for (unsigned int ui=0; ui<N_indvs; ui++)
|
|
2909
|
+
Evecs[ui] = new double[N_indvs];
|
|
2910
|
+
|
|
2911
|
+
// Call LAPACK routine to calculate eigenvectors and eigenvalues
|
|
2912
|
+
dgeev(X, N_indvs, Er, Ei, Evecs);
|
|
2913
|
+
|
|
2914
|
+
// Check there are no complex eigenvalues.
|
|
2915
|
+
for (unsigned int ui=0; ui<N_indvs; ui++)
|
|
2916
|
+
if (Ei[ui] != 0)
|
|
2917
|
+
error("Complex eigenvalue.");
|
|
2918
|
+
|
|
2919
|
+
// Output results
|
|
2920
|
+
out << "INDV";
|
|
2921
|
+
for (unsigned int ui=0; ui<N_indvs; ui++)
|
|
2922
|
+
out << "\tEIG_" << ui;
|
|
2923
|
+
out << endl;
|
|
2924
|
+
|
|
2925
|
+
out << "EIGENVALUE";
|
|
2926
|
+
for (unsigned int ui=0; ui<N_indvs; ui++)
|
|
2927
|
+
out << "\t" << Er[ui];
|
|
2928
|
+
out << endl;
|
|
2929
|
+
|
|
2930
|
+
// Output eigenvectors (as columns)
|
|
2931
|
+
for (unsigned int ui=0; ui<N_indvs; ui++)
|
|
2932
|
+
{
|
|
2933
|
+
out << included_indvs[ui];
|
|
2934
|
+
for (unsigned int uj=0; uj<N_indvs; uj++)
|
|
2935
|
+
out << "\t" << Evecs[ui][uj];
|
|
2936
|
+
out << endl;
|
|
2937
|
+
}
|
|
2938
|
+
|
|
2939
|
+
out.close();
|
|
2940
|
+
|
|
2941
|
+
if (SNP_loadings_N_PCs > 0)
|
|
2942
|
+
{ // Output SNP loadings
|
|
2943
|
+
printLOG("Outputting " + int2str(SNP_loadings_N_PCs) + " SNP loadings\n");
|
|
2944
|
+
output = output_file_prefix + ".pca.loadings";
|
|
2945
|
+
out.open(output.c_str());
|
|
2946
|
+
if (!out.good())
|
|
2947
|
+
error("Could not open Principal Component SNP Loading Output File: " + output, 2);
|
|
2948
|
+
out << "CHROM\tPOS";
|
|
2949
|
+
for (unsigned int ui=0; ui<(unsigned int)SNP_loadings_N_PCs; ui++)
|
|
2950
|
+
out << "\tGAMMA_" << ui;
|
|
2951
|
+
out << endl;
|
|
2952
|
+
|
|
2953
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
2954
|
+
{
|
|
2955
|
+
if (include_entry[s]==false)
|
|
2956
|
+
continue;
|
|
2957
|
+
|
|
2958
|
+
get_vcf_entry(s, vcf_line);
|
|
2959
|
+
e.reset(vcf_line);
|
|
2960
|
+
|
|
2961
|
+
e.parse_basic_entry(true);
|
|
2962
|
+
N_alleles = e.get_N_alleles();
|
|
2963
|
+
if (N_alleles != 2)
|
|
2964
|
+
error("PCA only works for biallelic sites.");
|
|
2965
|
+
|
|
2966
|
+
e.parse_genotype_entries(true);
|
|
2967
|
+
if (e.is_diploid(include_indv, include_genotype[s]) == false)
|
|
2968
|
+
error("PCA only works for fully diploid sites.");
|
|
2969
|
+
|
|
2970
|
+
e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
|
|
2971
|
+
freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency
|
|
2972
|
+
|
|
2973
|
+
if ((freq <= numeric_limits<double>::epsilon()) || (freq >= (1.0-numeric_limits<double>::epsilon())))
|
|
2974
|
+
continue;
|
|
2975
|
+
|
|
2976
|
+
vector<double> gamma(SNP_loadings_N_PCs, 0.0);
|
|
2977
|
+
vector<double> a_sum(SNP_loadings_N_PCs, 0.0);
|
|
2978
|
+
|
|
2979
|
+
ui_prime = 0;
|
|
2980
|
+
for (unsigned int ui=0; ui<N_indv; ui++)
|
|
2981
|
+
{
|
|
2982
|
+
if (include_indv[ui] == false)
|
|
2983
|
+
continue;
|
|
2984
|
+
|
|
2985
|
+
e.get_indv_GENOTYPE_ids(ui, geno_id);
|
|
2986
|
+
x = geno_id.first + geno_id.second;
|
|
2987
|
+
if (x > -1)
|
|
2988
|
+
{
|
|
2989
|
+
for (unsigned int uj=0; uj<(unsigned int)SNP_loadings_N_PCs; uj++)
|
|
2990
|
+
{
|
|
2991
|
+
gamma[uj] += (x * Evecs[ui_prime][uj]);
|
|
2992
|
+
a_sum[uj] += (Evecs[ui_prime][uj]*Evecs[ui_prime][uj]);
|
|
2993
|
+
}
|
|
2994
|
+
}
|
|
2995
|
+
ui_prime++;
|
|
2996
|
+
}
|
|
2997
|
+
|
|
2998
|
+
out << e.get_CHROM() << "\t" << e.get_POS();
|
|
2999
|
+
for (unsigned int uj=0; uj<(unsigned int)SNP_loadings_N_PCs; uj++)
|
|
3000
|
+
out << "\t" << gamma[uj] / a_sum[uj];
|
|
3001
|
+
out << endl;
|
|
3002
|
+
}
|
|
3003
|
+
out.close();
|
|
3004
|
+
}
|
|
3005
|
+
|
|
3006
|
+
delete [] Er;
|
|
3007
|
+
delete [] Ei;
|
|
3008
|
+
delete [] Evecs;
|
|
3009
|
+
delete [] X;
|
|
3010
|
+
#endif
|
|
3011
|
+
}
|
|
3012
|
+
|